In [3]:
import time
from Bio import Entrez
import pandas as pd
from tqdm import tqdm

In [10]:
# Preview the first five parsed esummary records. `summary` is assigned inside the
# batch-fetch loop, so that cell has to be run before this one (the execution
# counts show this cell was executed after it).
summary[:5]

[{'Item': [], 'Id': '308204622', 'Accession': 'GSM8204622', 'GDS': '', 'title': 'SS2_Exp1_WT_Bystander_I20_B000997_GRP1P1NEG_S196', 'summary': 'Lung', 'GPL': '24247', 'GSE': '263891;263892', 'taxon': 'Mus musculus', 'entryType': 'GSM', 'gdsType': '', 'ptechType': '', 'valType': '', 'SSInfo': '', 'subsetInfo': '', 'PDAT': '2024/04/20', 'suppFile': '', 'Samples': [], 'Relations': [], 'ExtRelations': [{'RelationType': 'SRA', 'TargetObject': 'SRX24142450', 'TargetFTPLink': 'ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX241/SRX24142450/'}], 'n_samples': IntegerElement(0, attributes={}), 'SeriesTitle': '', 'PlatformTitle': '', 'PlatformTaxa': '', 'SamplesTaxa': '', 'PubMedIds': [], 'Projects': [], 'FTPLink': '', 'GEO2R': ''},
 {'Item': [], 'Id': '308204621', 'Accession': 'GSM8204621', 'GDS': '', 'title': 'SS2_Exp1_WT_Bystander_D1_B000996_GRP1P3NEG_S41', 'summary': 'Lung', 'GPL': '24247', 'GSE': '263891;263892', 'taxon': 'Mus musculus', 'entryType': 'GSM', 'gdsType':

In [9]:
# Contact address that Biopython attaches to the E-utilities requests.
# NOTE(review): a personal address is hardcoded here — consider loading it from
# an environment variable before sharing this notebook.
Entrez.email = "salimovdan1988@gmail.com"

# Load the UIDs to query: strip each line and keep only those made up entirely
# of digits (blank lines and any non-numeric text are dropped).
with open("UID_list.txt") as uid_file:
    stripped_lines = (raw.strip() for raw in uid_file)
    uid_list = [uid for uid in stripped_lines if uid.isdigit()]

def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` with at most ``n`` items each.

    Args:
        lst: A sliceable sequence (anything supporting ``len`` and slicing).
        n: Maximum number of items per slice.

    Yields:
        Slices of ``lst`` in order; the last one may be shorter than ``n``.
    """
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]

# Resolve every UID to its accession and the GSE field of its summary record.
# UIDs are sent to esummary in batches of 200.
exp_list = []
# Batches whose request or parsing failed, kept so they can be retried instead
# of being lost once the error message has scrolled by.
failed_batches = []

for batch in tqdm(list(chunks(uid_list, 200)), desc="Fetching GSEs", unit="batch"):
    try:
        handle = Entrez.esummary(db="gds", id=",".join(batch))
        try:
            # `summary` stays a notebook-level name: the preview cell reads it.
            summary = Entrez.read(handle)
        finally:
            # The handle is not closed by the parser; release the connection
            # even when parsing raises.
            handle.close()
        # Build the rows first so a malformed record cannot leave a partially
        # appended batch behind (which would be duplicated on retry).
        batch_rows = [
            {'GSM_id': doc["Accession"], 'GSE_id': doc['GSE']} for doc in summary
        ]
        exp_list.extend(batch_rows)
    except Exception as e:
        failed_batches.append(batch)
        print(f"[X] Ошибка в батче: {e}")
    # Short pause between consecutive requests to the NCBI servers.
    time.sleep(0.2)

Fetching GSEs:  16%|███▌                  | 200/1253 [05:00<26:24,  1.50s/batch]

KeyboardInterrupt



In [None]:
# === Step 4: GSE → GSM ===
# NOTE(review): `gse_list` is not defined in the cells visible here — confirm it
# is created earlier in the notebook and holds identifiers that elink accepts.
print("🔄 Поиск GSM по каждому GSE...")
results = []

for gse in tqdm(gse_list, desc="Processing GSEs", unit="GSE"):
    try:
        # Look up the UIDs of the GSM records linked to this GSE.
        link_handle = Entrez.elink(dbfrom="gds", db="gds", id=gse, linkname="gds_gds_gsm")
        try:
            link_result = Entrez.read(link_handle)
        finally:
            link_handle.close()
        linksets = link_result[0]["LinkSetDb"]
        # No `continue` here: an empty link set simply produces no batches, so
        # the pause at the bottom of the loop is never skipped.
        gsm_uids = [link["Id"] for link in linksets[0]["Link"]] if linksets else []

        # Convert the GSM UIDs to accessions.
        for gsm_batch in chunks(gsm_uids, 200):
            gsm_handle = Entrez.esummary(db="gds", id=",".join(gsm_batch))
            try:
                gsm_summary = Entrez.read(gsm_handle)
            finally:
                gsm_handle.close()
            # A default esummary response parses to a flat list of records (the
            # shape the UID fetch loop iterates over). Indexing that list with
            # "DocumentSummarySet" raised TypeError for every batch, which the
            # except below swallowed, leaving `results` empty. The nested layout
            # is still accepted in case the parsed response is a mapping.
            gsm_docs = gsm_summary
            if isinstance(gsm_summary, dict):
                gsm_docs = gsm_summary["DocumentSummarySet"]["DocumentSummary"]
            for gsm_doc in gsm_docs:
                results.append({"GSE": gse, "GSM": gsm_doc["Accession"]})
            time.sleep(0.4)
    except Exception as e:
        print(f"[X] Ошибка при обработке {gse}: {e}")
    # Pause between GSEs regardless of whether any links were found.
    time.sleep(0.4)

# === Step 5: Save to CSV ===
# Naming the columns explicitly keeps the GSE,GSM header in the file even when
# `results` is empty; without it the CSV has no columns and cannot be read back.
df = pd.DataFrame(results, columns=["GSE", "GSM"])
df.to_csv("gse_gsm_table.csv", index=False)
print("✅ Сохранено в gse_gsm_table.csv")