In [8]:
import pandas as pd

annot_path = "../data/GPL96.annot"

# Step 1: find where the table starts
with open(annot_path, "r", encoding="utf-8", errors="ignore") as f:
    lines = f.readlines()

start_idx = 0
for i, line in enumerate(lines):
    if line.startswith("ID"):  # the start of the table section
        start_idx = i
        break

# Step 2: now read from that line onward
df = pd.read_csv(annot_path, sep="\t", skiprows=start_idx)
print("Columns in file:")
print(df.columns.tolist())

# Display first few rows
df.head()


Columns in file:
['ID', 'Gene title', 'Gene symbol', 'Gene ID', 'UniGene title', 'UniGene symbol', 'UniGene ID', 'Nucleotide Title', 'GI', 'GenBank Accession', 'Platform_CLONEID', 'Platform_ORF', 'Platform_SPOTID', 'Chromosome location', 'Chromosome annotation', 'GO:Function', 'GO:Process', 'GO:Component', 'GO:Function ID', 'GO:Process ID', 'GO:Component ID']


Unnamed: 0,ID,Gene title,Gene symbol,Gene ID,UniGene title,UniGene symbol,UniGene ID,Nucleotide Title,GI,GenBank Accession,...,Platform_ORF,Platform_SPOTID,Chromosome location,Chromosome annotation,GO:Function,GO:Process,GO:Component,GO:Function ID,GO:Process ID,GO:Component ID
0,1007_s_at,microRNA 4640///discoidin domain receptor tyro...,MIR4640///DDR1,100616237///780,,,,"Human receptor tyrosine kinase DDR gene, compl...",1753221.0,U48705,...,,,6p21.3,"Chromosome 6, NC_000006.12 (30890883..30890972...",ATP binding///collagen binding///collagen bind...,branching involved in mammary gland duct morph...,basolateral plasma membrane///extracellular ex...,GO:0005524///GO:0005518///GO:0005518///GO:0046...,GO:0060444///GO:0007155///GO:0038063///GO:0038...,GO:0016323///GO:0070062///GO:0005615///GO:0005...
1,1053_at,replication factor C subunit 2,RFC2,5982,,,,"Human replication factor C, 40-kDa subunit (A1...",1590810.0,M87338,...,,,7q11.23,"Chromosome 7, NC_000007.14 (74231502..74254458...",ATP binding///contributes_to DNA clamp loader ...,"DNA damage response, detection of DNA damage//...",Ctf18 RFC-like complex///DNA replication facto...,GO:0005524///contributes_to GO:0003689///GO:00...,GO:0042769///GO:0006260///GO:0070987///GO:0042...,GO:0031390///GO:0005663///GO:0005654
2,117_at,heat shock protein family A (Hsp70) member 6,HSPA6,3310,,,,Human heat-shock protein HSP70B' gene,35221.0,X51757,...,,,1q23,"Chromosome 1, NC_000001.11 (161524540..161526897)","ATP binding///ATPase activity, coupled///enzym...",NOT cellular heat acclimation///cellular respo...,colocalizes_with COP9 signalosome///blood micr...,GO:0005524///GO:0042623///GO:0019899///GO:0031...,NOT GO:0070370///GO:0034605///GO:0034605///GO:...,colocalizes_with GO:0008180///GO:0072562///GO:...
3,121_at,paired box 8,PAX8,7849,,,,H.sapiens Pax8 mRNA,38425.0,X69699,...,,,2q13,"Chromosome 2, NC_000002.12 (113215997..1132789...",DNA binding///DNA binding///RNA polymerase II ...,anatomical structure morphogenesis///branching...,nucleoplasm///nucleoplasm///nucleus,GO:0003677///GO:0003677///GO:0000978///GO:0000...,GO:0009653///GO:0001658///GO:0071371///GO:0007...,GO:0005654///GO:0005654///GO:0005634
4,1255_g_at,guanylate cyclase activator 1A,GUCA1A,2978,,,,Homo sapiens guanylate cyclase activating prot...,623404.0,L36861,...,,,6p21.1,"Chromosome 6, NC_000006.12 (42155377..42180083)",calcium ion binding///calcium sensitive guanyl...,cellular response to calcium ion///phototransd...,photoreceptor disc membrane///photoreceptor in...,GO:0005509///GO:0008048///GO:0030249,GO:0071277///GO:0007602///GO:0031284///GO:0022...,GO:0097381///GO:0001917///GO:0005886


In [12]:
import os
import pandas as pd
from pathlib import Path
from io import StringIO

# -------------------------------
# Define dataset + platform files
# -------------------------------
datasets = {
    "breast": ("../data/GSE15852_series_matrix.txt", "../data/GPL96.annot"),
    "ovarian": ("../data/GSE18520_series_matrix.txt", "../data/GPL570.annot"),
    "lung": ("../data/GSE31210_series_matrix.txt", "../data/GPL570.annot")
}



# Ensure processed directory exists
processed_dir = Path("data/mapped")
processed_dir.mkdir(parents=True, exist_ok=True)


# ----------------------------------
# Helper function: Load IDâ†”Gene map
# ----------------------------------
def load_probe_to_gene_map(gpl_file):
    """Reads GPL file and returns mapping of probe_id â†’ gene_symbol."""
    with open(gpl_file, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # Find start of table
    start_idx = 0
    for i, line in enumerate(lines):
        if line.startswith("ID"):
            start_idx = i
            break

    # Read into dataframe
    df = pd.read_csv(gpl_file, sep="\t", skiprows=start_idx)

    # Normalize column names (handle case differences)
    df.columns = [c.strip().lower() for c in df.columns]

    if "gene symbol" in df.columns:
        id_col, gene_col = "id", "gene symbol"
    elif "gene_symbol" in df.columns:
        id_col, gene_col = "id", "gene_symbol"
    elif "gene symbol".replace(" ", "_") in df.columns:
        id_col, gene_col = "id", "gene_symbol"
    elif "gene symbol".lower() in df.columns:
        id_col, gene_col = "id", "gene symbol"
    elif "gene symbol".title() in df.columns:
        id_col, gene_col = "id", "gene symbol"
    elif "gene symbol" not in df.columns:
        raise ValueError("Gene symbol column not found in GPL file.")

    # Create mapping
    df = df[[id_col, gene_col]].dropna()
    mapping = dict(zip(df[id_col], df[gene_col]))
    print(f"âœ… Loaded {len(mapping)} probeâ†’gene mappings from {os.path.basename(gpl_file)}")
    return mapping



# ----------------------------------
# Helper function: Clean GEO dataset
# ----------------------------------
def process_dataset(gse_file, mapping, out_path):
    """Processes GEO matrix file and maps probe IDs to gene symbols."""
    print(f"Processing {gse_file} ...")

    # Read file & drop metadata lines (those starting with '!','^','#')
    with open(gse_file, "r", encoding="utf-8", errors="ignore") as f:
        lines = [line for line in f if not line.startswith(("!", "^", "#"))]
        
    df = pd.read_csv(StringIO("".join(lines)), sep="\t")
    df = df.rename(columns={df.columns[0]: "ID"})

    # Map probe IDs â†’ gene symbols
    df["Gene"] = df["ID"].map(mapping)

    # Drop rows without gene symbol
    df = df.dropna(subset=["Gene"])

    # Remove duplicate genes by averaging expression values
    df = df.groupby("Gene").mean(numeric_only=True)

    # Save to processed folder
    df.to_csv(out_path)
    print(f"âœ… Saved processed file: {out_path}")


# ----------------------------
# Main processing loop
# ----------------------------
for disease, (gse_path, gpl_path) in datasets.items():
    mapping = load_probe_to_gene_map(gpl_path)
    output_path = processed_dir / f"{disease}_mapped.csv"
    process_dataset(gse_path, mapping, output_path)

print("\nðŸŽ‰ All datasets processed and saved in data/mapped/")


âœ… Loaded 21156 probeâ†’gene mappings from GPL96.annot
Processing ../data/GSE15852_series_matrix.txt ...
âœ… Saved processed file: data\mapped\breast_mapped.csv


  df = pd.read_csv(gpl_file, sep="\t", skiprows=start_idx)


âœ… Loaded 45118 probeâ†’gene mappings from GPL570.annot
Processing ../data/GSE18520_series_matrix.txt ...
âœ… Saved processed file: data\mapped\ovarian_mapped.csv


  df = pd.read_csv(gpl_file, sep="\t", skiprows=start_idx)


âœ… Loaded 45118 probeâ†’gene mappings from GPL570.annot
Processing ../data/GSE31210_series_matrix.txt ...
âœ… Saved processed file: data\mapped\lung_mapped.csv

ðŸŽ‰ All datasets processed and saved in data/mapped/


In [11]:
import os
print(os.getcwd())


C:\Users\Admin\Desktop\comb_disease_predictor\scripts
