In [12]:
import pandas as pd

In [13]:
# Probe ID list for this dataset; the merge below joins on its "ID_REF" column.
gene_data = pd.read_csv("GSE10950_geneID.csv")

# Platform annotation table: tab-delimited, with every column read as text.
# comment="#" makes pandas ignore everything from a '#' to the end of a line.
# NOTE(review): that also truncates any data row that contains a '#' -- confirm
# no annotation field does.
gpl6104_data = pd.read_csv(
    "GPL6104_11576.txt",
    sep="\t",
    dtype=str,
    comment="#",
)

# Print the annotation column names so the join keys can be checked by eye.
print(gpl6104_data.columns)

# Left join on the probe ID: every row of gene_data is kept, and a probe with
# no matching annotation "ID" gets NaN in the "ID" and "Symbol" columns.
merged_data = pd.merge(
    gene_data,
    gpl6104_data.loc[:, ["ID", "Symbol"]],
    how="left",
    left_on="ID_REF",
    right_on="ID",
)


Index(['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene',
       'Source_Reference_ID', 'RefSeq_ID', 'Entrez_Gene_ID', 'GI', 'Accession',
       'Symbol', 'Protein_Product', 'Array_Address_Id', 'Probe_Type',
       'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation',
       'Probe_Coordinates', 'Definition', 'Ontology_Component',
       'Ontology_Process', 'Ontology_Function', 'Synonyms', 'GB_ACC'],
      dtype='object')


In [14]:
# Preview the first five merged rows to check the probe-to-symbol join.
merged_data.head(n=5)

Unnamed: 0,ID_REF,ID,Symbol
0,ILMN_1343291,ILMN_1343291,EEF1A1
1,ILMN_1343292,ILMN_1343292,TUBB
2,ILMN_1343293,ILMN_1343293,TXN
3,ILMN_1343294,ILMN_1343294,ACTB
4,ILMN_1651209,ILMN_1651209,SLC35E2


In [15]:
# "ID" was only the annotation-side join key for the merge; remove it.
merged_data = merged_data.drop(labels="ID", axis="columns")
merged_data

Unnamed: 0,ID_REF,Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22179,ILMN_1815941,SMAD7
22180,ILMN_1815951,PCYOX1L
22181,ILMN_2038774,EEF1A1
22182,ILMN_2038777,ACTB


In [16]:
# Split each "Symbol" value on the ' /// ' separator and give every resulting
# gene symbol its own row in a new "Gene_Symbol" column ("Symbol" is kept as is).
# explode() repeats the source row's index label on every row it produces, so
# the index is rebuilt afterwards to keep row labels unique for later cells.
merged_data = (
    merged_data
    .assign(Gene_Symbol=merged_data["Symbol"].str.split(" /// "))
    .explode("Gene_Symbol")
    .reset_index(drop=True)
)

In [17]:
# In the original "Symbol" column, turn every ' /// ' separator into ','.
# regex=False matches the separator as plain text; it contains no regex
# metacharacters, so the result is the same either way.
merged_data = merged_data.assign(
    Symbol=merged_data["Symbol"].str.replace(" /// ", ",", regex=False)
)

In [18]:
# Give the exploded per-gene column its final name.
merged_data = merged_data.rename(
    {"Gene_Symbol": "Cleaned_Gene_Symbol"},
    axis="columns",
)

In [19]:
# Keep only the rows whose probe ID does not start with "AFFX-".
# na=False makes a missing ID_REF count as "does not start with AFFX-", so the
# mask is purely boolean; without it a missing ID puts a missing value into the
# mask and the negated filter raises instead of filtering.
# NOTE(review): the probe IDs displayed in this notebook start with "ILMN_", so
# this prefix may never match here -- confirm the filter is still wanted.
merged_data = merged_data[
    ~merged_data["ID_REF"].str.startswith("AFFX-", na=False)
]

In [20]:
# Preview the first five rows after the symbol clean-up steps.
merged_data.head(n=5)

Unnamed: 0,ID_REF,Symbol,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1,EEF1A1
1,ILMN_1343292,TUBB,TUBB
2,ILMN_1343293,TXN,TXN
3,ILMN_1343294,ACTB,ACTB
4,ILMN_1651209,SLC35E2,SLC35E2


In [21]:
# Discard every row that has a missing value in any column.
merged_data = merged_data.dropna(axis="index", how="any")

In [22]:
# Remove the raw "Symbol" column; the remaining columns are displayed below.
merged_data = merged_data.drop(labels="Symbol", axis="columns")
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22179,ILMN_1815941,SMAD7
22180,ILMN_1815951,PCYOX1L
22181,ILMN_2038774,EEF1A1
22182,ILMN_2038777,ACTB


In [23]:
# Write the mapped table to disk as CSV, leaving out the DataFrame index.
merged_data.to_csv(path_or_buf="mapped_gene_list.csv", index=False)

# Tell the reader where the output was written.
print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
