In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np

In [2]:
# gene expression data
gene_expression_path = "GSE10950_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
gene_expression.head()

Unnamed: 0,ID_REF,GSM277495,GSM277496,GSM277497,GSM277498,GSM277499,GSM277500,GSM277501,GSM277502,GSM277503,...,GSM277533,GSM277534,GSM277535,GSM277536,GSM277537,GSM277538,GSM277539,GSM277540,GSM277541,GSM277542
0,ILMN_1343291,56190.14,52176.93,50483.09,38690.04,54803.84,52006.33,45847.25,66288.21,48275.86,...,38542.71,37837.82,43446.01,36295.71,46553.41,44413.68,21908.48,59918.8,43582.32,55191.75
1,ILMN_1343292,1971.443,606.7773,8916.936,3148.337,3906.875,1553.541,6240.805,883.6413,11706.21,...,6120.768,934.4418,4432.626,478.1351,4602.682,929.2711,3352.194,857.0108,750.6904,6049.861
2,ILMN_1343293,16093.71,18485.05,10451.59,23610.13,11740.43,20074.07,9185.637,24564.12,6891.894,...,8256.981,15924.61,11993.99,19011.34,11475.96,15174.88,2004.747,10197.33,15732.33,11330.64
3,ILMN_1343294,54993.97,46674.46,48284.97,36091.75,62380.23,56790.41,44195.88,82740.92,50603.63,...,37784.98,36788.32,42816.73,32474.53,45212.2,42259.02,104338.8,64718.43,38275.22,53299.56
4,ILMN_1651209,40.00085,35.96349,31.27827,18.39922,59.5614,38.47763,19.27786,37.25268,50.21552,...,24.60747,11.57533,26.18094,36.19663,25.2418,34.48675,5.74692,16.77745,41.0845,16.70877


In [4]:
# Transpose the gene expression data to make geo_accession a column
gene_expression_transposed = gene_expression.set_index("ID_REF").transpose()
gene_expression_transposed.index.name = "geo_accession"  # Rename the index
gene_expression_transposed.reset_index(inplace=True)  # Convert index to a column

In [5]:
gene_expression_transposed.head()

ID_REF,geo_accession,ILMN_1343291,ILMN_1343292,ILMN_1343293,ILMN_1343294,ILMN_1651209,ILMN_1651217,ILMN_1651228,ILMN_1651229,ILMN_1651234,...,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,ILMN_2038774,ILMN_2038777,ILMN_2038778
0,GSM277495,56190.14,1971.443,16093.71,54993.97,40.00085,5.684647,31040.54,223.2116,-2.187667,...,21.15968,419.1906,173.02,111.4956,8.302612,624.8198,200.0302,39215.89,28558.14,11303.5
1,GSM277496,52176.93,606.7773,18485.05,46674.46,35.96349,18.28151,30282.59,221.1387,0.335718,...,13.90939,1017.365,293.4511,51.37997,-7.676422,89.04105,297.8348,48556.75,26478.78,21009.94
2,GSM277497,50483.09,8916.936,10451.59,48284.97,31.27827,13.90439,26012.85,431.4342,-10.48382,...,9.122181,619.0782,139.2744,47.95465,4.253619,331.57,160.6262,42814.16,42527.69,14241.25
3,GSM277498,38690.04,3148.337,23610.13,36091.75,18.39922,1.182301,25750.38,178.7281,5.197227,...,-3.768425,1847.47,354.8188,67.30951,-4.145857,222.8831,215.5376,37049.47,33330.46,32106.39
4,GSM277499,54803.84,3906.875,11740.43,62380.23,59.5614,18.28643,28069.13,376.0385,6.185607,...,30.74828,462.7539,180.9093,372.5614,-9.525824,307.9792,167.734,38649.68,36737.0,11845.22


In [6]:
# gene expression data
metadata_filepath = "GSE10950_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge metadata with transposed gene expression data
merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Save the merged data
# remove control probes (AFFX-*)
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]
merged_data.to_csv("GSE10950_merged_data.csv", index=False)
print("Merged data saved as GSE10950_merged_data.csv")

Merged data saved as GSE10950_merged_data.csv


In [8]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Columns: 22186 entries, geo_accession to ILMN_2038778
dtypes: float64(22184), int64(1), object(1)
memory usage: 8.1+ MB


In [9]:
merged_data

Unnamed: 0,geo_accession,target,ILMN_1343291,ILMN_1343292,ILMN_1343293,ILMN_1343294,ILMN_1651209,ILMN_1651217,ILMN_1651228,ILMN_1651229,...,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,ILMN_2038774,ILMN_2038777,ILMN_2038778
0,GSM277495,0,56190.14,1971.443,16093.71,54993.97,40.00085,5.684647,31040.54,223.2116,...,21.15968,419.1906,173.02,111.4956,8.302612,624.8198,200.0302,39215.89,28558.14,11303.5
1,GSM277496,1,52176.93,606.7773,18485.05,46674.46,35.96349,18.28151,30282.59,221.1387,...,13.90939,1017.365,293.4511,51.37997,-7.676422,89.04105,297.8348,48556.75,26478.78,21009.94
2,GSM277497,0,50483.09,8916.936,10451.59,48284.97,31.27827,13.90439,26012.85,431.4342,...,9.122181,619.0782,139.2744,47.95465,4.253619,331.57,160.6262,42814.16,42527.69,14241.25
3,GSM277498,1,38690.04,3148.337,23610.13,36091.75,18.39922,1.182301,25750.38,178.7281,...,-3.768425,1847.47,354.8188,67.30951,-4.145857,222.8831,215.5376,37049.47,33330.46,32106.39
4,GSM277499,0,54803.84,3906.875,11740.43,62380.23,59.5614,18.28643,28069.13,376.0385,...,30.74828,462.7539,180.9093,372.5614,-9.525824,307.9792,167.734,38649.68,36737.0,11845.22
5,GSM277500,1,52006.33,1553.541,20074.07,56790.41,38.47763,3.046845,37906.21,313.6529,...,4.33756,1538.207,327.6797,160.8887,3.173003,390.2606,169.4709,19928.72,35074.84,31232.45
6,GSM277501,0,45847.25,6240.805,9185.637,44195.88,19.27786,44.41323,26729.13,494.9206,...,-4.681081,569.0219,163.229,17.71757,-1.507952,392.256,230.4232,39478.76,40078.56,13788.28
7,GSM277502,1,66288.21,883.6413,24564.12,82740.92,37.25268,11.14272,36211.12,186.498,...,9.046455,871.6389,253.3224,187.331,8.515198,379.5339,183.5179,40934.46,27459.34,23013.48
8,GSM277503,0,48275.86,11706.21,6891.894,50603.63,50.21552,-3.433882,23780.8,719.8223,...,22.373,629.3758,198.7061,17.82549,-7.033357,347.5759,197.3016,38290.77,49031.66,16026.82
9,GSM277504,1,44676.39,1229.109,20303.1,42153.91,37.5521,8.733508,29171.1,239.5794,...,6.796918,540.3752,242.8107,93.37427,0.251794,179.0247,326.8972,38734.47,35752.12,26704.86


In [10]:
# gene annot data
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22179,ILMN_1815941,SMAD7
22180,ILMN_1815951,PCYOX1L
22181,ILMN_2038774,EEF1A1
22182,ILMN_2038777,ACTB


In [12]:
df_annot = df_mapped_gene

# group multiple gene symbols per probe ID
df_annot_grouped = df_annot.groupby("ID_REF")["Cleaned_Gene_Symbol"].apply(lambda x: ",".join(x)).reset_index()

In [13]:
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22178,ILMN_1815941,SMAD7
22179,ILMN_1815951,PCYOX1L
22180,ILMN_2038774,EEF1A1
22181,ILMN_2038777,ACTB


In [14]:
# Create a dictionary mapping 'ID_REF' to 'Cleaned_Gene_Symbol'
gene_mapping = dict(zip(df_annot_grouped['ID_REF'], df_annot_grouped['Cleaned_Gene_Symbol']))
gene_mapping

{'ILMN_1343291': 'EEF1A1',
 'ILMN_1343292': 'TUBB',
 'ILMN_1343293': 'TXN',
 'ILMN_1343294': 'ACTB',
 'ILMN_1651209': 'SLC35E2',
 'ILMN_1651217': 'PDCD1LG2',
 'ILMN_1651228': 'RPS28',
 'ILMN_1651229': 'IPO13',
 'ILMN_1651234': 'SYT14',
 'ILMN_1651235': 'AFAP1',
 'ILMN_1651236': 'GGTLA4',
 'ILMN_1651237': 'CDT1',
 'ILMN_1651238': 'TRPV1',
 'ILMN_1651254': 'LPP',
 'ILMN_1651259': 'UGP2',
 'ILMN_1651260': 'CCNE2',
 'ILMN_1651261': 'RSU1',
 'ILMN_1651262': 'HNRPAB',
 'ILMN_1651268': 'LOH12CR1',
 'ILMN_1651278': 'SNIP1',
 'ILMN_1651282': 'COL17A1',
 'ILMN_1651286': 'GRHL1',
 'ILMN_1651296': 'LOC143666',
 'ILMN_1651298': 'RAD17',
 'ILMN_1651303': 'ATP13A4',
 'ILMN_1651316': 'CD69',
 'ILMN_1651330': 'KCNG4',
 'ILMN_1651336': 'MLYCD',
 'ILMN_1651339': 'KIAA0701',
 'ILMN_1651343': 'ITGA11',
 'ILMN_1651346': 'TICAM2',
 'ILMN_1651349': 'CD86',
 'ILMN_1651354': 'SPP1',
 'ILMN_1651358': 'HBE1',
 'ILMN_1651364': 'PCBD2',
 'ILMN_1651370': 'USP21',
 'ILMN_1651373': 'RHD',
 'ILMN_1651375': 'BICD2',
 'I

In [15]:
# Rename columns in df_expr using the mapping (excluding first two columns)
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,EEF1A1,TUBB,TXN,ACTB,SLC35E2,PDCD1LG2,RPS28,IPO13,...,SMCR7,NUP107,FTSJ2,MGC9712,TRPM3,SMAD7,PCYOX1L,EEF1A1.1,ACTB.1,GAPDH
0,GSM277495,0,56190.14,1971.443,16093.71,54993.97,40.00085,5.684647,31040.54,223.2116,...,21.15968,419.1906,173.02,111.4956,8.302612,624.8198,200.0302,39215.89,28558.14,11303.5
1,GSM277496,1,52176.93,606.7773,18485.05,46674.46,35.96349,18.28151,30282.59,221.1387,...,13.90939,1017.365,293.4511,51.37997,-7.676422,89.04105,297.8348,48556.75,26478.78,21009.94
2,GSM277497,0,50483.09,8916.936,10451.59,48284.97,31.27827,13.90439,26012.85,431.4342,...,9.122181,619.0782,139.2744,47.95465,4.253619,331.57,160.6262,42814.16,42527.69,14241.25
3,GSM277498,1,38690.04,3148.337,23610.13,36091.75,18.39922,1.182301,25750.38,178.7281,...,-3.768425,1847.47,354.8188,67.30951,-4.145857,222.8831,215.5376,37049.47,33330.46,32106.39
4,GSM277499,0,54803.84,3906.875,11740.43,62380.23,59.5614,18.28643,28069.13,376.0385,...,30.74828,462.7539,180.9093,372.5614,-9.525824,307.9792,167.734,38649.68,36737.0,11845.22
5,GSM277500,1,52006.33,1553.541,20074.07,56790.41,38.47763,3.046845,37906.21,313.6529,...,4.33756,1538.207,327.6797,160.8887,3.173003,390.2606,169.4709,19928.72,35074.84,31232.45
6,GSM277501,0,45847.25,6240.805,9185.637,44195.88,19.27786,44.41323,26729.13,494.9206,...,-4.681081,569.0219,163.229,17.71757,-1.507952,392.256,230.4232,39478.76,40078.56,13788.28
7,GSM277502,1,66288.21,883.6413,24564.12,82740.92,37.25268,11.14272,36211.12,186.498,...,9.046455,871.6389,253.3224,187.331,8.515198,379.5339,183.5179,40934.46,27459.34,23013.48
8,GSM277503,0,48275.86,11706.21,6891.894,50603.63,50.21552,-3.433882,23780.8,719.8223,...,22.373,629.3758,198.7061,17.82549,-7.033357,347.5759,197.3016,38290.77,49031.66,16026.82
9,GSM277504,1,44676.39,1229.109,20303.1,42153.91,37.5521,8.733508,29171.1,239.5794,...,6.796918,540.3752,242.8107,93.37427,0.251794,179.0247,326.8972,38734.47,35752.12,26704.86


In [16]:
# Save the updated dataframe
df_expr_renamed.to_csv("GSE10950_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE10950_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE10950_gene_expression_renamed.csv'
