In [21]:
import pandas as pd
from scipy.stats import ttest_ind

In [22]:
# Read the GSE44861 expression table from the working directory and preview
# its first five rows. The preview shows a `geo_accession` column, a `target`
# column, and then the expression columns.
df = pd.read_csv(filepath_or_buffer="GSE44861_gene_expression_renamed.csv")
df.head(n=5)

Unnamed: 0,geo_accession,target,DDR1,RFC2,HSPA6,PAX8,GUCA1A,UBE1L,THRA,PTPN21,...,LRCH4.4,LRTM1.1,DGCR8.3,91682_at,EXOSC4.2,"MGC15523,EHBP1L1",RKHD1.1,EPS8L1.4,BCAN.2,LOC90379.3
0,GSM1092909,0,11.7622,6.29542,5.5566,7.8109,5.24249,9.1622,5.74971,5.90899,...,7.86943,4.98448,7.18908,6.43754,6.41134,6.66857,6.01868,9.06129,5.89208,6.47189
1,GSM1092910,1,10.5785,6.77451,5.58586,7.66729,5.21854,7.73865,5.97187,5.52396,...,7.39007,5.03993,7.93754,6.6112,6.65593,6.01227,5.71611,9.28618,5.89694,6.26747
2,GSM1092911,0,10.8828,6.30281,5.61578,8.43118,5.33201,8.72661,6.56693,5.67954,...,10.0092,5.17202,8.47201,6.86043,6.45193,6.36926,5.39938,10.2493,6.24988,6.05145
3,GSM1092912,1,11.233,8.18885,6.0957,7.81232,5.30253,8.26605,5.69351,5.79842,...,7.96354,4.9383,7.18008,6.69417,7.1877,6.14987,7.98777,8.50138,6.00022,6.56582
4,GSM1092913,0,10.0557,6.27496,5.99864,7.52846,5.30274,8.74045,6.33641,5.81155,...,9.09736,5.016,7.95282,6.7563,6.34023,6.49607,5.50074,8.46019,5.874,6.52917


In [23]:
# Split the samples by their 'target' label (0 = control, 1 = case) and strip
# the two non-expression columns so that only expression columns remain.
group_0, group_1 = (
    df.loc[df['target'] == label].drop(columns=['geo_accession', 'target'])
    for label in (0, 1)
)

In [24]:
# Perform a t-test for each gene, comparing group_0 against group_1.
#
# equal_var=False selects Welch's t-test (no equal-variance assumption).
# ttest_ind operates along axis 0, so passing the two sample-by-gene matrices
# tests every gene column in one vectorized call instead of one SciPy call per
# gene. group_1 is re-ordered by group_0's column labels so that p_values[i]
# always belongs to group_0.columns[i].
p_values = ttest_ind(
    group_0.to_numpy(),
    group_1[group_0.columns].to_numpy(),
    axis=0,
    equal_var=False,
).pvalue

# One record per gene, in the same column order as group_0.
deg_results = [
    {'Gene': gene, 'p_value': p_value}
    for gene, p_value in zip(group_0.columns, p_values)
]

# Convert results to DataFrame
df_deg = pd.DataFrame(deg_results)

In [25]:
# Show the per-gene table of raw p-values (pandas truncates the display of long frames).
df_deg

Unnamed: 0,Gene,p_value
0,DDR1,0.069367
1,RFC2,0.000625
2,HSPA6,0.113509
3,PAX8,0.795098
4,GUCA1A,0.826225
...,...,...
22210,"MGC15523,EHBP1L1",0.243882
22211,RKHD1.1,0.000019
22212,EPS8L1.4,0.068117
22213,BCAN.2,0.187490


In [26]:
# Keep the genes whose unadjusted p-value is strictly below the cutoff.
# NOTE: no multiple-testing correction is applied at this step, so with
# thousands of genes tested this list is expected to contain false positives.
deg_threshold = 0.05
deg_significant = df_deg.loc[df_deg['p_value'].lt(deg_threshold)]

# Report the cutoff that was used, then the selected genes.
print(f"Significant DEGs (p-value < {deg_threshold}):")
print(deg_significant)


Significant DEGs (p-value < 0.05):
           Gene       p_value
1          RFC2  6.252035e-04
5         UBE1L  2.843866e-08
8          CCL5  5.027944e-05
10        EPHB3  5.823502e-03
11        ESRRA  7.032921e-05
...         ...           ...
22204  CENTA1.1  3.279836e-03
22205   LRCH4.4  3.933905e-04
22207   DGCR8.3  3.260815e-03
22209  EXOSC4.2  1.596272e-05
22211   RKHD1.1  1.946769e-05

[8055 rows x 2 columns]


In [27]:
# Write the genes that passed the raw p-value filter to disk; the row index is
# left out of the file.
deg_significant.to_csv(path_or_buf='deg_raw_44861.csv', index=False)

print("DEG results saved to 'deg_raw_44861.csv'")

DEG results saved to 'deg_raw_44861.csv'


In [28]:
# Apply multiple testing correction (FDR using Benjamini-Hochberg)
from statsmodels.stats.multitest import multipletests

# Adjusted p-value cutoff for calling a gene differentially expressed.
fdr_threshold = 0.05

# Correct only the genes that actually have a p-value. NaN p-values can arise,
# for example, for a gene whose values are constant; a single NaN passed to
# multipletests would propagate through the Benjamini-Hochberg step and turn
# every adjusted p-value into NaN, silently emptying the filtered table.
has_p_value = df_deg['p_value'].notna()
df_deg['adj_p_value'] = float('nan')
if has_p_value.any():
    # multipletests returns (reject, pvals_corrected, ...); [1] is the adjusted p-values.
    df_deg.loc[has_p_value, 'adj_p_value'] = multipletests(
        df_deg.loc[has_p_value, 'p_value'], method='fdr_bh'
    )[1]

# Filter significantly differentially expressed genes (adj p-value < fdr_threshold).
# Genes without a p-value keep a NaN adjusted value and are therefore excluded.
df_deg_filtered = df_deg[df_deg['adj_p_value'] < fdr_threshold]

In [29]:
# (rows, columns) of the FDR-filtered table; the row count is the number of genes with adj_p_value < 0.05.
df_deg_filtered.shape

(5978, 3)

In [30]:
# Preview the first five FDR-significant genes with their raw and adjusted p-values.
df_deg_filtered.head()

Unnamed: 0,Gene,p_value,adj_p_value
1,RFC2,0.0006252035,0.004295
5,UBE1L,2.843866e-08,1e-06
8,CCL5,5.027944e-05,0.000565
10,EPHB3,0.005823502,0.0259
11,ESRRA,7.032921e-05,0.000742


In [31]:
# Preview the last five FDR-significant genes with their raw and adjusted p-values.
df_deg_filtered.tail()

Unnamed: 0,Gene,p_value,adj_p_value
22204,CENTA1.1,0.00328,0.016518
22205,LRCH4.4,0.000393,0.00295
22207,DGCR8.3,0.003261,0.016434
22209,EXOSC4.2,1.6e-05,0.00022
22211,RKHD1.1,1.9e-05,0.000261


In [32]:
# Write the FDR-filtered gene table (Gene, p_value, adj_p_value) to disk; the
# row index is left out of the file.
df_deg_filtered.to_csv(path_or_buf="DEG_results_44861.csv", index=False)

print("DEG Analysis completed! Significant genes saved in 'DEG_results_44861.csv'")

DEG Analysis completed! Significant genes saved in 'DEG_results_44861.csv'
