In [2]:
import pandas as pd
import numpy as np
import scipy.stats as sts
from statsmodels.stats.multitest import multipletests

In [3]:
# Load the expression table from the working directory.
data_path = 'gene_high_throughput_sequencing.csv'
genes = pd.read_csv(data_path)

In [4]:
# Preview the first ten rows of the loaded table.
genes.head(n=10)

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.76125,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.23279,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.20077,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.22777,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917
5,STT5446_Breast_005_normal,normal,3.942275,4.488477,8.944837,12.581469,23.543887,8.683617,5.418139,50.525641,...,8.683617,1.314092,1.314092,1.314092,1.314092,1.314092,18.001936,1.314092,1.314092,1.314092
6,STT5451_Breast_006_normal,normal,1.084113,3.25234,8.917889,11.249037,19.179923,6.712171,12.439778,52.498406,...,9.091914,1.084113,1.084113,1.084113,1.084113,1.084113,11.65947,1.084113,1.084113,1.084113
7,STT5466_Breast_007_normal,normal,3.1539,1.64707,4.941211,11.529492,13.813151,8.235352,1.64707,44.226216,...,10.546396,1.64707,1.64707,1.64707,1.64707,1.64707,21.071346,1.64707,1.64707,1.64707
8,STT5472_Breast_008_normal,normal,2.5518,3.838876,16.255376,12.972108,20.169502,7.097877,5.333458,52.003891,...,9.274845,1.013982,1.013982,1.013982,1.013982,1.013982,11.030215,1.013982,1.013982,1.013982
9,STT5475_Breast_009_normal,normal,3.693128,1.231043,11.079385,12.453221,21.024058,6.780055,6.155214,49.174985,...,8.848678,1.231043,1.231043,1.231043,1.231043,1.231043,24.8963,1.231043,1.231043,1.231043


In [5]:
# Number of samples in each diagnosis group.
genes.value_counts(subset=['Diagnosis'])

Diagnosis      
early neoplasia    25
normal             24
cancer             23
dtype: int64

# #1 Per-gene Welch t-tests without multiple-testing correction

In [6]:
# One sub-frame per diagnosis label, in the order normal -> early neoplasia -> cancer.
genes_normal, genes_early, genes_cancer = (
    genes[genes['Diagnosis'] == label]
    for label in ('normal', 'early neoplasia', 'cancer')
)
# Show the (rows, columns) shape of every group on one line.
print(*(group.shape for group in (genes_normal, genes_early, genes_cancer)))

(24, 15750) (25, 15750) (23, 15750)


In [7]:
# Sanity check for missing data: True if at least one cell of the table is NaN.
# (`.all().all()` is only True when *every* cell is NaN, so it cannot reveal
# individual missing values; `.any().any()` is the check that does.)
genes.isnull().any().any()

False

In [8]:
# Significance level for the uncorrected per-gene tests.
alpha = 0.05

# Every column from position 2 onwards is treated as a gene expression column.
gene_columns = genes.columns[2:]


def _welch_pvalues(group_a, group_b):
    """Return one Welch t-test p-value per gene column for two sample groups.

    Both arguments are DataFrames containing every column in `gene_columns`.
    The test runs along axis 0 (samples), so a single call covers all genes and
    yields a 1-D array ordered like `gene_columns`.
    """
    return sts.ttest_ind(
        group_a[gene_columns].to_numpy(dtype=float),
        group_b[gene_columns].to_numpy(dtype=float),
        axis=0,
        equal_var=False,  # Welch's test: do not assume equal variances
    ).pvalue


# Rows are the two comparisons, columns are genes. The frame is built in one
# step with a float dtype instead of being filled through chained indexing
# (`p_values[gene][row] = p`), which assigns into an intermediate Series and
# does not update the frame when pandas Copy-on-Write is active.
p_values = pd.DataFrame(
    [
        _welch_pvalues(genes_normal, genes_early),
        _welch_pvalues(genes_early, genes_cancer),
    ],
    index=['norm-early', 'early-cancer'],
    columns=gene_columns,
    dtype=float,
)


In [9]:
# Count the genes whose raw p-value is below alpha, separately per comparison.
n_norm_early = (p_values.loc['norm-early'] < alpha).sum()
n_early_cancer = (p_values.loc['early-cancer'] < alpha).sum()

print('statistically significant differences between normal and early: %d' % n_norm_early)
print('statistically significant differences between early and cancer: %d' % n_early_cancer)

statistically significant differences between normal and early: 1575
statistically significant differences between early and cancer: 3490


In [11]:
# Persist the number of genes with an uncorrected p-value below alpha (normal vs early).
norm_early_significant = p_values.loc['norm-early'] < alpha
with open("bio_anser_1_1.txt", "w") as fout:
    fout.write(str(norm_early_significant.sum()))

In [16]:
# Persist the number of genes with an uncorrected p-value below alpha (early vs cancer).
early_cancer_significant = p_values.loc['early-cancer'] < alpha
with open("bio_anser_1_2.txt", "w") as fout:
    fout.write(str(early_cancer_significant.sum()))

# #2 Holm correction combined with a fold-change filter

In [17]:
def fold_change(C, T):
    """Return the signed fold change between a control value C and a treatment value T.

    The result is ``T / C`` when T is strictly greater than C, and ``-C / T``
    otherwise (so equal non-zero inputs give -1).
    """
    if T > C:
        return T / C
    return -C / T


In [18]:
# Significance level passed to the multiple-testing corrections
# (presumably alpha / 2 because two comparisons are made — confirm).
alpha_2 = 0.025

# Holm correction, applied to each comparison's p-values separately.
rejected_1, p_corrected_1, _, _ = multipletests(
    p_values.loc['norm-early'], alpha=alpha_2, method='holm'
)
rejected_2, p_corrected_2, _, _ = multipletests(
    p_values.loc['early-cancer'], alpha=alpha_2, method='holm'
)


In [19]:
def _fold_change_mask(control, treatment, threshold=1.5):
    """Return a boolean array with one entry per gene column.

    An entry is True when the absolute fold change between the gene's mean in
    `control` and its mean in `treatment` is strictly greater than `threshold`.
    """
    flags = []
    for gene in genes.columns[2:]:
        change = fold_change(control[gene].mean(), treatment[gene].mean())
        flags.append(np.abs(change) > threshold)
    return np.array(flags)


indices_1 = _fold_change_mask(genes_normal, genes_early)
indices_2 = _fold_change_mask(genes_early, genes_cancer)

In [20]:
# Genes that are both rejected by the correction and pass the fold-change filter.
n_holm_1 = (rejected_1 & indices_1).sum()
n_holm_2 = (rejected_2 & indices_2).sum()

print('statistically significant differences between normal and early with Holm: %d' % n_holm_1)
print('statistically significant differences between early and cancer with Holm: %d' % n_holm_2)

statistically significant differences between normal and early with Holm: 2
statistically significant differences between early and cancer with Holm: 77


In [21]:
# Persist the count of rejected genes that also pass the fold-change filter (normal vs early).
answer_2_1 = (rejected_1 & indices_1).sum()
with open("bio_anser_2_1.txt", "w") as fout:
    fout.write(str(answer_2_1))

In [22]:
# Persist the count of rejected genes that also pass the fold-change filter (early vs cancer).
answer_2_2 = (rejected_2 & indices_2).sum()
with open("bio_anser_2_2.txt", "w") as fout:
    fout.write(str(answer_2_2))

# #3 Benjamini/Hochberg correction combined with a fold-change filter

In [23]:
# Benjamini/Hochberg (FDR) correction, applied to each comparison separately.
# NOTE: this rebinds rejected_1 / rejected_2 / p_corrected_*, so the values
# produced by the earlier 'holm' call are no longer available after this cell.
rejected_1, p_corrected_1, _, _ = multipletests(
    p_values.loc['norm-early'], alpha=alpha_2, method='fdr_bh'
)
rejected_2, p_corrected_2, _, _ = multipletests(
    p_values.loc['early-cancer'], alpha=alpha_2, method='fdr_bh'
)

In [24]:
# Genes that are both rejected by the correction and pass the fold-change filter.
n_bh_1 = (rejected_1 & indices_1).sum()
n_bh_2 = (rejected_2 & indices_2).sum()

print('statistically significant differences between normal and early with Benjamini/Hochberg: %d' % n_bh_1)
print('statistically significant differences between early and cancer with Benjamini/Hochberg: %d' % n_bh_2)

statistically significant differences between normal and early with Benjamini/Hochberg: 4
statistically significant differences between early and cancer with Benjamini/Hochberg: 524


In [25]:
# Persist the count of rejected genes that also pass the fold-change filter (normal vs early).
answer_3_1 = (rejected_1 & indices_1).sum()
with open("bio_anser_3_1.txt", "w") as fout:
    fout.write(str(answer_3_1))

In [26]:
# Persist the count of rejected genes that also pass the fold-change filter (early vs cancer).
answer_3_2 = (rejected_2 & indices_2).sum()
with open("bio_anser_3_2.txt", "w") as fout:
    fout.write(str(answer_3_2))