In [50]:
import pandas as pd
from scipy.stats import chi2_contingency 
from scipy import stats
import numpy as np
import statsmodels.stats.api as sms
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [123]:
data = pd.read_csv('data/gene_prot_data.csv')

In [122]:
data

Unnamed: 0,Subject_ID,Diagnosis,Age,APOE,MMSE,Sex,PC1,PC2,PC3,PC4,...,xu.S6OENiSSCDruXFc,xu3n8ix_RpCpNPRKhc,xuBm5ckgidWRNTl.gQ,xuNOUeR1JerhIuIV7c,xud4v2r3nXECBua55o,xvHfVEL1fC35XJ7neU,xve7nuGHtAizelOmhE,xvrrv4q_nIDgJej.uU,xws9e3UChad1OnXmXY,xz1S1tKD.sgqfTuesU
0,DCR00025,0,75.000000,0,moderate,0,0.0088,0.0276,0.0132,0.0005,...,0.010746,0.424735,0.398869,0.066384,0.321479,0.149548,0.095191,0.004099,0.253679,0.000000
1,DCR00028,0,88.000000,1,moderate,0,0.0114,0.0284,0.0192,-0.0095,...,0.239652,0.297554,0.341023,0.071577,0.429470,0.233205,0.268920,0.158143,0.277040,0.361951
2,DCR00031,0,75.461386,0,moderate,0,0.0097,28.0000,13.0000,-0.0234,...,0.058596,0.438859,0.216219,0.599177,0.179004,0.193932,0.098406,0.122535,0.141998,0.263429
3,DCR00032,0,75.461386,0,moderate,1,0.0076,28.0000,9.0000,-0.0011,...,0.287620,0.579192,0.211476,0.200429,0.001667,0.218742,0.263119,0.261860,0.221761,0.162624
4,DCR00037,0,75.461386,2,moderate,1,0.0097,0.0244,18.0000,-0.0043,...,0.316210,0.422101,0.247895,0.154150,0.334002,0.184145,0.304057,0.309619,0.165162,0.385041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,THSMCI061,2,67.000000,1,high,1,0.0073,-0.0539,0.0073,0.0216,...,0.589071,0.961699,0.622341,0.796417,0.776029,0.823258,0.585026,0.371426,0.803298,0.525950
513,THSMCI064,2,80.000000,1,high,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
514,THSMCI064,0,80.000000,1,moderate,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
515,THSMCI065,2,76.000000,2,high,0,0.0011,-0.0498,0.0126,0.0303,...,0.659313,0.911345,0.773895,0.843128,0.824156,0.783705,0.709999,0.510799,0.855688,0.716505


### 3. Statistical Tests

#### Chi Squared Tests

##### H₀ — that the 2 categorical variables being compared are independent of each other (no association) 
##### H₁ — that the 2 categorical variables being compared are dependent on each other (association).

##### 1. Gender and AD Diagnosis

In [111]:
data['Sex'] = data['Sex'].replace({0:'Female', 1:'Male'})
data['Diagnosis'] = data['Diagnosis'].replace({0:'AD', 1:'CTL', 2:'MCI'})  

chi = pd.crosstab(data.Sex, data.Diagnosis,margins=True)
print(chi)

obs = np.array([chi.iloc[0][0:2].values,chi.iloc[1][0:2].values])
chi_squared = stats.chi2_contingency(obs, correction=False)
print("\n(X^2, p, df):",chi_squared[0:3])

print("\nConclusion:")
if(chi_squared[1] < 0.05):
    print("At 5% significance level, we fail to reject Ha which means there is sufficient evidence to suggest that there is an association between AD diagnosis and the gender feature")
else:
    print("At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the gender feature")
    

Diagnosis   AD  CTL  MCI  All
Sex                          
Female      84   57   55  196
Male       164   85   72  321
All        248  142  127  517

(X^2, p, df): (1.537815592898336, 0.2149432428442663, 1)

Conclusion:
At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the gender feature


Given that the p-value is 0.2149432428442663, which is much higher than the common significance level of 0.05, we fail to reject the null hypothesis. This means there is not enough evidence to suggest that there is a statistically significant association between sex and diagnosis category in this sample.

##### 2. APOE and AD Diagnosis

In [112]:
data['APOE'] = data['APOE'].replace({0:'E4E4', 1:'E3E3', 2:'E3E4', 3:'E2E3', 4:'E2E4'})
data['Diagnosis'] = data['Diagnosis'].replace({0:'AD', 1:'CTL', 2:'MCI'})  

chi = pd.crosstab(data.APOE, data.Diagnosis,margins=True)
print(chi)

obs = np.array([chi.iloc[0][0:2].values,chi.iloc[1][0:2].values])
chi_squared = stats.chi2_contingency(obs, correction=False)
print("\n(X^2, p, df):",chi_squared[0:3])

print("\nConclusion:")
if(chi_squared[1] < 0.05):
    print("At 5% significance level, we fail to reject Ha which means there is sufficient evidence to suggest that there is an association between AD diagnosis and the APOE feature")
else:
    print("At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the APOE feature")
    

Diagnosis   AD  CTL  MCI  All
APOE                         
E2E3        16   23   12   51
E2E4         4    4    2   10
E3E3       104   37   46  187
E3E4        90   72   61  223
E4E4        34    6    6   46
All        248  142  127  517

(X^2, p, df): (0.2187084520417853, 0.6400257800420942, 1)

Conclusion:
At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the APOE feature


##### 3. MMSE and AD Diagnosis

In [113]:
#create new catgeory based on MMSE score: 0-0.25: very low, 0.25-0.5: low, 0.5-0.75: moderate, 0.75-1: high
data['MMSE_Cat'] = pd.cut(data['MMSE'], bins=[0,0.5,0.75,1], labels=['low','moderate','high'])

In [114]:
data['Diagnosis'] = data['Diagnosis'].replace({0:'AD', 1:'CTL', 2:'MCI'})  

chi = pd.crosstab(data.MMSE_Cat, data.Diagnosis,margins=True)
print(chi)

obs = np.array([chi.iloc[0][0:2].values,chi.iloc[1][0:2].values])
chi_squared = stats.chi2_contingency(obs, correction=True)
print("\n(X^2, p, df):",chi_squared[0:3])

print("\nConclusion:")
if(chi_squared[1] < 0.05):
    print("At 5% significance level, we fail to reject Ha which means there is sufficient evidence to suggest that there is an association between AD diagnosis and the MMSE score")
else:
    print("At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the MMSE score")
    

Diagnosis   AD  CTL  MCI  All
MMSE_Cat                     
low         97    0    4  101
moderate    85    1   14  100
high        65  141  109  315
All        247  142  127  516

(X^2, p, df): (0.0036461530557670577, 0.9518502922042056, 1)

Conclusion:
At 5% significance level, we fail to reject Ho which means there is sufficient evidence to suggest that there is no association between AD diagnosis and the MMSE score


### Confidence Interval

#### 1. Gender and Diagnosis

In [115]:
#95% confidence interval for difference in mean
data['Sex'] = data['Sex'].replace({'Female':0, 'Male':1})
data['Diagnosis'] = data['Diagnosis'].replace({'AD':0, 'CTL':1, 'MCI':2})  
female = data.query('Sex == 0')
male = data.query('Sex == 1')

cm = sms.CompareMeans(sms.DescrStatsW(female['Diagnosis']), sms.DescrStatsW(male['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nGiven that the value of zero is included in the interval, this suggest that the means of the two population can be assumed equal as there is a difference between them. ")


Confidence Interval for u1-u2:  (-0.008010776829207589, 0.2853011322237019)

Given that the value of zero is included in the interval, this suggest that the means of the two population can be assumed equal as there is a difference between them. 


#### 2. APOE and Diagnosis

In [116]:
#95% confidence interval for difference in mean
data['APOE'] = data['APOE'].replace({'E4E4':0, 'E3E3':1, 'E3E4':2, 'E2E3':3, 'E2E4':4})
e4e4 = data.query('APOE == 0')
e3e3 = data.query('APOE == 1')
e3e4 = data.query('APOE == 2')
e2e3 = data.query('APOE == 3')

print("\nE4E4 vs E3E3")
cm = sms.CompareMeans(sms.DescrStatsW(e4e4['Diagnosis']), sms.DescrStatsW(e3e3['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nE4E4 vs E3E4")
cm = sms.CompareMeans(sms.DescrStatsW(e4e4['Diagnosis']), sms.DescrStatsW(e3e4['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nE4E4 vs E2E3")
cm = sms.CompareMeans(sms.DescrStatsW(e4e4['Diagnosis']), sms.DescrStatsW(e2e3['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nE3E3 vs E3E4")
cm = sms.CompareMeans(sms.DescrStatsW(e3e3['Diagnosis']), sms.DescrStatsW(e3e4['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nE3E3 vs E2E3")
cm = sms.CompareMeans(sms.DescrStatsW(e3e3['Diagnosis']), sms.DescrStatsW(e2e3['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nE3E4 vs E2E3")
cm = sms.CompareMeans(sms.DescrStatsW(e3e4['Diagnosis']), sms.DescrStatsW(e2e3['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))


E4E4 vs E3E3
Confidence Interval for u1-u2:  (-0.541390023851572, -0.055680424881280804)

E4E4 vs E3E4
Confidence Interval for u1-u2:  (-0.7150975430506388, -0.24220407519853254)

E4E4 vs E2E3
Confidence Interval for u1-u2:  (-0.8243880437111856, -0.23614051553860105)

E3E3 vs E3E4
Confidence Interval for u1-u2:  (-0.341874553778563, -0.018356615737755516)

E3E3 vs E2E3
Confidence Interval for u1-u2:  (-0.4722810145422946, 0.008822904025360623)

E3E4 vs E2E3
Confidence Interval for u1-u2:  (-0.2856673902729793, 0.18244044927236386)


Based on the confidence intervals, the differences between certain APOE genotypes suggest the following for Alzheimer's disease (AD) diagnosis:

- E4E4 vs E3E3: The confidence interval does not contain zero, indicating a statistically significant difference between these two genotypes for AD diagnosis. The E4E4 genotype appears to be associated with a higher risk of AD compared to E3E3.
- E4E4 vs E3E4: The confidence interval does not contain zero, suggesting a significant difference between E4E4 and E3E4 genotypes for AD diagnosis. The E4E4 genotype likely has a higher risk of AD than E3E4.
- E4E4 vs E2E3: The confidence interval does not contain zero, implying a significant difference between E4E4 and E2E3 genotypes. The E4E4 genotype is likely associated with a higher risk of AD compared to E2E3.
- E3E3 vs E3E4: The confidence interval does not contain zero, indicating a significant difference between these genotypes for AD diagnosis. The E3E4 genotype may have a higher risk of AD than E3E3.
- E3E3 vs E2E3 and E3E4 vs E2E3: The confidence intervals contain zero, suggesting no statistically significant difference between these genotype pairs for AD diagnosis.


In summary, the presence of the E4 allele, particularly the E4E4 genotype, appears to be associated with a higher risk of Alzheimer's disease compared to other APOE genotypes like E3E3, E3E4, and E2E3.

#### 3. MMSE and AD

In [119]:
data['MMSE'] = pd.cut(data['MMSE'], bins=[0,0.25, 0.5,0.75,1], labels=['very low','low','moderate','high'])
very_low = data.query('MMSE == "very low"')
low = data.query('MMSE == "low"')
moderate = data.query('MMSE == "moderate"')
high = data.query('MMSE == "high"')

print("\nVery Low vs Low")
cm = sms.CompareMeans(sms.DescrStatsW(very_low['Diagnosis']), sms.DescrStatsW(low['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nVery Low vs Moderate")
cm = sms.CompareMeans(sms.DescrStatsW(very_low['Diagnosis']), sms.DescrStatsW(moderate['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nVery Low vs High")
cm = sms.CompareMeans(sms.DescrStatsW(very_low['Diagnosis']), sms.DescrStatsW(high['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nLow vs Moderate")
cm = sms.CompareMeans(sms.DescrStatsW(low['Diagnosis']), sms.DescrStatsW(moderate['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nLow vs High")
cm = sms.CompareMeans(sms.DescrStatsW(low['Diagnosis']), sms.DescrStatsW(high['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))

print("\nModerate vs High")
cm = sms.CompareMeans(sms.DescrStatsW(moderate['Diagnosis']), sms.DescrStatsW(high['Diagnosis']))
print("Confidence Interval for u1-u2: ", cm.tconfint_diff(usevar='unequal'))


Very Low vs Low
Confidence Interval for u1-u2:  (-0.19517991106772828, -0.0023509531298025743)

Very Low vs Moderate
Confidence Interval for u1-u2:  (-0.4290096685427071, -0.15099033145729285)

Very Low vs High
Confidence Interval for u1-u2:  (-1.2207358907517094, -1.0586291886133699)

Low vs Moderate
Confidence Interval for u1-u2:  (-0.35938720431286864, -0.02308193148960047)

Low vs High
Confidence Interval for u1-u2:  (-1.1662836476249154, -0.915550567542633)

Moderate vs High
Confidence Interval for u1-u2:  (-1.0100960549637923, -0.6892690244012869)


Based on the confidence intervals comparing different MMSE score bands, this image suggests the following about the relationship between MMSE scores and Alzheimer's disease (AD) diagnosis:

- Very Low vs Low: The confidence interval contains zero, indicating there may not be a statistically significant difference in AD diagnosis between the "Very Low" and "Low" MMSE score bands.
- Very Low vs Moderate: The confidence interval does not contain zero, suggesting a statistically significant difference between the "Very Low" and "Moderate" MMSE bands for AD diagnosis. The "Very Low" band is likely associated with a higher risk of AD compared to the "Moderate" band.
- Very Low vs High: The confidence interval does not contain zero, implying a significant difference between the "Very Low" and "High" MMSE bands in terms of AD diagnosis. The "Very Low" band is probably associated with a higher risk of AD compared to the "High" band.
- Low vs Moderate: The confidence interval contains zero, indicating no statistically significant difference in AD diagnosis between the "Low" and "Moderate" MMSE bands.
- Low vs High: The confidence interval does not contain zero, suggesting a significant difference between the "Low" and "High" MMSE bands for AD diagnosis. The "Low" band is likely associated with a higher risk of AD compared to the "High" band.
- Moderate vs High: The confidence interval does not contain zero, implying a significant difference between the "Moderate" and "High" MMSE bands in terms of AD diagnosis. The "Moderate" band is probably associated with a higher risk of AD compared to the "High" band.


In summary, lower MMSE scores, particularly in the "Very Low" and "Low" bands, appear to be associated with a higher risk of Alzheimer's disease compared to higher MMSE scores in the "Moderate" and "High" bands.

### Hypothesis Testing

In [124]:
data = pd.read_csv('data/gene_prot_data.csv')

In [125]:
import pandas as pd
from scipy import stats

data_no_key = data.drop(columns=['Subject_ID'])
ad = data_no_key.query('Diagnosis == 0')
ctl = data_no_key.query('Diagnosis == 1')
# Perform an independent t-test 
t_stat, p_val = stats.ttest_ind(ad, ctl, axis=0)

significant_features = []
# Iterate over the T-statistics and P-values to print out results for each comparison
for i, (t, p) in enumerate(zip(t_stat, p_val)):
    # print(f"Comparison {i}: T-statistic = {t}, P-value = {p}")
    if p < 0.05:
        print(f"  -> Significant difference detected at comparison {i}")
        print(f"Feature {data_no_key.columns[i]} is significantly different between AD and CTL groups")
        significant_features.append(data_no_key.columns[i])


  -> Significant difference detected at comparison 0
Feature Diagnosis is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 1
Feature Age is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 2
Feature APOE is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 3
Feature MMSE is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 12
Feature PC8 is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 19
Feature P01033 is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 24
Feature P48061 is significantly different between AD and CTL groups
  -> Significant difference detected at comparison 26
Feature P35916 is significantly different between AD and CTL groups
  -> Significant difference detecte

  t_stat, p_val = stats.ttest_ind(ad, ctl, axis=0)


In [126]:
print(f"Significant features: {len(significant_features)}")

Significant features: 5230
