In [8]:
import h5py
import numpy as np
from matplotlib import pyplot as plt
import pickle
import matplotlib.patches as mpatches
import pandas as pd
import matplotlib
import matplotlib as mpl
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os
import seaborn as sns
import scipy
from matplotlib import cm
import copy
%matplotlib inline
from scipy import stats
import sys
import plotting_functions
from tsne_plotting_functions import plot_continuous_tsne, plot_categorical_tsne


In [11]:
# Phenotype column names; `compare_continuous` looks these up in the
# MDAD label table instead of the merged covariate table.
phenotypes = [
    "CERAD",
    "BRAAK",
    "PLAQUES",
    "TANGLES",
    "ABETA_IHC",
    "TAU_IHC",
]

# Phenotype labels (the original note on this table reads "normed").
path_to_MDAD_labels = "../../DATA/MTL_data/labels.csv"
MDAD_labels = pd.read_csv(path_to_MDAD_labels)

# Merged covariates; the file name suggests APOE is included -- TODO confirm.
merged_covars = pd.read_csv('../../DATA/MTL_data/merged_phenotypes_w_apoe.csv')

### Available variables:

- Mayo:   Diagnosis, Tissue, Gender, AgeAtDeath, ApoE, PMI, RIN
- HBTRC:   region (different files), disease, age, gender, pmi, ph, rin
- MSBB array:  region (different files), PMI, pH, Sex, Race, Age, CDR (Braak, NP1, PLQ_Mn, NPrSum, NTrSum)
- ACT:  structure_acronym, gender, race, white, education, autopsyage, anydementia, anyad (?), apoe_raw
- ROSMAP: cogdx, age_death, educ, msex, race, spanish, apoe_genotype, ad_reagan
- MSBB RNA: BrodmannArea, RIN, PMI, RACE, AOD, CDR, SEX, NP.1

In [12]:
def compare_continuous(var, dset, dset_to_compare):
    """Return significance markers for a numeric variable compared across two datasets.

    Rows are selected by matching the "filename" column against `dset` and
    `dset_to_compare`. Variables listed in `phenotypes` are read from
    `MDAD_labels`; every other variable is read from `merged_covars`.
    Missing values are dropped and the remainder cast to float before the
    two groups are compared with `stats.ttest_ind` (default settings).

    Parameters
    ----------
    var : column name of the variable to compare.
    dset : "filename" value of the reference dataset.
    dset_to_compare : "filename" value of the other dataset; its first
        character is used as the marker symbol.

    Returns
    -------
    str
        The marker repeated three times for p < .001, twice for p < .01,
        once for p < .05, and "" otherwise (including when either group
        has no non-missing values).
    """
    # Phenotypes live in the label table, everything else in the covariates.
    table = MDAD_labels if var in phenotypes else merged_covars

    groups = []
    for name in (dset, dset_to_compare):
        rows = table[table["filename"] == name]
        groups.append(rows[var].dropna().values.astype(float))
    own_vals, other_vals = groups

    marker = dset_to_compare[0]

    # Nothing to test when one side has no observations.
    if len(own_vals) == 0 or len(other_vals) == 0:
        return ""

    _, p = stats.ttest_ind(own_vals, other_vals)

    # A NaN p-value fails every comparison and falls through to "".
    if p < .001:
        return marker * 3
    if p < .01:
        return marker * 2
    if p < .05:
        return marker
    return ""



def compare_categorical(var, dset, dset_to_compare):
    """Return significance markers for a categorical variable compared across two datasets.

    Rows of `merged_covars` are selected by matching the "filename" column
    against `dset` and `dset_to_compare`. The non-missing values of `var`
    in each group are counted per observed level, and the resulting
    (levels x 2) contingency table is tested with
    `stats.chi2_contingency` (default settings).

    Parameters
    ----------
    var : column name of the variable to compare.
    dset : "filename" value of the reference dataset.
    dset_to_compare : "filename" value of the other dataset; its first
        character is used as the marker symbol.

    Returns
    -------
    str
        The marker repeated three times for p < .001, twice for p < .01,
        once for p < .05, and "" otherwise (including when either group
        has no non-missing values).
    """
    vals1 = merged_covars.loc[merged_covars["filename"] == dset, var].dropna()
    vals2 = merged_covars.loc[merged_covars["filename"] == dset_to_compare, var].dropna()

    # Nothing to test when one side has no observations.
    if len(vals1) == 0 or len(vals2) == 0:
        return ""

    annot = dset_to_compare[0]

    # Count each group once (previously value_counts() was re-run on every
    # loop iteration).
    counts1 = vals1.value_counts().to_dict()
    counts2 = vals2.value_counts().to_dict()

    # Every level seen in either group; a level absent from one group
    # contributes a zero count for that group.
    levels = np.unique(np.hstack([vals1.values, vals2.values]))

    # Contingency table (shape: observed levels x groups).
    contingency_table = np.array(
        [[counts1.get(level, 0), counts2.get(level, 0)] for level in levels]
    )

    p = stats.chi2_contingency(contingency_table)[1]
    p_stars = annot*3 if p<.001 else annot*2 if p <.01 else annot if p < .05 else ""

    return p_stars

In [13]:
# Rows of the summary table, grouped by variable type. The type key selects
# how each variable is summarised and which comparison helper is applied.
vars_to_show = {
    "BINARY": [
        "dementia",
        "sex_m",
        "race_w",
    ],
    "CATEGORICAL": [
        "addx_to_death_cats",
        "region",
        "apoe",
    ],
    "CONTINUOUS": [
        "RIN",
        "PMI_hours",
        "age_censored",
        "edu",
    ],
    "MD-AD_PHENOTYPE": phenotypes,
}

In [14]:
# Print one comma-separated row per variable, with one field per dataset
# (datasets in `merged_covars.groupby("filename")` order). Each field holds a
# summary of the variable in that dataset followed by one significance marker
# slot for every other dataset.

# Dataset names are the same for every row, so compute them once.
all_dsets = np.unique(merged_covars["filename"])

# Phenotype tables are only needed for the MD-AD_PHENOTYPE rows; each file is
# read on first use and then reused instead of being re-read per variable.
phens_by_dset = {}

for vartype, variables in vars_to_show.items():
    # Binary/categorical rows use the contingency-table helper, numeric rows
    # use the t-test helper.
    if vartype in ("BINARY", "CATEGORICAL"):
        compare = compare_categorical
    else:
        compare = compare_continuous

    for var in variables:
        print()
        print(var, end=",")
        for dset, df in merged_covars.groupby("filename"):

            if vartype=="BINARY":
                # Percentage of rows where the indicator is set.
                summary = "%.2f"%(np.mean(df[var])*100)

            elif vartype=="CATEGORICAL":
                # "(level/level/...) pct/pct/..." over the non-missing values.
                vals,counts=np.unique(df[var].dropna(), return_counts=True)
                summary = "(%s) %s"%("/".join(vals.astype(str)),
                                    "/".join(np.round(counts/(np.sum(counts))*100,1).astype(str)))

            elif vartype=="CONTINUOUS":
                summary = "%.2f +- %.2f"%(np.nanmean(df[var]), np.nanstd(df[var]))

            elif vartype=="MD-AD_PHENOTYPE":
                if dset not in phens_by_dset:
                    phens_by_dset[dset] = pd.read_csv("../../DATA/MTL_data/samples_neuropath_prenorm/%s"%dset, delimiter="\t")
                phens_df = phens_by_dset[dset]

                # NOTE(review): the summary comes from this per-dataset file,
                # while compare_continuous reads phenotypes from MDAD_labels.
                if var not in phens_df.columns:
                    print("NA", end = ',')
                    continue
                summary = "%.2f +- %.2f (%.2f-%.2f)"%(np.nanmean(phens_df[var]), np.nanstd(phens_df[var]),
                                            np.nanmin(phens_df[var]), np.nanmax(phens_df[var]))

            else:
                # Unknown variable type: nothing is printed for this dataset.
                continue

            print(summary, end = ' ')
            for other_dset in np.setdiff1d(all_dsets, dset):
                print(compare(var, dset, other_dset), end=' ')
            print("",end=",")
    print()



dementia,48.19 MMM  ,59.27 AAA RRR ,42.80  MMM ,
sex_m,62.31 MMM RRR ,36.63 AAA  ,37.08 AAA  ,
race_w,97.63 MMM  ,80.09 AAA RRR ,98.52  MMM ,

addx_to_death_cats,(0.0/1.0/2.0/3.0) 60.8/12.8/16.0/10.4   ,()    ,(0.0/1.0/2.0/3.0) 56.1/16.1/17.2/10.7   ,
region,(FWM/HIP/PCx/TCx) 24.6/24.9/24.0/26.4 MMM RRR ,(BM10/BM22/BM36/BM44) 27.8/25.8/22.8/23.7 AAA RRR ,(DLPFC) 100.0 AAA MMM ,
apoe,(23.0/24.0/33.0/34.0/44.0) 8.0/1.3/70.5/18.9/1.3 MMM R ,(22.0/23.0/24.0/33.0/34.0/44.0) 1.2/11.7/0.7/55.0/29.2/2.1 AAA RR ,(22.0/23.0/24.0/33.0/34.0/44.0) 0.9/13.1/2.2/61.1/21.8/0.9 A MM ,

RIN,6.35 +- 1.07 MMM  ,6.84 +- 1.47 AAA  ,nan +- nan   ,
PMI_hours,nan +- nan   ,7.10 +- 5.33   ,7.16 +- 4.84   ,
age_censored,86.97 +- 4.08 MMM  ,83.30 +- 7.52 AAA RRR ,86.55 +- 4.61  MMM ,
edu,14.31 +- 3.13  RRR ,nan +- nan   ,16.51 +- 3.49 AAA  ,

CERAD,1.49 +- 1.08 (0.00-3.00) M  ,1.66 +- 1.28 (0.00-3.00) A  ,1.61 +- 1.16 (0.00-3.00)   ,
BRAAK,3.45 +- 1.67 (0.00-6.00)   ,3.68 +- 1.86 (0.00-6.00)   ,3.37 +- 1.28 (0.00-6.00)   ,
PLAQUES,NA,8.05 +- 8.79 (0.00-42.00)   ,0.73 +- 0.79 (0.00-4.96)   ,
TANGLES,NA,NA,0.55 +- 0.73 (0.00-6.17)   ,
ABETA_IHC,0.02 +- 0.02 (0.00-0.09)   ,NA,4.72 +- 5.21 (0.00-26.31)   ,
TAU_IHC,0.02 +- 0.03 (0.00-0.11)  RRR ,NA,1.44 +- 5.68 (0.00-89.87) AAA  ,
