In [1]:
import numpy as np
import pandas as pd
import scipy
import os
from sklearn import metrics
from matplotlib import pyplot as plt
import matplotlib as mpl
import pickle
import seaborn as sns
import matplotlib
%matplotlib inline

from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
sns.set(style="whitegrid")


In [2]:
# Identifier of the dataset variant analysed in this notebook; it is interpolated
# into the file name of the standardized CSV when the data are loaded.
dset = "2yrprev_within3"

In [3]:
# NOTE: the categorical features listed here are the raw columns from before
# standardization, NOT the one-hot encoded versions that are fed to the model.

# Individual cognitive test scores.
cognitive_features = [
    'cts_animals', 'cts_bname', 'cts_catflu', 'cts_db', 'cts_delay', 'cts_df',
    'cts_doperf', 'cts_ebdr', 'cts_ebmt', 'cts_fruits', 'cts_idea', 'cts_lopair',
    'cts_mmse30', 'cts_nccrtd', 'cts_pmat', 'cts_pmsub', 'cts_read_nart', 'cts_sdmt',
    'cts_story', 'cts_stroop_cname', 'cts_stroop_wread', 'cts_wli', 'cts_wlii', 'cts_wliii',
]
medical_features_sums = ['med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum']
continuous_demographics = ['age_at_visit', 'educ']

# Tests that make up each cognitive composite score.
_episodic_tests = ["cts_wli", "cts_wlii", "cts_wliii", "cts_ebmt", "cts_ebdr", "cts_story", "cts_delay"]
_orientation_tests = ["cts_lopair", "cts_pmat"]
_speed_tests = ["cts_sdmt", "cts_nccrtd", "cts_stroop_cname", "cts_stroop_wread"]
_semantic_tests = ["cts_bname", "cts_animals", "cts_fruits", "cts_catflu", "cts_read_nart"]
_working_tests = ["cts_db", "cts_df", "cts_doperf"]

composite_vars = {
    "cogn_ep": _episodic_tests,
    "cogn_po": _orientation_tests,
    "cogn_ps": _speed_tests,
    "cogn_se": _semantic_tests,
    "cogn_wo": _working_tests,
    # The global composite lists the semantic tests without the separate
    # animals/fruits fluency scores (only the combined cts_catflu is included).
    "cogn_global": (
        _episodic_tests
        + _orientation_tests
        + _speed_tests
        + ["cts_bname", "cts_catflu", "cts_read_nart"]
        + _working_tests
    ),
}

# Binary indicators: these can stay as they are.
binary = [
    'hypertension_cum', 'cancer_cum', 'diabetes_sr_rx', 'dm_cum', 'headinjrloc_cum', 'lostcons',
    'thyroid_cum', 'chf_cum', 'claudication_cum', 'heart_cum', 'stroke_cum', "msex", "spanish",
]


# Multi-level categoricals: these need to be one-hot encoded.
categorical = ['apoe_4count', 'race', 'dcfdx']


In [4]:
# Standardized, merged table for the dataset variant selected by `dset`;
# the first CSV column is used as the row index.
standardized_csv = "../DATA/PROCESSED/standardized/merged_data_all_%s.csv" % dset
load_data = pd.read_csv(standardized_csv, index_col=0)

# Columns that describe the sample (identifiers and labels) rather than features.
sample_info = [
    "projid",
    "study",
    "fu_year",
    "scaled_to",
    "onset_label_time",
    'onset_label_time_binary',
]
data = load_data.loc[:, sample_info]

In [5]:
# Feature columns = every column of the loaded table that is not sample metadata.
# np.setdiff1d returns them as a sorted array of unique names.
feature_names = np.setdiff1d(load_data.columns, sample_info)
features = load_data.loc[:, feature_names]

# demographics table

In [6]:
# Number of APOE e4 alleles implied by each two-digit APOE genotype code.
# Any genotype that is missing or not listed here maps to NaN.
_APOE4_COUNT_BY_GENOTYPE = {
    22.: 0, 23.: 0, 33.: 0,  # no e4 allele
    24.: 1, 34.: 1,          # one e4 allele
    44.: 2,                  # two e4 alleles
}

# Un-standardized ("kept") data for the same dataset variant as `dset`.
# The file name is derived from `dset` so that changing the variant in one place
# cannot silently mix two different datasets in the notebook.
orig_data_features = (
    pd.read_csv("../DATA/PROCESSED/merged_kept_data_%s.csv" % dset)
    .drop(columns=['Unnamed: 0'])  # leftover index column written by an earlier to_csv
    .assign(apoe_4count=lambda df: df["apoe_genotype"].map(_APOE4_COUNT_BY_GENOTYPE))
)


In [7]:
# Split of all model inputs by measurement type.
categorical_features = [*binary, *categorical]
continuous_features = [
    *cognitive_features,
    *continuous_demographics,
    *medical_features_sums,
    *composite_vars,  # iterating the dict yields the composite score names
]

# Feature groups used as sections of the descriptive table.
demo_feats = ['age_at_visit', 'msex', 'educ', 'race', 'spanish', 'apoe_4count']
# cog_feats = ['dcfdx', 'cogn_ep', 'cogn_po', 'cogn_ps', 'cogn_se', 'cogn_wo','cogn_global']
cog_feats = ['dcfdx', 'cogn_global']

med_feats = [
    'dcfdx', 'med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum',
    'cancer_cum', 'claudication_cum', 'diabetes_sr_rx', 'dm_cum', 'headinjrloc_cum',
    'heart_cum', 'hypertension_cum', 'stroke_cum', 'thyroid_cum',
]

In [8]:
# Features compared between groups with a t-test: demographics and composite scores,
# the medical summary scores, and every individual cognitive test.
t_test_vars = (
    ['age_at_visit', 'educ', 'cogn_ep', 'cogn_po', 'cogn_ps', 'cogn_se', 'cogn_wo', 'cogn_global']
    + ['med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum']
    + cognitive_features
)
#u_test_vars = ['med_con_sum_cum','vasc_3dis_sum', 'vasc_risks_sum']

# Features compared between groups with a chi-square test.
chi_test_vars = [
    'msex', 'race', 'spanish', 'apoe_4count', 'dcfdx',
    'cancer_cum', 'claudication_cum', 'diabetes_sr_rx', 'dm_cum', 'headinjrloc_cum',
    'heart_cum', 'hypertension_cum', 'stroke_cum', 'thyroid_cum',
]

In [9]:
# (section title, composite key) for each cognitive domain, in table order.
_COGNITIVE_DOMAINS = [
    ("Episodic Memory (EM)", "cogn_ep"),
    ("Perceptual Orientation (PO)", "cogn_po"),
    ("Perceptual Speed (PS)", "cogn_ps"),
    ("Semantic Memory (SM)", "cogn_se"),
    ("Working Memory (WM)", "cogn_wo"),
]

# Ordered (section title, feature list) pairs that define the sections of the table.
FEATURE_GROUPS = (
    [("Demographics", demo_feats)]
    + [(title, composite_vars[key]) for title, key in _COGNITIVE_DOMAINS]
    + [
        ("Global Cognition", ["cts_mmse30"]),
        ("Medical history/lifestyle factors", med_feats),
        ("Diagnosis", ['onset_label_time_binary']),
    ]
)

In [10]:
# Human-readable row labels for the continuous features of the descriptive table.
# The parenthesised durations are part of the printed label.
var_names = {
    # individual cognitive tests
    'cts_animals': "Categorical fluency: animals (1min)",
    'cts_bname': "Boston naming (5min)",
    'cts_catflu': "Categorical fluency (combined)",
    'cts_db': "Digits backward (5min)",
    'cts_delay': "Logical memory II (3min)",
    'cts_df': "Digits forward (5min)",
    'cts_doperf': "Digit ordering (5min)",
    # cts_ebmt is the immediate recall of the East Boston story and cts_ebdr its
    # delayed recall (per the RADC codebook); the two labels used to be swapped.
    'cts_ebdr': "East boston test: delayed (3min)",
    'cts_ebmt': "East boston test: immediate (3min)",
    'cts_fruits': "Categorical fluency: fruits (1min)",
    'cts_idea': "Complex Ideational Material",
    'cts_lopair': "Line orientation (15min)",
    'cts_mmse30': "Mini-mental state exam (5-10min)",
    'cts_nccrtd': "Number comparison (3min)",
    'cts_pmat': "Progressive matrices (20min)",
    'cts_pmsub': "Progressive matrices (subset)",
    'cts_read_nart': "National adult reading test (2min)",
    'cts_sdmt': "Symbol digits modality test (5min)",
    'cts_story': "Logical memory I (3min)",
    'cts_stroop_cname': "Stroop color naming (3min)",
    'cts_stroop_wread': "Stroop word reading (3min)",
    'cts_wli': "Word list: immediate (1min)",
    'cts_wlii': "Word list: delayed (1min)",
    'cts_wliii': "Word list: recognition (1min)",
    # demographics and medical summary scores
    'age_at_visit': "Age",
    'educ': "Years of education",
    'med_con_sum_cum': "Medical conditions sum",
    'vasc_3dis_sum': "Vascular disease burden",
    'vasc_risks_sum': "Vascular disease risk",
}

In [11]:
# Print the ROS-vs-MAP comparison table as LaTeX rows, one row per feature and a
# bare "\\" line after every printed feature group.
#
# * Features listed in `t_test_vars` are compared with `stats.ttest_ind` (SciPy
#   defaults) and reported as "mean \pm std" per cohort.
# * Every other feature is treated as categorical: a chi-square test is run on the
#   (observed value x cohort) contingency table and the row reports, per cohort,
#   either percentages or raw counts depending on `percentages`.
#
# All LaTeX fragments are raw strings so that "\p", "\c", "\_" and "\%" are not
# parsed as (invalid) Python escape sequences; the printed text is unchanged.

# True: report categorical features as column percentages; False: as raw counts.
percentages = True

to_keep_groups = ["Demographics", "Global Cognition", "Medical history/lifestyle factors", "Diagnosis"]
# to_keep_groups = [x[0] for x in FEATURE_GROUPS]


def _significance_stars(p_value):
    """Return the star annotation of a p-value: '***' <.001, '**' <.01, '*' <.05, else ''."""
    if p_value < .001:
        return "***"
    if p_value < .01:
        return "**"
    if p_value < .05:
        return "*"
    return ""


def _latex_escape(text):
    """Escape underscores so a raw column name can be typeset in LaTeX."""
    return text.replace("_", r"\_")


def _slash_joined_ints(values):
    """Join numeric values with '/' after casting them to integers."""
    return "/".join(np.asarray(values).astype(int).astype(str))


def _slash_joined_percentages(counts):
    """Join the shares (in percent, one decimal) of a count vector with '/'."""
    shares = np.round(counts / counts.sum() * 100, 1)
    return "/".join([r"%s\%%" % share for share in shares.astype(str)])


for groupname, feat_group in FEATURE_GROUPS:
    if groupname not in to_keep_groups:
        continue

    for feat in feat_group:

        # COMPARING ROS VS MAP
        # NOTE(review): the study labels are matched including a trailing space;
        # presumably that is how they are stored in the CSV -- confirm.
        ROS = orig_data_features.loc[orig_data_features["study"] == "ROS ", feat]
        MAP = orig_data_features.loc[orig_data_features["study"] == "MAP ", feat]

        if feat in t_test_vars:
            t, p = stats.ttest_ind(ROS.dropna().values, MAP.dropna().values)
            p_stars = _significance_stars(p)
            # Fall back to the escaped column name when no display label is defined
            # (e.g. for the composite scores), instead of failing with a KeyError.
            row_label = var_names.get(feat, _latex_escape(feat))

            print(r"%s & $t=%.2f^{%s}$  & $%.2f \pm %.2f$ &  $%.2f \pm %.2f$  \\ "
                  % (row_label, t, p_stars, ROS.mean(), ROS.std(), MAP.mean(), MAP.std()))
            continue

        # Counts of every observed (non-missing) value per cohort.
        ROS_valcounts = ROS.value_counts().to_dict()
        MAP_valcounts = MAP.value_counts().to_dict()

        # Sorted union of the values seen in either cohort; a value that is absent
        # from one cohort gets a count of 0 there.
        all_vals = np.unique(np.hstack([ROS.dropna().values, MAP.dropna().values]))

        # Contingency table (shape: values observed x groups), columns = [ROS, MAP].
        contingency_table = np.array(
            [[ROS_valcounts.get(val, 0), MAP_valcounts.get(val, 0)] for val in all_vals]
        )

        chi2_stat, p, dof, ex = stats.chi2_contingency(contingency_table)
        p_stars = _significance_stars(p)
        feat_label = _latex_escape(feat)

        if percentages and len(all_vals) < 3:
            # Fewer than three observed values: report only the share of rows coded 1
            # (0% when the value 1 never occurs).
            ros_pct = ROS_valcounts.get(1, 0) / contingency_table[:, 0].sum() * 100
            map_pct = MAP_valcounts.get(1, 0) / contingency_table[:, 1].sum() * 100

            print(r"%s & $\chi^2=%.2f^{%s}$  & $%.1f\%%$ &  $%.1f\%%$  \\ "
                  % (feat_label, chi2_stat, p_stars, ros_pct, map_pct))
            continue

        # Multi-level feature (or raw-count mode): list every level and its
        # share/count per cohort, separated by '/'.
        outcomes_str = _slash_joined_ints(all_vals)
        if percentages:
            ros_str = _slash_joined_percentages(contingency_table[:, 0])
            map_str = _slash_joined_percentages(contingency_table[:, 1])
        else:
            ros_str = _slash_joined_ints(contingency_table[:, 0])
            map_str = _slash_joined_ints(contingency_table[:, 1])

        print(r"%s ($%s$) & $\chi^2=%.2f^{%s}$  & $%s$ &  $%s$  \\ "
              % (feat_label, outcomes_str, chi2_stat, p_stars, ros_str, map_str))

    # Blank LaTeX row between feature groups.
    print(r"\\")

Age & $t=-22.33^{***}$  & $79.70 \pm 6.75$ &  $83.00 \pm 7.01$  \\ 
msex & $\chi^2=72.35^{***}$  & $31.9\%$ &  $23.6\%$  \\ 
Years of education & $t=44.74^{***}$  & $18.08 \pm 3.38$ &  $14.92 \pm 3.08$  \\ 
race ($1/2/3/6$) & $\chi^2=24.29^{***}$  & $93.1\%/6.1\%/0.4\%/0.3\%$ &  $95.5\%/4.3\%/0.1\%/0.1\%$  \\ 
spanish & $\chi^2=14.41^{***}$  & $3.7\%$ &  $2.2\%$  \\ 
apoe\_4count ($0/1/2$) & $\chi^2=10.06^{**}$  & $76.4\%/22.2\%/1.4\%$ &  $79.3\%/19.5\%/1.2\%$  \\ 
\\
Mini-mental state exam (5-10min) & $t=8.87^{***}$  & $28.40 \pm 1.77$ &  $28.03 \pm 2.10$  \\ 
\\
dcfdx ($1/2/3$) & $\chi^2=6.16^{*}$  & $80.9\%/18.5\%/0.5\%$ &  $78.8\%/20.7\%/0.5\%$  \\ 
Medical conditions sum & $t=-6.42^{***}$  & $1.64 \pm 1.17$ &  $1.80 \pm 1.16$  \\ 
Vascular disease burden & $t=2.23^{*}$  & $0.48 \pm 0.68$ &  $0.45 \pm 0.67$  \\ 
Vascular disease risk & $t=-26.69^{***}$  & $0.71 \pm 0.75$ &  $1.15 \pm 0.80$  \\ 
cancer\_cum & $\chi^2=1.24^{}$  & $39.4\%$ &  $40.7\%$  \\ 
claudication\_cum & $\chi^2=