In [1]:
import csv
import numpy as np
import random

import pandas as pd
from ast import literal_eval

import sklearn
from sklearn.linear_model import LogisticRegression
import scipy

import statsmodels.api as sm
import patsy 


# UCSD Data 

In [2]:
# import cleaned fluency data
# original data (before cleaning) downloaded from https://osf.io/j6qea/ (Zemla & Austerweil, 2019)
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
UCSD_FLUENCY_CSV = "/Users/sam/neuroecon/MBC/fluency_data/UCSD_Zemla_Data/ucsd_fluency_cleaned.csv"

# The 'item' column is converted from its text form into a Python object for every row.
# ast.literal_eval only accepts Python literals, so (unlike eval) text in the CSV
# can never be executed as code.
all_ucsd_lists_table = pd.read_csv(
    UCSD_FLUENCY_CSV,
    converters={'item': literal_eval})
all_ucsd_lists_table.head()

Unnamed: 0,id,listnum,group,SEX,EDUC,DRS,MMSE,category,rank,item
0,5,4,NC,2,16,144,30,animals,1.0,"[armadillo, horse, cow, goat, pig, rabbit, mou..."
1,6,4,NC,1,19,144,30,animals,1.0,"[mouse, rat, raccoon, dog, cat, elephant, cow,..."
2,93,1,NC,2,14,143,30,animals,1.0,"[dog, cat, ape, elephant, monkey, chimpanzee, ..."
3,93,2,NC,2,14,138,26,animals,1.0,"[owl, rhinoceros, buffalo, horse, dog, cat, el..."
4,93,3,NC,2,14,143,30,animals,1.0,"[dog, rat, camel, horse, cat, ferret, hamster,..."


In [3]:
# check how many items are not in animal category
# animals list adapted from https://gist.github.com/atduskgreg/3cf8ef48cb0d29cf151bedad81553a54#file-animals-txt
with open('animal.txt', 'r') as f:
    animal_list = [line.strip() for line in f]

animal_list_lowercase = [a.lower() for a in animal_list]

# set copy of the vocabulary: identical membership answers, constant-time lookups
animal_lookup = set(animal_list_lowercase)

# every produced item (across all fluency lists) that is missing from the animal vocabulary;
# the lists are read from the 'item' column by name rather than by "last column" position
not_in_animal_list = [
    produced_item
    for fluency_list in all_ucsd_lists_table['item']
    for produced_item in fluency_list
    if produced_item not in animal_lookup
]
print('if you have the right table, the the following number should be 208:', len(not_in_animal_list))

if you have the right table, the the following number should be 208: 208


In [4]:
# diagnostic-group label of every fluency list (one label per row)
diagnosis_group = all_ucsd_lists_table['group']

# tables split by diagnosis
NC_table = all_ucsd_lists_table[diagnosis_group == 'NC']
ProbAD_table = all_ucsd_lists_table[diagnosis_group == 'ProbAD']
MCI_table = all_ucsd_lists_table[diagnosis_group == 'MCI']
MCI4a_table = all_ucsd_lists_table[diagnosis_group == 'MCI-4a']
MCI4b_table = all_ucsd_lists_table[diagnosis_group == 'MCI-4b']
MCI4c_table = all_ucsd_lists_table[diagnosis_group == 'MCI-4c']
MCI4d_table = all_ucsd_lists_table[diagnosis_group == 'MCI-4d']
POSSIBLEAD_table = all_ucsd_lists_table[diagnosis_group == 'POSSIBLEAD']
all_MCI_table = all_ucsd_lists_table[
    diagnosis_group.isin(['MCI', 'MCI-4a', 'MCI-4b', 'MCI-4c', 'MCI-4d'])]


# IDs split by diagnosis (each ID listed once per group)
NC_ids = list(set(NC_table['id']))
ProbAD_ids = list(set(ProbAD_table['id']))
MCI_ids = list(set(MCI_table['id']))
all_MCIs_ids = list({
    *MCI_table['id'], *MCI4a_table['id'],
    *MCI4b_table['id'], *MCI4c_table['id'],
    *MCI4d_table['id']})
POSSIBLEAD_ids = list(set(POSSIBLEAD_table['id']))

# Combined IDs: participants that appear under more than one diagnosis
NC_and_ProbAD_ids = [sid for sid in NC_ids if sid in ProbAD_ids]
NC_and_MCIs_ids = [sid for sid in NC_ids if sid in all_MCIs_ids]
NC_MCIs_and_ProbAD_ids = [sid for sid in NC_and_ProbAD_ids if sid in all_MCIs_ids]
NC_and_POSSIBLEAD_ids = [sid for sid in NC_ids if sid in POSSIBLEAD_ids]
NC_MCIs_POSSIBLEAD_ProbAD_ids = [sid for sid in NC_and_POSSIBLEAD_ids if sid in NC_MCIs_and_ProbAD_ids]

# demographics 

### number of unique IDs (participants) total

In [5]:
# number of participants (distinct values in the 'id' column)
len(set(all_ucsd_lists_table['id']))

139

In [6]:
# how many subjects have both healthy control & AD diagnosis
n_NC_and_ProbAD = len(NC_and_ProbAD_ids)
print(n_NC_and_ProbAD)

19


### number of unique IDs, by diagnosis

In [7]:
# num of subjects total = num rows
# (one row per id, one column per listnum, cells hold that fluency list)
pivot_byID_list_table = all_ucsd_lists_table.pivot(index='id', columns='listnum', values='item')


# number of distinct non-null values of every column within each diagnostic group.
# 'item' is dropped first: it holds Python lists, which are unhashable and therefore
# cannot be counted as unique values.
pivot_byGroup_IDcount_table = (
    all_ucsd_lists_table
    .drop(columns='item')
    .groupby('group')
    .nunique()
)

# id column gives num of subjects per group
diag_table = pivot_byGroup_IDcount_table[['id']]
diag_table.rename(columns={'id': 'count'})

Unnamed: 0_level_0,count
group,Unnamed: 1_level_1
AMNESTICSYNDROME,1
ATRISKFORAD,10
ATRISKFORMID,2
DLB,1
FTD,1
ImpairednoMCI,3
LEWYBODYVARNT,4
MCI,8
MCI-4a,2
MCI-4b,3


### number of unique IDs, by sex

In [8]:
# SEX code of every fluency list (row); code 1 is reported as male, code 2 as female
sex_codes = all_ucsd_lists_table['SEX']

Sex1_table = all_ucsd_lists_table[sex_codes == 1]
Sex1_IDs = list(set(Sex1_table['id']))
n_male = len(Sex1_IDs)
print(n_male, 'male')

Sex2_table = all_ucsd_lists_table[sex_codes == 2]
Sex2_IDs = list(set(Sex2_table['id']))
n_female = len(Sex2_IDs)
print(n_female, 'female')

# share of female participants among those coded 1 or 2
print(n_female/(n_male+n_female)*100, '% female')

56 male
83 female
59.71223021582733 % female


### number of fluency lists, by diagnosis

In [9]:
# total number of fluency lists (one list per table row)
all_ucsd_lists_table.shape[0]

1167

In [10]:
#  number of fluency lists by diagnostic group
# the total is read from the table itself so the "other" count stays correct if the data change
n_lists_total = len(all_ucsd_lists_table)
n_lists_NC = len(NC_table)
n_lists_ProbAD = len(ProbAD_table)

print(n_lists_NC, 'Healthy control lists')
print(n_lists_ProbAD, 'ProbAD lists')
print(n_lists_total - n_lists_NC - n_lists_ProbAD, 'other diagnoses lists')

785 Healthy control lists
282 ProbAD lists
100 other diagnoses lists


### MMSE scores, by diagnostic group

In [11]:
def MMSE(table):
    """
    collects the MMSE value of every row of a table
    ---
    Parameters:
    table: a DataFrame with an 'MMSE' column (one value is read per row, in row order)
    ---
    Returns:
    a tuple (mean of the collected values, list of the collected values)
    """
    scores = [table.iloc[row_pos]['MMSE'] for row_pos in range(len(table))]
    return np.mean(scores), scores

# NOTE(review): the tables hold one row per fluency list, so a participant with several
# lists contributes several MMSE values to the means and to the t-test -- confirm intended.
NC_mean_MMSE, NC_MMSE_values = MMSE(NC_table)
ProbAD_mean_MMSE, ProbAD_MMSE_values = MMSE(ProbAD_table)

print(NC_mean_MMSE, 'mean MMSE for healthy controls')
print(ProbAD_mean_MMSE, 'mean MMSE for ProbAD')

print('t-test results (healthy controls vs ProbAD):')
print(scipy.stats.ttest_ind(NC_MMSE_values, ProbAD_MMSE_values))


28.94904458598726 mean MMSE for healthy controls
22.20921985815603 mean MMSE for ProbAD
t-test results (healthy controls vs ProbAD):
Ttest_indResult(statistic=26.33098010046977, pvalue=4.350916004306748e-118)


# analyses

### relevant functions

In [12]:
def dist_from_diag_score(seqx, seqy):
    """
    calculates "distance from diagonal" score for two sequences

    Every pair of positions (k in seqx, j in seqy) holding equal items is a dot
    of the dot plot; the score is the sum of |j - k| over all dots, i.e. how far
    the shared items are from occupying the same position in both sequences.
    Entries whose string form is 'nan' are removed before positions are counted.
    ---
    Parameters:
    seqx: the first sequence (x-axis of the dot plot)
    seqy: the second sequence (y-axis of the dot plot)
    ---
    Returns:
    distance from diagonal score (an int; 0 if the sequences share no items)
    """
    seqx_noblank = [e for e in seqx if str(e) != 'nan']
    seqy_noblank = [f for f in seqy if str(f) != 'nan']

    score = 0
    for j, item_y in enumerate(seqy_noblank):
        for k, item_x in enumerate(seqx_noblank):
            if item_x == item_y:
                # a dot at (k, j) lies |j - k| cells away from the main diagonal
                score += abs(j - k)
    return score

In [13]:
def num_intrusions(list_i, category_list_of_items):
    """
    counts the intrusions in a list, i.e. the entries that do not belong to the
    category (the category is given as a collection of its accepted items);
    an intrusion that occurs several times is counted every time
    ---
    Parameters:
    list_i: the list to check for intrusions
    category_list_of_items: the items considered correct for the category
    ---
    Returns:
    number of entries of list_i that are not in category_list_of_items
    """
    intrusion_count = 0
    for candidate in list_i:
        if candidate in category_list_of_items:
            continue
        intrusion_count += 1
    return intrusion_count
    
def num_reps(list_i):
    """
    counts the repeated entries of a list: every occurrence of an item after
    its first one counts as one repeat
    ---
    Parameters:
    list_i: the list to check for repeats
    ---
    Returns:
    number of repeated entries
    """
    already_seen = set()
    repeat_count = 0
    for entry in list_i:
        if entry in already_seen:
            repeat_count += 1
        else:
            already_seen.add(entry)
    return repeat_count

        
def remove_reps(seq):
    """
    drops repeated items from a list while keeping the original order
    ---
    Parameters:
    seq: the list to remove repeats from
    ---
    Returns:
    a new list in which only the first occurrence of each value is kept,
    in the order those first occurrences appear in seq
    """
    first_occurrences = []
    for element in seq:
        if element in first_occurrences:
            # already kept an earlier copy of this value
            continue
        first_occurrences.append(element)
    return first_occurrences

In [14]:
# basic dot plot fxns 

def delta(x, y):
    """Return 1 when x equals y and neither of them is the string 'nan', otherwise 0."""
    if x == y and not (x == 'nan' or y == 'nan'):
        return 1
    return 0


def M(seq1, seq2, i, j, k):
    """
    Count position-wise matches between the window of seq1 that starts at i and
    the window of seq2 that starts at j, both of length k (shorter at the end
    of a sequence, in which case only the overlapping positions are compared).
    """
    window1 = seq1[i:i+k]
    window2 = seq2[j:j+k]
    matches = 0
    for a, b in zip(window1, window2):
        matches += delta(a, b)
    return matches


def makeMatrix(seq1, seq2, k):
    """
    Build the dot-plot matrix of two sequences for window length k:
    entry [i][j] is M(seq1, seq2, i, j, k); rows follow seq1, columns follow seq2.
    """
    row_count = len(seq1) - k + 1
    col_count = len(seq2) - k + 1
    matrix = []
    for i in range(row_count):
        row = []
        for j in range(col_count):
            row.append(M(seq1, seq2, i, j, k))
        matrix.append(row)
    return matrix

### preparing table for analysis

In [15]:
# make table with just healthy (normal) controls & ProbAD participants, removing other diagnoses
# .copy() so that the columns added below go onto an independent frame rather than
# onto a filtered selection of all_ucsd_lists_table
NC_ProbAD_table = all_ucsd_lists_table[
    all_ucsd_lists_table['group'].isin(['NC', 'ProbAD'])].copy()

# binary diagnosis code per row: 0 = NC (healthy control), 1 = ProbAD
# (the filter above guarantees that every row is one of the two)
NC_or_ProbAD_diag = NC_ProbAD_table['group'].map({'NC': 0, 'ProbAD': 1}).tolist()

# number of items in each fluency list (repeats included)
NC_or_ProbAD_listlen = [len(items) for items in NC_ProbAD_table['item']]

# binary_diag goes in as the 4th column, listlen as the last column
NC_ProbAD_table.insert(3, "binary_diag", NC_or_ProbAD_diag)
NC_ProbAD_table.insert(len(NC_ProbAD_table.columns), "listlen", NC_or_ProbAD_listlen)
NC_ProbAD_table.head()

Unnamed: 0,id,listnum,group,binary_diag,SEX,EDUC,DRS,MMSE,category,rank,item,listlen
0,5,4,NC,0,2,16,144,30,animals,1.0,"[armadillo, horse, cow, goat, pig, rabbit, mou...",19
1,6,4,NC,0,1,19,144,30,animals,1.0,"[mouse, rat, raccoon, dog, cat, elephant, cow,...",22
2,93,1,NC,0,2,14,143,30,animals,1.0,"[dog, cat, ape, elephant, monkey, chimpanzee, ...",20
3,93,2,NC,0,2,14,138,26,animals,1.0,"[owl, rhinoceros, buffalo, horse, dog, cat, el...",19
4,93,3,NC,0,2,14,143,30,animals,1.0,"[dog, rat, camel, horse, cat, ferret, hamster,...",19


In [16]:
# taking only the first two lists provided for everyone
# --> one entry for everyone, except if in both diagnostic groups, then get one entry per diagnosis
#
# Rows are scanned in table order and each row is compared with the row right after it.
# The first adjacent pair sharing both 'id' and 'binary_diag' yields one output row;
# later pairs of the same run are skipped until the id or the diagnosis changes.
# NOTE(review): this assumes the rows of one participant/diagnosis are adjacent and
# already in the intended list order -- confirm against the cleaned CSV.

# seeded generator so that the 'rand_diag' column is reproducible across runs
rand_diag_rng = random.Random(0)

# set copy of the animal vocabulary: same membership answers as the list, faster lookups
animal_vocabulary = set(animal_list_lowercase)

NC_ProbAD_table_pairedlists_rows5 = []

NC_ProbAD_table5 = NC_ProbAD_table.copy()
firstpair_done = False

# the last row has no successor, so the scan stops one row before the end
for i in range(len(NC_ProbAD_table5) - 1):
    row_i = NC_ProbAD_table5.iloc[i]
    next_row = NC_ProbAD_table5.iloc[i + 1]

    id_i, id_next = row_i['id'], next_row['id']
    bdiag_i, bdiag_next = row_i['binary_diag'], next_row['binary_diag']

    if id_i != id_next or bdiag_i != bdiag_next:
        # the next row starts a new participant/diagnosis run: allow a new pair
        firstpair_done = False
        continue
    if firstpair_done:
        # this run already contributed its first pair
        continue

    list_i = row_i['item']
    list_next = next_row['item']

    # list lengths (repeats included)
    len_i = len(list_i)
    len_next = len(list_next)
    avg_len = np.mean([len_i, len_next])
    len_diff = len_i - len_next

    # proportion of items outside the animal vocabulary, per list
    prop_intrusions_i = num_intrusions(list_i, animal_vocabulary) / len_i
    prop_intrusions_next = num_intrusions(list_next, animal_vocabulary) / len_next

    # proportion of repeated items, per list
    prop_reps_i = num_reps(list_i) / len_i
    prop_reps_next = num_reps(list_next) / len_next

    # the same lists with repeats removed (first occurrence kept)
    reps_removed_i = remove_reps(list_i)
    reps_removed_next = remove_reps(list_next)
    avg_len_reps_removed = np.mean([len(reps_removed_i), len(reps_removed_next)])

    # distance-from-diagonal scores, with and without repeats
    dd_score = dist_from_diag_score(list_i, list_next)
    dd_score_reps_removed = dist_from_diag_score(reps_removed_i, reps_removed_next)

    binary_diag = bdiag_i
    rand = rand_diag_rng.randint(0, 1)  # random 0/1 label

    pairedlist_table_row = [id_i, row_i['SEX'], row_i['EDUC'],
                            np.mean([row_i['DRS'], next_row['DRS']]),
                            np.mean([row_i['MMSE'], next_row['MMSE']]),
                            dd_score, np.sqrt(dd_score), dd_score**2,
                            avg_len, len_diff,
                            np.mean([prop_intrusions_i, prop_intrusions_next]),
                            np.mean([prop_reps_i, prop_reps_next]),
                            len(reps_removed_i), len(reps_removed_next),
                            avg_len_reps_removed,
                            dd_score_reps_removed, np.sqrt(dd_score_reps_removed),
                            binary_diag, rand]
    NC_ProbAD_table_pairedlists_rows5.append(pairedlist_table_row)
    firstpair_done = True


NC_ProbAD_table_pairedlists5 = pd.DataFrame(NC_ProbAD_table_pairedlists_rows5,
                                           columns = ['id', 'SEX', 'EDUC',
                                                      'DRS_avg', 'MMSE_avg',
                                                      'dd_score', 'sqrt_dd_score', 'dd_score^2',
                                                      'avg_listlen','len1-len2',
                                                      'avg_prop_intrusions', 'avg_prop_reps',
                                                      'len1_noreps', 'len2_noreps',
                                                      'avg_listlen_reps_removed',
                                                      'dd_score_reps_removed', 'sqrt_dd_score_reps_removed',
                                                      'binary_diag', 'rand_diag'])
NC_ProbAD_table_pairedlists5[0:5]


Unnamed: 0,id,SEX,EDUC,DRS_avg,MMSE_avg,dd_score,sqrt_dd_score,dd_score^2,avg_listlen,len1-len2,avg_prop_intrusions,avg_prop_reps,len1_noreps,len2_noreps,avg_listlen_reps_removed,dd_score_reps_removed,sqrt_dd_score_reps_removed,binary_diag,rand_diag
0,93,2,14,140.5,28.0,49,7.0,2401,19.5,1,0.0,0.0,20,19,19.5,49,7.0,0,1
1,603,2,12,143.0,29.5,78,8.831761,6084,22.0,-8,0.0,0.0,18,26,22.0,78,8.831761,0,1
2,610,1,12,136.0,28.5,33,5.744563,1089,16.5,3,0.0,0.066667,18,13,15.5,33,5.744563,0,1
3,617,1,16,136.5,29.0,53,7.28011,2809,20.0,-6,0.0,0.051151,16,22,19.0,46,6.78233,0,0
4,618,2,17,140.5,30.0,71,8.42615,5041,21.0,-4,0.0,0.026316,18,23,20.5,71,8.42615,0,0


In [17]:
# any subject with both diagnoses (healthy control/NC and ProbAD) is removed fully
paired_diag = NC_ProbAD_table_pairedlists5['binary_diag']

# ids that have an entry under each diagnosis code (0 = NC, 1 = ProbAD)
ids_with_NC = set(NC_ProbAD_table_pairedlists5.loc[paired_diag == 0, 'id'])
ids_with_ProbAD = set(NC_ProbAD_table_pairedlists5.loc[paired_diag == 1, 'id'])
ids_with_both_diags = ids_with_NC & ids_with_ProbAD

# keep only the rows of subjects seen under a single diagnosis; .copy() gives an
# independent frame and leaves NC_ProbAD_table_pairedlists5 itself untouched
NC_ProbAD_table_pairedlists5_bothdiagsremoved = NC_ProbAD_table_pairedlists5[
    ~NC_ProbAD_table_pairedlists5['id'].isin(ids_with_both_diags)].copy()

In [27]:
# 117 subjects remain
remaining_diag = NC_ProbAD_table_pairedlists5_bothdiagsremoved['binary_diag']
n_remaining = len(NC_ProbAD_table_pairedlists5_bothdiagsremoved)
n_remaining_NC = int((remaining_diag == 0).sum())       # rows coded 0
n_remaining_ProbAD = int((remaining_diag == 1).sum())   # rows coded 1

print(n_remaining, 'total subjects, either healthy control or ProbAD')
print(n_remaining_NC, 'healthy control')
print(n_remaining_ProbAD, 'ProbAD')

117 total subjects, either healthy control or ProbAD
77 healthy control
40 ProbAD


In [19]:
# first five rows of the table without both-diagnosis subjects
NC_ProbAD_table_pairedlists5_bothdiagsremoved.head(5)


Unnamed: 0,id,SEX,EDUC,DRS_avg,MMSE_avg,dd_score,sqrt_dd_score,dd_score^2,avg_listlen,len1-len2,avg_prop_intrusions,avg_prop_reps,len1_noreps,len2_noreps,avg_listlen_reps_removed,dd_score_reps_removed,sqrt_dd_score_reps_removed,binary_diag,rand_diag
0,93,2,14,140.5,28.0,49,7.0,2401,19.5,1,0.0,0.0,20,19,19.5,49,7.0,0,1
1,603,2,12,143.0,29.5,78,8.831761,6084,22.0,-8,0.0,0.0,18,26,22.0,78,8.831761,0,1
2,610,1,12,136.0,28.5,33,5.744563,1089,16.5,3,0.0,0.066667,18,13,15.5,33,5.744563,0,1
3,617,1,16,136.5,29.0,53,7.28011,2809,20.0,-6,0.0,0.051151,16,22,19.0,46,6.78233,0,0
4,618,2,17,140.5,30.0,71,8.42615,5041,21.0,-4,0.0,0.026316,18,23,20.5,71,8.42615,0,0


## comparing logistic regression models: likelihood ratio (LR) test

In [20]:
# model 1 (m1): logistic regression predicting diagnosis from
#   (1) avg_listlen_reps_removed: average length of the lists without repeats
#   (2) avg_prop_reps: average proportion of repeats (number of repeats / length of list)
# '*' in the formula adds both main effects and their interaction
m1_formula = 'binary_diag ~ avg_listlen_reps_removed * avg_prop_reps'

y, X = patsy.dmatrices(
    m1_formula,
    data=NC_ProbAD_table_pairedlists5_bothdiagsremoved,
    return_type='dataframe',
)
model = sm.Logit(endog=y, exog=X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.272075
         Iterations 9


0,1,2,3
Dep. Variable:,binary_diag,No. Observations:,117.0
Model:,Logit,Df Residuals:,113.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 29 Sep 2021",Pseudo R-squ.:,0.5764
Time:,16:12:24,Log-Likelihood:,-31.833
converged:,True,LL-Null:,-75.146
Covariance Type:,nonrobust,LLR p-value:,1.161e-18

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.0778,2.545,3.174,0.002,3.090,13.066
avg_listlen_reps_removed,-0.5966,0.163,-3.655,0.000,-0.917,-0.277
avg_prop_reps,10.7514,50.891,0.211,0.833,-88.993,110.495
avg_listlen_reps_removed:avg_prop_reps,0.8240,3.263,0.253,0.801,-5.571,7.219


In [21]:
# model 2 (m2): logistic regression predicting diagnosis from
#   (1) avg_listlen_reps_removed: average length of the lists without repeats
#   (2) avg_prop_reps: average proportion of repeats (number of repeats / length of list)
#   (3) dd_score_reps_removed: distance from diagonal score of the lists without repeats
# '*' in the formula adds all main effects plus every two- and three-way interaction
m2_formula = 'binary_diag ~ avg_listlen_reps_removed * avg_prop_reps * dd_score_reps_removed'

y, X = patsy.dmatrices(
    m2_formula,
    data=NC_ProbAD_table_pairedlists5_bothdiagsremoved,
    return_type='dataframe',
)
model = sm.Logit(endog=y, exog=X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.212517
         Iterations 11


0,1,2,3
Dep. Variable:,binary_diag,No. Observations:,117.0
Model:,Logit,Df Residuals:,109.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 29 Sep 2021",Pseudo R-squ.:,0.6691
Time:,16:12:24,Log-Likelihood:,-24.865
converged:,True,LL-Null:,-75.146
Covariance Type:,nonrobust,LLR p-value:,8.253e-19

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.3520,3.790,-0.093,0.926,-7.781,7.077
avg_listlen_reps_removed,0.0660,0.242,0.273,0.785,-0.409,0.541
avg_prop_reps,209.8265,112.835,1.860,0.063,-11.327,430.980
avg_listlen_reps_removed:avg_prop_reps,-13.1528,7.300,-1.802,0.072,-27.460,1.155
dd_score_reps_removed,0.5126,0.319,1.607,0.108,-0.113,1.138
avg_listlen_reps_removed:dd_score_reps_removed,-0.0390,0.020,-1.931,0.054,-0.079,0.001
avg_prop_reps:dd_score_reps_removed,-8.7475,4.818,-1.816,0.069,-18.190,0.695
avg_listlen_reps_removed:avg_prop_reps:dd_score_reps_removed,0.6159,0.290,2.120,0.034,0.047,1.185


In [22]:
# The likelihood ratio test:
#     LR = 2*(LL(m2)-LL(m1)), compared against a chi-squared distribution whose
#     degrees of freedom are the number of extra parameters in m2
from scipy.stats import chi2


def _fit_logit(formula, data):
    """Fit a logistic regression for a patsy formula and return the results (no fit log printed)."""
    endog, exog = patsy.dmatrices(formula, data, return_type='dataframe')
    return sm.Logit(endog, exog).fit(disp=0)


# Both models are refit here so the statistic uses the full-precision log-likelihoods
# of these exact models instead of numbers copied by hand from printed summaries.
result_m1 = _fit_logit(
    'binary_diag ~ avg_listlen_reps_removed * avg_prop_reps',
    NC_ProbAD_table_pairedlists5_bothdiagsremoved)
result_m2 = _fit_logit(
    'binary_diag ~ avg_listlen_reps_removed * avg_prop_reps * dd_score_reps_removed',
    NC_ProbAD_table_pairedlists5_bothdiagsremoved)

LR = 2*(result_m2.llf - result_m1.llf)
df = int(result_m2.df_model - result_m1.df_model)
print('LR:', LR)  # likelihood ratio test statistic (distributed chi-squared)
print(df)

p = chi2.sf(LR, df)
# the rounded p-value itself is printed, so it is labelled '=' rather than '<'
print('p =', round(p, 4))


LR: 13.936
4
p < 0.0075


## comparing means of metrics between controls & ProbAD (t-tests)

### distance from diagonal (dd) score

In [23]:
# split the distance-from-diagonal scores (repeats removed) by diagnosis code
paired_table = NC_ProbAD_table_pairedlists5_bothdiagsremoved
is_control = paired_table['binary_diag'] == 0
is_ProbAD = paired_table['binary_diag'] == 1

NC_dd_scores = paired_table.loc[is_control, 'dd_score_reps_removed'].astype(float).tolist()
AD_dd_scores = paired_table.loc[is_ProbAD, 'dd_score_reps_removed'].astype(float).tolist()

print('mean control score:', np.mean(NC_dd_scores))
print('mean ProbAD score:', np.mean(AD_dd_scores))
print('t-test:')
# default ttest_ind settings, i.e. equal variances assumed
print(scipy.stats.ttest_ind(NC_dd_scores, AD_dd_scores))

mean control score: 43.532467532467535
mean ProbAD score: 14.525
t-test:
Ttest_indResult(statistic=6.281387042066488, pvalue=6.175142636270085e-09)


### average length of list score (with repeats removed)

In [24]:
# avg listlen (reps removed); for list pairs
# one value per subject: the mean length of that subject's two repeat-free lists
paired_table = NC_ProbAD_table_pairedlists5_bothdiagsremoved
is_control = paired_table['binary_diag'] == 0
is_ProbAD = paired_table['binary_diag'] == 1

NC_avg_listlen = paired_table.loc[is_control, 'avg_listlen_reps_removed'].astype(float).tolist()
AD_avg_listlen = paired_table.loc[is_ProbAD, 'avg_listlen_reps_removed'].astype(float).tolist()

print('mean control score:', np.mean(NC_avg_listlen))
print('mean ProbAD score:', np.mean(AD_avg_listlen))
print('t-test:')
# default ttest_ind settings, i.e. equal variances assumed
print(scipy.stats.ttest_ind(NC_avg_listlen, AD_avg_listlen))

mean control score: 19.551948051948052
mean ProbAD score: 12.5375
t-test:
Ttest_indResult(statistic=10.3855671836602, pvalue=3.181543813024645e-18)


In [25]:
# individual listlens (reps removed); not paired
# each subject contributes two values (len1_noreps, then len2_noreps), in table row order
# NOTE(review): the two lengths of one subject enter the t-test as separate observations,
# and NC_avg_listlen / AD_avg_listlen replace the per-subject averages of the same name.
paired_table = NC_ProbAD_table_pairedlists5_bothdiagsremoved
is_control = paired_table['binary_diag'] == 0
is_ProbAD = paired_table['binary_diag'] == 1
noreps_length_columns = ['len1_noreps', 'len2_noreps']

# row-major flattening keeps the (len1, len2) pair of each subject adjacent
NC_avg_listlen = paired_table.loc[is_control, noreps_length_columns].to_numpy(dtype=float).ravel().tolist()
AD_avg_listlen = paired_table.loc[is_ProbAD, noreps_length_columns].to_numpy(dtype=float).ravel().tolist()

print('mean control score:', np.mean(NC_avg_listlen))
print('mean ProbAD score:', np.mean(AD_avg_listlen))
# standard deviation of the individual list lengths, controls then ProbAD
print(np.std(NC_avg_listlen))
print(np.std(AD_avg_listlen))
print('t-test:')
# equal_var=False: the variances of the two groups are not assumed to be equal
print(scipy.stats.ttest_ind(NC_avg_listlen, AD_avg_listlen, equal_var=False))

mean control score: 19.551948051948052
mean ProbAD score: 12.5375
3.8997872199567203
4.092504581549054
t-test:
Ttest_indResult(statistic=12.569798578739801, pvalue=2.41461017137438e-25)


### repeat score: proportion of items in a list that are repeated 

In [26]:
# split the average proportion of repeated items by diagnosis code
paired_table = NC_ProbAD_table_pairedlists5_bothdiagsremoved
is_control = paired_table['binary_diag'] == 0
is_ProbAD = paired_table['binary_diag'] == 1

NC_avg_prop_reps = paired_table.loc[is_control, 'avg_prop_reps'].astype(float).tolist()
AD_avg_prop_reps = paired_table.loc[is_ProbAD, 'avg_prop_reps'].astype(float).tolist()

print('mean control score:', np.mean(NC_avg_prop_reps))
print('mean ProbAD score:', np.mean(AD_avg_prop_reps))
print('t-test:')
# equal_var=False: the variances of the two groups are not assumed to be equal
print(scipy.stats.ttest_ind(NC_avg_prop_reps, AD_avg_prop_reps, equal_var=False))

mean control score: 0.019873185499442378
mean ProbAD score: 0.09032900478565271
t-test:
Ttest_indResult(statistic=-5.034902544868928, pvalue=9.007742547492986e-06)


# 