In [1]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from scipy import stats
import seaborn as sns
import os
import re
from scipy.stats.stats import pearsonr


# function that removes all redundant entries
def remove_redundant(areas):
    for item in areas:
        first = item[0]
        second = item[1]

        # find the inverse of this    
        for item in areas:
            first_inverse = item[0]
            second_inverse = item[1]

            if first == second_inverse and second == first_inverse:
                areas.remove(item)
                
    return areas

# average all high correlated brain areas that are the same but on an other hemisphere
# store other high correlated areas in a list and return them together with the new brain data
def average_rh_lh(areas, brain_vol):
    
    # make a list of al areas that do not only differ in hemisphere 
    areas_diff = []

    for item in areas:
        first = item[0]
        second = item[1]

        # check if only hemisphere is the difference, if so -> average those columns
        if first[3:] == second[3:]:
            brain_vol.loc[:, (first[3:])] = brain_vol.apply(lambda row: (row[first] + row[second])/2, axis = 1)
#             print("Brain pair combined to one:", brain_vol[[first, second, first[3:]]])
            brain_vol = brain_vol.drop(columns = [first, second])
            
        elif first[6:] == second[5:] or first[5:] == second[6:]:
            brain_vol.loc[:, (first[5:])] = brain_vol.apply(lambda row: (row[first] + row[second])/2, axis = 1)
#             print("Brain pair combined to one:", brain_vol[[first, second, first[5:]]])
            brain_vol = brain_vol.drop(columns = [first, second])
            
        elif first[6:] == second[6:]:
            brain_vol.loc[:, 'wm_'+ (first[5:])] = brain_vol.apply(lambda row: (row[first] + row[second])/2, axis = 1)
#             print("Brain pair combined to one:", brain_vol[[first, second, 'wm_'+ (first[5:])]])
            brain_vol = brain_vol.drop(columns = [first, second])
        else:
            areas_diff.append(item)
            
    return areas_diff, brain_vol

  import pandas.util.testing as tm


In [2]:
# load data BEAST
data = pd.read_excel("BEAST_s_round.xlsx")

# create table with subid's and playerNr
subid = data[['playerNr', 'subjectID']]
subid = subid.dropna(subset = ['subjectID'])

# drop trials without an s_round
data = data.dropna(subset = ['s_round'])

# remove data with s_round below 0 or above 1
data = data[data.s_round >= 0]
data = data[data.s_round <= 1]
data

Unnamed: 0.1,Unnamed: 0,playerNr,period,subjectID,nAnimals,estimate,socialInfo,estimate_revised,time_estimate,time_revise,s_round
0,0,1,1,37,93.0,28.0,34.0,30.0,15.0,24.0,0.333333
1,1,1,2,,78.0,34.0,40.0,34.0,49.0,11.0,0.000000
2,2,1,3,,59.0,42.0,50.0,42.0,20.0,12.0,0.000000
3,3,1,4,,74.0,73.0,88.0,75.0,22.0,18.0,0.133333
4,4,1,5,,69.0,63.0,76.0,63.0,8.0,13.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
1155,1155,246,1,348,93.0,78.0,95.0,85.0,10.0,13.0,0.411765
1156,1156,246,2,,78.0,90.0,72.0,80.0,24.0,11.0,0.555556
1157,1157,246,3,,59.0,67.0,54.0,65.0,8.0,6.0,0.153846
1158,1158,246,4,,74.0,70.0,84.0,77.0,8.0,5.0,0.500000


In [3]:
# calculate difference between number of animals and first estimate
data.loc[:, ('error_estimate')] = data.apply(lambda row: row.nAnimals - row.estimate, axis = 1)
data.loc[:, ('abs_error_estimate')] = data.apply(lambda row: abs(row.nAnimals - row.estimate), axis = 1)
data.loc[:, ('abs_error_estimate_2')] = data.apply(lambda row: abs(row.nAnimals - row.estimate_revised), axis = 1)

data = data.dropna(subset = ['abs_error_estimate', 'abs_error_estimate_2'])
data

Unnamed: 0.1,Unnamed: 0,playerNr,period,subjectID,nAnimals,estimate,socialInfo,estimate_revised,time_estimate,time_revise,s_round,error_estimate,abs_error_estimate,abs_error_estimate_2
0,0,1,1,37,93.0,28.0,34.0,30.0,15.0,24.0,0.333333,65.0,65.0,63.0
1,1,1,2,,78.0,34.0,40.0,34.0,49.0,11.0,0.000000,44.0,44.0,44.0
2,2,1,3,,59.0,42.0,50.0,42.0,20.0,12.0,0.000000,17.0,17.0,17.0
3,3,1,4,,74.0,73.0,88.0,75.0,22.0,18.0,0.133333,1.0,1.0,1.0
4,4,1,5,,69.0,63.0,76.0,63.0,8.0,13.0,0.000000,6.0,6.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,1155,246,1,348,93.0,78.0,95.0,85.0,10.0,13.0,0.411765,15.0,15.0,8.0
1156,1156,246,2,,78.0,90.0,72.0,80.0,24.0,11.0,0.555556,-12.0,12.0,2.0
1157,1157,246,3,,59.0,67.0,54.0,65.0,8.0,6.0,0.153846,-8.0,8.0,6.0
1158,1158,246,4,,74.0,70.0,84.0,77.0,8.0,5.0,0.500000,4.0,4.0,3.0


In [5]:
# set threshold for the maximum reaction time and filter
threshold = 20
thres_20 = data[data.time_estimate < threshold]

# count number of trials of participants 
counter_data = thres_20.groupby(['playerNr']).count()

# set minimum number of trials trial
min_trials = 3

# store playerNr that have a minimum number of trials
included_playerNr = counter_data[counter_data.period >= min_trials]
included_playerNr = included_playerNr.reset_index()[['playerNr']]
included_playerNr

# see what subject IDs match those playerNr
included_subids = included_playerNr.merge(subid).dropna()
included_subids

Unnamed: 0,playerNr,subjectID
0,2,18
1,5,19
2,7,36
3,8,22
4,9,50
...,...,...
199,242,342
200,243,345
201,244,346
202,245,347


In [6]:
# calculate S total
data_S = data.groupby(['playerNr']).mean()
data_S = data_S.reset_index()

# only select including subjects
data_S = included_subids[['playerNr', 'subjectID']].merge(data_S)
data_S = data_S[['subjectID', 's_round', 'abs_error_estimate', 'abs_error_estimate_2']].rename(columns={"s_round" : "S"})
data_S

Unnamed: 0,subjectID,S,abs_error_estimate,abs_error_estimate_2
0,18,0.304371,10.00,7.50
1,19,0.370833,8.60,6.80
2,36,0.333333,28.75,21.25
3,22,0.344339,15.80,14.80
4,50,0.444328,20.00,17.40
...,...,...,...,...
199,342,0.213333,12.60,11.00
200,345,0.117143,13.00,11.60
201,346,0.255082,9.20,6.00
202,347,0.000000,24.60,24.60


In [11]:
# save excel file
#data_S.to_excel("BEAST_clean_abs_error_estimate_2.xlsx")

**ADD BRAIN AND DEMOGRAPHIC DATA**

In [4]:
# load data rh
data = pd.read_excel('norm_SupraTentorialVolNotVent_DK_WM_vol.xlsx').drop(columns = 'Unnamed: 0')

# drop columns that are not brain areas or that we do not need
data = data.drop(columns = ['SupraTentorialVol', 'SupraTentorialVolNotVent'])
data = data.drop(columns = ["wm-lh-bankssts","wm-lh-caudalanteriorcingulate","wm-lh-caudalmiddlefrontal","wm-lh-cuneus","wm-lh-entorhinal","wm-lh-fusiform","wm-lh-inferiorparietal","wm-lh-inferiortemporal","wm-lh-isthmuscingulate","wm-lh-lateraloccipital","wm-lh-lateralorbitofrontal","wm-lh-lingual","wm-lh-medialorbitofrontal","wm-lh-middletemporal","wm-lh-parahippocampal","wm-lh-paracentral","wm-lh-parsopercularis","wm-lh-parsorbitalis","wm-lh-parstriangularis","wm-lh-pericalcarine","wm-lh-postcentral","wm-lh-posteriorcingulate","wm-lh-precentral","wm-lh-precuneus","wm-lh-rostralanteriorcingulate","wm-lh-rostralmiddlefrontal","wm-lh-superiorfrontal","wm-lh-superiorparietal","wm-lh-superiortemporal","wm-lh-supramarginal","wm-lh-frontalpole","wm-lh-temporalpole","wm-lh-transversetemporal","wm-lh-insula","wm-rh-bankssts","wm-rh-caudalanteriorcingulate","wm-rh-caudalmiddlefrontal","wm-rh-cuneus","wm-rh-entorhinal","wm-rh-fusiform","wm-rh-inferiorparietal","wm-rh-inferiortemporal","wm-rh-isthmuscingulate","wm-rh-lateraloccipital","wm-rh-lateralorbitofrontal","wm-rh-lingual","wm-rh-medialorbitofrontal","wm-rh-middletemporal","wm-rh-parahippocampal","wm-rh-paracentral","wm-rh-parsopercularis","wm-rh-parsorbitalis","wm-rh-parstriangularis","wm-rh-pericalcarine","wm-rh-postcentral","wm-rh-posteriorcingulate","wm-rh-precentral","wm-rh-precuneus","wm-rh-rostralanteriorcingulate","wm-rh-rostralmiddlefrontal","wm-rh-superiorfrontal","wm-rh-superiorparietal","wm-rh-superiortemporal","wm-rh-supramarginal","wm-rh-frontalpole","wm-rh-temporalpole","wm-rh-transversetemporal","wm-rh-insula"])
print(len(data.keys()))
print(data.keys())

# load demographic data
demo = pd.read_excel('demographics.xlsx')
demo = demo[['ID', 'gender', 'study', 'age_mri']]

# merge with demo
data = data.merge(demo)
data

90
Index(['ID', 'lh_bankssts_volume', 'lh_caudalanteriorcingulate_volume',
       'lh_caudalmiddlefrontal_volume', 'lh_cuneus_volume',
       'lh_entorhinal_volume', 'lh_fusiform_volume',
       'lh_inferiorparietal_volume', 'lh_inferiortemporal_volume',
       'lh_isthmuscingulate_volume', 'lh_lateraloccipital_volume',
       'lh_lateralorbitofrontal_volume', 'lh_lingual_volume',
       'lh_medialorbitofrontal_volume', 'lh_middletemporal_volume',
       'lh_parahippocampal_volume', 'lh_paracentral_volume',
       'lh_parsopercularis_volume', 'lh_parsorbitalis_volume',
       'lh_parstriangularis_volume', 'lh_pericalcarine_volume',
       'lh_postcentral_volume', 'lh_posteriorcingulate_volume',
       'lh_precentral_volume', 'lh_precuneus_volume',
       'lh_rostralanteriorcingulate_volume', 'lh_rostralmiddlefrontal_volume',
       'lh_superiorfrontal_volume', 'lh_superiorparietal_volume',
       'lh_superiortemporal_volume', 'lh_supramarginal_volume',
       'lh_frontalpole_volume', '

Unnamed: 0,ID,lh_bankssts_volume,lh_caudalanteriorcingulate_volume,lh_caudalmiddlefrontal_volume,lh_cuneus_volume,lh_entorhinal_volume,lh_fusiform_volume,lh_inferiorparietal_volume,lh_inferiortemporal_volume,lh_isthmuscingulate_volume,...,Right-Accumbens-area,Right-VentralDC,CC_Posterior,CC_Mid_Posterior,CC_Central,CC_Mid_Anterior,CC_Anterior,gender,study,age_mri
0,1,0.002964,0.002256,0.008136,0.003394,0.002366,0.010357,0.015142,0.011369,0.003066,...,0.000566,0.004580,0.000692,0.000472,0.000384,0.000393,0.000750,F,year 1 KI,19.0
1,4,0.002344,0.001850,0.005794,0.003879,0.002107,0.009607,0.012305,0.009720,0.002845,...,0.000593,0.003867,0.001058,0.000612,0.000702,0.000575,0.000802,F,year 1 KI,23.0
2,5,0.002534,0.002797,0.006291,0.002592,0.002028,0.008790,0.012013,0.012749,0.002957,...,0.000532,0.004620,0.001018,0.000514,0.000468,0.000468,0.000915,F,year 1 KI,19.0
3,8,0.003097,0.001501,0.006358,0.003707,0.001096,0.009258,0.013345,0.010357,0.002457,...,0.000611,0.004016,0.001011,0.000512,0.000524,0.000631,0.000893,M,year 1 KI,18.0
4,12,0.001972,0.002460,0.007522,0.003739,0.001428,0.009609,0.010374,0.008492,0.002886,...,0.000479,0.003734,0.001073,0.000561,0.000450,0.000494,0.000869,F,year 1 Anders,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,341,0.002578,0.001552,0.007039,0.003653,0.002337,0.010593,0.012364,0.010617,0.002702,...,0.000444,0.004446,0.000900,0.000497,0.000431,0.000444,0.000791,M,year 3,22.0
184,342,0.002340,0.001347,0.007087,0.002897,0.002138,0.009919,0.014293,0.011200,0.002929,...,0.000599,0.003765,0.000793,0.000500,0.000440,0.000446,0.000938,M,year 3,20.0
185,344,0.002441,0.002007,0.007249,0.003607,0.001741,0.009982,0.012392,0.011520,0.002999,...,0.000526,0.004238,0.000968,0.000634,0.000527,0.000574,0.000862,M,year 3,21.0
186,345,0.002148,0.002235,0.007215,0.004028,0.001970,0.010835,0.012245,0.010998,0.002869,...,0.000637,0.004363,0.000821,0.000334,0.000338,0.000387,0.000740,M,year 3,22.0


In [5]:
# check for collinearity
corr = data.corr()

# check correlation higher than .7
abs_cor = corr.abs()[corr<1] # remove correlation with same feature (as corr=1)
abs_cor_unstack = abs_cor.unstack()
sorted_cor = abs_cor_unstack.sort_values(kind = 'quicksort') # sort on how high correlation is

# select correlation higher than .7 and make list of those areas
high_cor = sorted_cor[sorted_cor > .7]
areas = high_cor.keys()
areas = areas.tolist()

# remove redundant areas (as every correlation is now in there twice)
areas = remove_redundant(areas)
print("The amount of highly correlated brain area pairs:", len(areas))
areas

The amount of highly correlated brain area pairs: 8


[('rh_precuneus_volume', 'lh_precuneus_volume'),
 ('lh_pericalcarine_volume', 'lh_cuneus_volume'),
 ('CC_Mid_Anterior', 'CC_Central'),
 ('Right-VentralDC', 'Left-VentralDC'),
 ('rh_pericalcarine_volume', 'lh_pericalcarine_volume'),
 ('Right-Hippocampus', 'Left-Hippocampus'),
 ('Left-Putamen', 'Right-Putamen'),
 ('Left-Caudate', 'Right-Caudate')]

In [6]:
# average highly correlated brain regions (CC mid anterior and cc central are not averaged)
averaged = average_rh_lh(areas, data)
av_data = averaged[1]
av_data

Unnamed: 0,ID,lh_bankssts_volume,lh_caudalanteriorcingulate_volume,lh_caudalmiddlefrontal_volume,lh_cuneus_volume,lh_entorhinal_volume,lh_fusiform_volume,lh_inferiorparietal_volume,lh_inferiortemporal_volume,lh_isthmuscingulate_volume,...,CC_Anterior,gender,study,age_mri,precuneus_volume,-VentralDC,pericalcarine_volume,-Hippocampus,Putamen,Caudate
0,1,0.002964,0.002256,0.008136,0.003394,0.002366,0.010357,0.015142,0.011369,0.003066,...,0.000750,F,year 1 KI,19.0,0.010798,0.004498,0.002535,0.004372,0.005117,0.004709
1,4,0.002344,0.001850,0.005794,0.003879,0.002107,0.009607,0.012305,0.009720,0.002845,...,0.000802,F,year 1 KI,23.0,0.009305,0.003978,0.002612,0.003825,0.004903,0.003575
2,5,0.002534,0.002797,0.006291,0.002592,0.002028,0.008790,0.012013,0.012749,0.002957,...,0.000915,F,year 1 KI,19.0,0.010651,0.004404,0.001743,0.004377,0.005053,0.003914
3,8,0.003097,0.001501,0.006358,0.003707,0.001096,0.009258,0.013345,0.010357,0.002457,...,0.000893,M,year 1 KI,18.0,0.010617,0.003842,0.002555,0.003565,0.005305,0.003608
4,12,0.001972,0.002460,0.007522,0.003739,0.001428,0.009609,0.010374,0.008492,0.002886,...,0.000869,F,year 1 Anders,18.0,0.010552,0.003802,0.003304,0.003456,0.004619,0.003857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,341,0.002578,0.001552,0.007039,0.003653,0.002337,0.010593,0.012364,0.010617,0.002702,...,0.000791,M,year 3,22.0,0.010226,0.004725,0.002690,0.004314,0.005037,0.004465
184,342,0.002340,0.001347,0.007087,0.002897,0.002138,0.009919,0.014293,0.011200,0.002929,...,0.000938,M,year 3,20.0,0.010535,0.003965,0.002674,0.003505,0.004694,0.003493
185,344,0.002441,0.002007,0.007249,0.003607,0.001741,0.009982,0.012392,0.011520,0.002999,...,0.000862,M,year 3,21.0,0.009198,0.004387,0.002923,0.003852,0.005311,0.003840
186,345,0.002148,0.002235,0.007215,0.004028,0.001970,0.010835,0.012245,0.010998,0.002869,...,0.000740,M,year 3,22.0,0.010066,0.004288,0.002861,0.003797,0.004984,0.003672


In [7]:
av_data.keys()

Index(['ID', 'lh_bankssts_volume', 'lh_caudalanteriorcingulate_volume',
       'lh_caudalmiddlefrontal_volume', 'lh_cuneus_volume',
       'lh_entorhinal_volume', 'lh_fusiform_volume',
       'lh_inferiorparietal_volume', 'lh_inferiortemporal_volume',
       'lh_isthmuscingulate_volume', 'lh_lateraloccipital_volume',
       'lh_lateralorbitofrontal_volume', 'lh_lingual_volume',
       'lh_medialorbitofrontal_volume', 'lh_middletemporal_volume',
       'lh_parahippocampal_volume', 'lh_paracentral_volume',
       'lh_parsopercularis_volume', 'lh_parsorbitalis_volume',
       'lh_parstriangularis_volume', 'lh_postcentral_volume',
       'lh_posteriorcingulate_volume', 'lh_precentral_volume',
       'lh_rostralanteriorcingulate_volume', 'lh_rostralmiddlefrontal_volume',
       'lh_superiorfrontal_volume', 'lh_superiorparietal_volume',
       'lh_superiortemporal_volume', 'lh_supramarginal_volume',
       'lh_frontalpole_volume', 'lh_temporalpole_volume',
       'lh_transversetemporal_volum

In [15]:
# load beast data and combine with brain data
BEAST = pd.read_excel('BEAST_clean_abs_error_estimate_2.xlsx')
BEAST = BEAST.rename(columns= {"subjectID" : "ID"})
BEAST = BEAST[['ID', 'S', 'abs_error_estimate' , 'abs_error_estimate_2']]
merge_data = av_data.merge(BEAST)
merge_data

# remove age outliers
upperbound = merge_data.age_mri.mean() + 2 * merge_data.age_mri.std()
print("removing age from:", upperbound)
merge_data_clean = merge_data[merge_data.age_mri < upperbound]
print("number of participants left:", len(merge_data_clean))

removing age from: 29.029747243067103
number of participants left: 159


In [16]:
s_use = merge_data_clean.copy()

# make a binary feature if the participant uses social information or not
s_use['s_use'] = s_use.apply(lambda row: 1 if row.S > 0 else 0, axis = 1)
print("number of pp s_use = 1 : ", len(s_use[s_use.s_use == 1 ]))
print("number of pp s_use = 0 : ", len(s_use[s_use.s_use == 0 ]))

# make dummie of gender
dummy_data = pd.get_dummies(s_use, prefix=['gender'], columns=['gender'])

# drop study column (not useful)
dummy_data = dummy_data.drop(columns = 'study')
print("number of males: ", len(dummy_data[dummy_data.gender_M == 1]))
print("number of females: ", len(dummy_data[dummy_data.gender_F == 1]))
print("number of males s_use: ", len(dummy_data[(dummy_data.gender_M == 1) & (dummy_data.s_use == 0)]))
print("number of females s_use: ", len(dummy_data[(dummy_data.gender_F == 1) & (dummy_data.s_use == 0)]))

number of pp s_use = 1 :  141
number of pp s_use = 0 :  18
number of males:  76
number of females:  83
number of males s_use:  10
number of females s_use:  8


In [17]:
# exclude participants who never used social information
s_data = dummy_data[dummy_data.S > 0].drop(columns = ['gender_M', 'ID', 's_use', 'abs_error_estimate_2'])
s_data

Unnamed: 0,lh_bankssts_volume,lh_caudalanteriorcingulate_volume,lh_caudalmiddlefrontal_volume,lh_cuneus_volume,lh_entorhinal_volume,lh_fusiform_volume,lh_inferiorparietal_volume,lh_inferiortemporal_volume,lh_isthmuscingulate_volume,lh_lateraloccipital_volume,...,precuneus_volume,-VentralDC,pericalcarine_volume,-Hippocampus,Putamen,Caudate,S,abs_error_estimate,abs_error_estimate_2,gender_F
0,0.002964,0.002256,0.008136,0.003394,0.002366,0.010357,0.015142,0.011369,0.003066,0.012157,...,0.010798,0.004498,0.002535,0.004372,0.005117,0.004709,0.071429,13.80,12.8,1
1,0.002344,0.001850,0.005794,0.003879,0.002107,0.009607,0.012305,0.009720,0.002845,0.013617,...,0.009305,0.003978,0.002612,0.003825,0.004903,0.003575,0.468056,23.75,19.0,1
2,0.002534,0.002797,0.006291,0.002592,0.002028,0.008790,0.012013,0.012749,0.002957,0.012521,...,0.010651,0.004404,0.001743,0.004377,0.005053,0.003914,0.370714,28.60,25.4,1
3,0.003097,0.001501,0.006358,0.003707,0.001096,0.009258,0.013345,0.010357,0.002457,0.015477,...,0.010617,0.003842,0.002555,0.003565,0.005305,0.003608,0.047222,33.80,33.4,0
4,0.001972,0.002460,0.007522,0.003739,0.001428,0.009609,0.010374,0.008492,0.002886,0.013569,...,0.010552,0.003802,0.003304,0.003456,0.004619,0.003857,0.400928,14.60,13.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.002753,0.002395,0.005985,0.002600,0.001503,0.010023,0.012704,0.012537,0.002785,0.013129,...,0.009620,0.004608,0.002107,0.004487,0.006393,0.004437,0.190540,10.80,8.2,1
158,0.002515,0.001864,0.007071,0.003184,0.002057,0.009190,0.013648,0.012823,0.003270,0.010043,...,0.011233,0.003766,0.002369,0.003947,0.004982,0.003704,0.227739,19.80,17.2,0
159,0.002578,0.001552,0.007039,0.003653,0.002337,0.010593,0.012364,0.010617,0.002702,0.014111,...,0.010226,0.004725,0.002690,0.004314,0.005037,0.004465,0.314423,22.00,18.6,0
160,0.002340,0.001347,0.007087,0.002897,0.002138,0.009919,0.014293,0.011200,0.002929,0.012897,...,0.010535,0.003965,0.002674,0.003505,0.004694,0.003493,0.213333,12.60,11.0,0


In [26]:
# save data to excel
s_data.to_excel("model_data_S_not_zero_absERROR_NotVent.xlsx")

In [20]:
# save data to excel
data_allS = dummy_data.drop(columns = ['gender_M', 'ID', 's_use', 'abs_error_estimate_2'])
data_allS.to_excel("model_data_allS_absERROR_NotVent.xlsx")