In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from scipy import stats
import seaborn as sns
import os
import re
from scipy.stats.stats import pearsonr


def remove_redundant(areas):
    """Remove mirrored and repeated pairs from ``areas``, in place.

    Every entry is read through ``item[0]`` and ``item[1]``. Two entries are
    considered the same pair when they hold the same two values in either
    order, so ``(a, b)`` and ``(b, a)`` collapse into whichever comes first.
    Entries pairing a value with itself are dropped: they are their own
    mirror image and carry no pair information.

    Parameters
    ----------
    areas : list
        List of two-element entries whose elements are hashable
        (e.g. tuples of column names).

    Returns
    -------
    list
        The same list object that was passed in, now holding only the first
        occurrence of every unordered pair, in the original order.
    """
    seen = set()
    kept = []

    for item in areas:
        first = item[0]
        second = item[1]

        # a value paired with itself is not a pair of two areas
        if first == second:
            continue

        # frozenset makes (a, b) and (b, a) compare equal
        key = frozenset((first, second))
        if key in seen:
            continue

        seen.add(key)
        kept.append(item)

    # Build the result first and write it back in one go: removing entries
    # while looping over the same list makes the loop skip elements.
    areas[:] = kept
    return areas

def _averaged_column_name(first, second):
    """Return the name of the averaged column for a pair, or None if the pair does not match.

    The rules compare the two names after cutting off a fixed number of
    leading characters (3, 5 or 6).
    NOTE(review): presumably these lengths correspond to hemisphere prefixes
    in the column names -- confirm against the actual data.
    """
    # same name after a 3-character prefix
    if first[3:] == second[3:]:
        return first[3:]

    # same name after a 6-character prefix on one side and a 5-character prefix on the other
    if first[6:] == second[5:] or first[5:] == second[6:]:
        # NOTE(review): first[5:] keeps one extra leading character when
        # `first` is the name with the 6-character prefix, so the resulting
        # name depends on the order of the pair. Left as is to keep the
        # output column names unchanged.
        return first[5:]

    # same name after a 6-character prefix on both sides
    if first[6:] == second[6:]:
        # NOTE(review): first[5:] still contains the last prefix character.
        return 'wm_' + first[5:]

    return None


def average_rh_lh(areas, brain_vol):
    """Average the column pairs in ``areas`` whose names differ only in their prefix.

    For every pair that matches one of the naming rules in
    ``_averaged_column_name`` a new column holding the element-wise mean of
    the two columns is appended and the two source columns are dropped.
    Pairs that match no rule are collected and returned unchanged.

    Parameters
    ----------
    areas : iterable
        Entries whose first two elements are column names of ``brain_vol``.
    brain_vol : pandas.DataFrame
        Table holding the columns named in ``areas``. It is not modified.

    Returns
    -------
    tuple
        ``(areas_diff, brain_vol)``: the list of pairs that were not
        averaged, and a new DataFrame with the averaged columns.

    Raises
    ------
    KeyError
        If a matching pair names a column that is not (or no longer) present,
        e.g. because an earlier pair already replaced it.
    """
    # work on a copy so the caller's frame does not silently gain a column
    brain_vol = brain_vol.copy()

    # pairs that differ in more than the prefix
    areas_diff = []

    for item in areas:
        first = item[0]
        second = item[1]

        new_name = _averaged_column_name(first, second)
        if new_name is None:
            areas_diff.append(item)
            continue

        # column arithmetic instead of a row-by-row apply
        brain_vol[new_name] = (brain_vol[first] + brain_vol[second]) / 2
        brain_vol = brain_vol.drop(columns=[first, second])

    return areas_diff, brain_vol

In [None]:
# load data BEAST
data = pd.read_excel("BEAST_s_round.xlsx")

# lookup table of playerNr and subjectID: rows without a subjectID are
# dropped, and repeated (playerNr, subjectID) rows are collapsed so that
# merging on this table later cannot multiply participants
subid = (
    data[['playerNr', 'subjectID']]
    .dropna(subset = ['subjectID'])
    .drop_duplicates()
)

# drop trials without an s_round
data = data.dropna(subset = ['s_round'])

# keep only trials with 0 <= s_round <= 1; the explicit copy lets later
# cells add columns without pandas warning about writing to a slice
data = data[data.s_round.between(0, 1)].copy()
data

In [None]:
# signed and absolute difference between the number of animals and the
# first estimate, plus the absolute difference for the revised estimate
# (column arithmetic instead of a row-by-row apply; assign returns a new
# frame, so nothing is written into a filtered slice)
data = data.assign(
    error_estimate = data['nAnimals'] - data['estimate'],
    abs_error_estimate = (data['nAnimals'] - data['estimate']).abs(),
    abs_error_estimate_2 = (data['nAnimals'] - data['estimate_revised']).abs(),
)

# drop trials for which either absolute error could not be computed
data = data.dropna(subset = ['abs_error_estimate', 'abs_error_estimate_2'])
data

In [None]:
# maximum reaction time; only trials below it count towards inclusion
threshold = 20
thres_20 = data.loc[data['time_estimate'] < threshold]

# per participant: number of non-missing values in every column
counter_data = thres_20.groupby('playerNr').count()

# minimum number of trials a participant needs to be included
min_trials = 3

# playerNr of the participants that reach the minimum number of trials
included_playerNr = (
    counter_data
    .loc[counter_data['period'] >= min_trials]
    .reset_index()[['playerNr']]
)

# look up the subject IDs that belong to those playerNr
included_subids = pd.merge(included_playerNr, subid).dropna()
included_subids

In [None]:
# calculate S total: per-participant mean of the score columns.
# Only the columns that are needed are averaged; taking the mean of every
# column fails on non-numeric columns in recent pandas versions.
# NOTE(review): the means are taken over all trials in `data`, not only
# over the reaction-time filtered trials in `thres_20` -- confirm this is
# intended.
score_cols = ['s_round', 'abs_error_estimate', 'abs_error_estimate_2']
data_S = data.groupby('playerNr')[score_cols].mean()
data_S = data_S.reset_index()

# only select including subjects
data_S = included_subids[['playerNr', 'subjectID']].merge(data_S, on = 'playerNr')
data_S = data_S[['subjectID'] + score_cols].rename(columns={"s_round" : "S"})
data_S

In [None]:
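# save excel file (disabled; the cell that loads the BEAST data further down
# reads this file, so uncomment the line below to regenerate it)
#data_S.to_excel("BEAST_clean_abs_error_estimate_2.xlsx")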

**ADD BRAIN AND DEMOGRAPHIC DATA**

In [None]:
# load the brain volume table and drop its 'Unnamed: 0' column
data = pd.read_excel('norm_SupraTentorialVolNotVent_DK_WM_vol.xlsx').drop(columns = 'Unnamed: 0')

# region names of the wm-lh-* / wm-rh-* columns that are not needed
wm_regions = [
    "bankssts", "caudalanteriorcingulate", "caudalmiddlefrontal", "cuneus",
    "entorhinal", "fusiform", "inferiorparietal", "inferiortemporal",
    "isthmuscingulate", "lateraloccipital", "lateralorbitofrontal", "lingual",
    "medialorbitofrontal", "middletemporal", "parahippocampal", "paracentral",
    "parsopercularis", "parsorbitalis", "parstriangularis", "pericalcarine",
    "postcentral", "posteriorcingulate", "precentral", "precuneus",
    "rostralanteriorcingulate", "rostralmiddlefrontal", "superiorfrontal",
    "superiorparietal", "superiortemporal", "supramarginal", "frontalpole",
    "temporalpole", "transversetemporal", "insula",
]
wm_columns = ["wm-" + hemi + "-" + region for hemi in ("lh", "rh") for region in wm_regions]

# drop columns that are not brain areas or that we do not need
data = data.drop(columns = ['SupraTentorialVol', 'SupraTentorialVolNotVent'] + wm_columns)
print(len(data.columns))
print(data.columns)

# load demographic data, keeping only the columns used further on
demo = pd.read_excel('demographics.xlsx')[['ID', 'gender', 'study', 'age_mri']]

# merge with demo (on the columns both tables share)
data = pd.merge(data, demo)
data

In [None]:
# check for collinearity
# Only numeric/bool columns can be correlated; selecting them explicitly
# keeps this working on pandas versions where corr() no longer skips
# non-numeric columns (such as 'gender') by itself.
# NOTE(review): the numeric demographic columns merged in earlier (e.g. ID,
# age_mri) take part in the correlation as well -- confirm this is intended.
corr = data.select_dtypes(include = ['number', 'bool']).corr()

# absolute correlations; entries where corr is 1 (a feature with itself) become NaN
abs_cor = corr.abs()[corr<1]
abs_cor_unstack = abs_cor.unstack()
sorted_cor = abs_cor_unstack.sort_values(kind = 'quicksort') # sort on how high correlation is

# select correlation higher than .7 and make list of those areas
high_cor = sorted_cor[sorted_cor > .7]
areas = high_cor.keys()
areas = areas.tolist()

# remove redundant areas (as every correlation is now in there twice)
areas = remove_redundant(areas)
print("The amount of highly correlated brain area pairs:", len(areas))
areas

In [None]:
# average highly correlated brain regions (CC mid anterior and cc central are not averaged);
# the call returns (pairs that were not averaged, table with averaged columns)
averaged = average_rh_lh(areas = areas, brain_vol = data)
av_data = averaged[1]
av_data

In [None]:
# column names left after averaging
av_data.columns

In [None]:
# load the cleaned BEAST scores, rename subjectID to ID and keep the score columns
BEAST = (
    pd.read_excel('BEAST_clean_abs_error_estimate_2.xlsx')
    .rename(columns= {"subjectID" : "ID"})
    .loc[:, ['ID', 'S', 'abs_error_estimate' , 'abs_error_estimate_2']]
)

# combine with the brain data (on the columns both tables share)
merge_data = pd.merge(av_data, BEAST)

# remove age outliers: keep participants younger than mean + 2 standard deviations
upperbound = merge_data['age_mri'].mean() + 2 * merge_data['age_mri'].std()
print("removing age from:", upperbound)
merge_data_clean = merge_data.loc[merge_data['age_mri'] < upperbound]
print("number of participants left:", len(merge_data_clean))

In [None]:
s_use = merge_data_clean.copy()

# binary feature: 1 if the participant's S is above 0, otherwise 0
# (column comparison instead of a row-by-row apply)
s_use['s_use'] = (s_use['S'] > 0).astype('int64')
print("number of pp s_use = 1 : ", len(s_use[s_use.s_use == 1 ]))
print("number of pp s_use = 0 : ", len(s_use[s_use.s_use == 0 ]))

# make dummies of gender; dtype=int keeps them 0/1 on every pandas version
# (newer versions would otherwise produce True/False)
dummy_data = pd.get_dummies(s_use, prefix=['gender'], columns=['gender'], dtype=int)

# drop study column (not useful)
dummy_data = dummy_data.drop(columns = 'study')
print("number of males: ", len(dummy_data[dummy_data.gender_M == 1]))
print("number of females: ", len(dummy_data[dummy_data.gender_F == 1]))
# the two counts below are the participants per gender with s_use == 0
print("number of males s_use = 0 : ", len(dummy_data[(dummy_data.gender_M == 1) & (dummy_data.s_use == 0)]))
print("number of females s_use = 0 : ", len(dummy_data[(dummy_data.gender_F == 1) & (dummy_data.s_use == 0)]))

In [None]:
# keep only participants with S above 0 and drop the columns that are not model inputs
s_data = (
    dummy_data
    .loc[dummy_data['S'] > 0]
    .drop(columns = ['gender_M', 'ID', 's_use', 'abs_error_estimate_2'])
)
s_data

In [None]:
# write the S > 0 model table to an Excel file
s_data.to_excel(excel_writer = "model_data_S_not_zero_absERROR_NotVent.xlsx")

In [None]:
# model table for all participants (no S > 0 filter), written to an Excel file
columns_to_drop = ['gender_M', 'ID', 's_use', 'abs_error_estimate_2']
data_allS = dummy_data.drop(columns = columns_to_drop)
data_allS.to_excel(excel_writer = "model_data_allS_absERROR_NotVent.xlsx")