# Imports

In [98]:
#import libraries
import pandas as pd
from functools import reduce
import numpy as np
from numpy import ma
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import chi2_contingency, ttest_ind

In [2]:
#ace
#importing 
ksads_df = pd.read_csv("abcd_ptsd01.csv", skiprows=[1]) #KSADS-5-PTSD module - parent
env_parent_df = pd.read_csv("fes02.csv", skiprows=[1]) #environment scale - parent report
env_youth_df = pd.read_csv("abcd_fes01.csv", skiprows=[1]) #environment scale - youth report
fam_his_1_df = pd.read_csv("fhxp102.csv", skiprows=[1]) #family history assessment part 1 
fam_his_2_df = pd.read_csv("fhxp201.csv", skiprows=[1]) #family history assessment part 2
pdem_df = pd.read_csv("pdem02.csv", skiprows=[1]) #demographics survey - parent
crpbi_df = pd.read_table("crpbi01.txt", skiprows=[1]) #CRPBI - youth
par_mon_df = pd.read_csv("pmq01.csv", skiprows=[1]) #parental monitoring survey

#select only timepoints we need
ksads_df = ksads_df.loc[ksads_df['eventname'] == 'baseline_year_1_arm_1']
env_parent_df = env_parent_df.loc[env_parent_df['eventname'] == 'baseline_year_1_arm_1']
env_youth_df = env_youth_df.loc[env_youth_df['eventname'] == 'baseline_year_1_arm_1']
crpbi_df = crpbi_df.loc[crpbi_df['eventname'] == 'baseline_year_1_arm_1']
par_mon_df = par_mon_df.loc[par_mon_df['eventname'] == 'baseline_year_1_arm_1']

#create function that will select the columns from the dataframes needed to calculate the ACE-score
def select_col(df, col_list):
    "takes the dataframe and a list of columns needed to be extracted from that dataframe as list and returns all rows from the selected dataframe"
    df_select = df.loc[:, col_list]
    return df_select

#create lists for the columns that need to be selected
col_ksads = ['subjectkey', 'ksads_ptsd_raw_762_p', 'ksads_ptsd_raw_763_p', 'ksads_ptsd_raw_767_p', 'ksads_ptsd_raw_768_p', 'ksads_ptsd_raw_766_p']
col_env_parent = ['subjectkey', 'fam_enviro3_p', 'fam_enviro6_p']
col_env_youth = ['subjectkey', 'fes_youth_q3', 'fes_youth_q6']
col_fam_his_1 = ['subjectkey', 'famhx_4d_p___1', 'famhx_4d_p___2', 'famhx_4d_p___3', 'famhx_4d_p___4', 'famhx_4d_p___5', 'famhx_4d_p___6', 'famhx_4d_p___7', 'famhx4a_p___1', 'famhx4a_p___2', 'famhx4a_p___3', 'famhx4a_p___4', 'famhx4a_p___5', 'famhx4a_p___6', 'famhx4a_p___7', 'fam_history_6_yes_no']
col_fam_his_2 = ['subjectkey', 'fam_history_13_yes_no']
col_pdem = ['subjectkey', 'demo_prnt_marital_v2']
col_crpbi = ['subjectkey', 'crpbi_parent4_y']
col_par_mon = ['subjectkey', 'parent_monitor_q1_y', 'parent_monitor_q3_y']

#select columns
ksads_2 = select_col(ksads_df, col_ksads)
env_parent_2 = select_col(env_parent_df, col_env_parent)
env_youth_2 = select_col(env_youth_df, col_env_youth)
fam_his_1_2 = select_col(fam_his_1_df, col_fam_his_1)
fam_his_2_2 = select_col(fam_his_2_df, col_fam_his_2)
pdem_2 = select_col(pdem_df, col_pdem)
crpbi_2 = select_col(crpbi_df, col_crpbi)
par_mon_2 = select_col(par_mon_df, col_par_mon)

#for the parent history questionnaires, replace 999 (i don't know) and 7 (refuse to answer) with a zero
fam_his_1_2 = fam_his_1_2.replace([999], 0)
fam_his_1_2 = fam_his_1_2.replace([7], 0)
fam_his_2_2 = fam_his_2_2.replace([999], 0)
fam_his_2_2 = fam_his_2_2.replace([7], 0)

In [3]:
#site location
site = pd.read_csv('abcd_lt01.csv', skiprows=[1])
site = site.loc[site['eventname'] == 'baseline_year_1_arm_1']
site = site.loc[:, ['subjectkey', 'site_id_l']]

In [4]:
#parental income, sex and age
parental_income = pdem_df.loc[:, ['subjectkey', 'demo_comb_income_v2', 'sex', 'interview_age']]

In [5]:
#iq
iq = pd.read_csv('abcd_tbss01.csv', skiprows=[1], low_memory=False)
iq = iq.loc[iq['eventname'] == 'baseline_year_1_arm_1']
iq = iq.loc[:, ['subjectkey', 'nihtbx_fluidcomp_agecorrected']]

In [6]:
#PTSD
dsm_parent = pd.read_csv('abcd_ksad01.csv', skiprows=[1])

#select only t2
dsm_parent_base = dsm_parent.loc[dsm_parent['eventname'] == 'baseline_year_1_arm_1']
dsm_parent_t2 = dsm_parent.loc[dsm_parent['eventname'] == '2_year_follow_up_y_arm_1']

#select columns
ptsd_col = ['subjectkey', 'ksads_21_923_p','ksads_21_924_p', 'ksads_21_921_p', 'ksads_21_922_p']
ptsd_df = dsm_parent_base.loc[:, ptsd_col]
ptsd_df_t2 = dsm_parent_t2.loc[:, ptsd_col]
ptsd_df_t2 = ptsd_df_t2.add_suffix('_t2') #ditinquish column names between base and t2
ptsd_df_t2 = ptsd_df_t2.rename(columns = {'subjectkey_t2':'subjectkey'})

In [7]:
#has participant ever seen mental health professional?
prof_base = pd.read_csv('dibf01.csv', skiprows=[1])
prof_t2 = pd.read_table("abcd_lpksad01.txt", skiprows=[1])

#select t2
prof_t2 = prof_t2.loc[prof_t2['eventname'] == '2_year_follow_up_y_arm_1']

#select columns
prof_base = prof_base.loc[:, ['subjectkey', 'kbi_p_c_mh_sa']]
prof_t2 = prof_t2.loc[:, ['subjectkey', 'kbi_p_c_mh_sa_l']]

In [8]:
#import CBCL
cbcl_df = pd.read_csv("abcd_cbcl01.csv", skiprows=[1])

#drop columns we do not need
cbcl_df = cbcl_df.drop(['src_subject_id', 'cbcl_select_language___1', 'dataset_id', 'collection_id', 'abcd_cbcl01_id', 'interview_date', 'collection_title', 'sex', 'interview_age'], axis=1)

#filter only baseline
cbcl_df = cbcl_df.loc[cbcl_df['eventname'] == "baseline_year_1_arm_1"]

In [9]:
#neuroimaging
qc_df = pd.read_table('abcd_imgincl01.txt') #quality control data
networks_df = pd.read_csv('mrirscor02.csv', skiprows=[1]) #rsfmri data

#select only baseline measures from quality control and rsfmri data
qc_df = qc_df.loc[qc_df['eventname'] == 'baseline_year_1_arm_1']
networks_df = networks_df.loc[networks_df['eventname'] == 'baseline_year_1_arm_1']

#extract relevant columns
qc_df = qc_df.loc[:, ['subjectkey', 'imgincl_rsfmri_include']]
networks_df = networks_df.drop(columns=['collection_title', 'collection_id', 'mrirscor02_id', 'dataset_id', 'src_subject_id', 'interview_date', 'interview_age', 'sex', 'eventname', 'rsfmri_cor_ngd_scs_visitid', 'rsfmri_cor_ngd_scs_tr', 'rsfmri_cor_ngd_scs_numtrs', 'rsfmri_cor_ngd_scs_nvols', 'rsfmri_cor_ngd_scs_stnvols', 'rsfmri_cor_ngd_scs_stcgnvols', 'rsfmri_cor_ngd_scs_ntpoints', 'rsfmri_cor_ngd_scs_meanmn', 'rsfmri_cor_ngd_scs_maxmn', 'rsfmri_cor_ngd_scs_meantrans', 'rsfmri_cor_ngd_scs_maxtrans', 'rsfmri_cor_ngd_scs_meanrot', 'rsfmri_cor_ngd_scs_maxrot'])

In [10]:
#combine all features into one df
dfs = [ksads_2, env_parent_2, env_youth_2, fam_his_1_2, fam_his_2_2, pdem_2, crpbi_2, par_mon_2, ptsd_df, ptsd_df_t2, prof_base, prof_t2, site, parental_income, iq, cbcl_df]
combined_df = reduce(lambda left,right: pd.merge(left, right, on=['subjectkey'], how='outer'), dfs)

In [11]:
combined_df

Unnamed: 0,subjectkey,ksads_ptsd_raw_762_p,ksads_ptsd_raw_763_p,ksads_ptsd_raw_767_p,ksads_ptsd_raw_768_p,ksads_ptsd_raw_766_p,fam_enviro3_p,fam_enviro6_p,fes_youth_q3,fes_youth_q6,...,cbcl_q104_p,cbcl_q105_p,cbcl_q106_p,cbcl_q107_p,cbcl_q108_p,cbcl_q109_p,cbcl_q110_p,cbcl_q111_p,cbcl_q112_p,eventname
0,NDAR_INVPNJ3GVUY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,baseline_year_1_arm_1
1,NDAR_INV5FUBUX54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
2,NDAR_INV5F786HZ9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,baseline_year_1_arm_1
3,NDAR_INVZJB6P8ZB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,baseline_year_1_arm_1
4,NDAR_INVWFEJT579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,baseline_year_1_arm_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11871,NDAR_INVFM8FB6B7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11872,NDAR_INVW36EDDBL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11873,NDAR_INVL1Y05H4Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11874,NDAR_INVAER4MX1D,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,baseline_year_1_arm_1


# Dealing with NaN values

In [12]:
#dealing with NaN
#create mask
combined_mask = combined_df.isnull()

#add subjectkeys to mask
combined_mask['subjectkey'] = combined_df['subjectkey']

#for calculating which participants have missing value on every question of subscale
def list_missing(df, col):
    """function that takes dataframe and column indexes of subscale (list) as input, calculates which participants"""
    """have a missing value on all questions of subscale and returns list of those participants"""
    df['col'] = df.loc[:, col].sum(axis=1)
    missing = df[df['col'] == len(col)]
    missing = list(missing['subjectkey'])
    return missing

#calculate which participants have missing value on variable
def list_missing_2(df2, col2):
    x = df2[df2[col2] == True]
    x = list(x['subjectkey'])
    return x


In [13]:
#ACE
physical_abuse = ['ksads_ptsd_raw_762_p', 'ksads_ptsd_raw_763_p']
sexual_abuse = ['ksads_ptsd_raw_767_p', 'ksads_ptsd_raw_768_p']
mother_violently = ['ksads_ptsd_raw_766_p', 'fam_enviro3_p', 'fam_enviro6_p', 'fes_youth_q3', 'fes_youth_q6'] #mother treated violently
substance = ['famhx_4d_p___1', 'famhx_4d_p___2', 'famhx_4d_p___3', 'famhx_4d_p___4', 'famhx_4d_p___5', 'famhx_4d_p___6', 'famhx_4d_p___7', 'famhx4a_p___1', 'famhx4a_p___2', 'famhx4a_p___3', 'famhx4a_p___4', 'famhx4a_p___5', 'famhx4a_p___6', 'famhx4a_p___7'] #substance abuse in the household
mental = ['fam_history_6_yes_no', 'fam_history_13_yes_no'] #household mental illness 
divorce = ['demo_prnt_marital_v2'] #divorce/seperation
emotional_neglect = ['crpbi_parent4_y']
physical_neglect = ['parent_monitor_q1_y', 'parent_monitor_q3_y']

physical_abuse_missing = list_missing(combined_mask, physical_abuse)
sexual_abuse_missing = list_missing(combined_mask, sexual_abuse)
mother_violently_missing = list_missing(combined_mask, mother_violently)
substance_missing = list_missing(combined_mask, substance)
mental_missing = list_missing(combined_mask, mental)
divorce_missing = list_missing(combined_mask, divorce)
emotional_neglect_missing = list_missing(combined_mask, emotional_neglect)
physical_neglect_missing = list_missing(combined_mask, physical_neglect)

demo_age_missing = list_missing_2(combined_mask, 'interview_age')
demo_sex_missing = list_missing_2(combined_mask, 'sex')

#ptsd #base
ptsd_missing = []
for i in ['ksads_21_923_p','ksads_21_924_p', 'ksads_21_921_p', 'ksads_21_922_p']:
    missing2 = combined_mask[combined_mask.loc[:, i] == True]
    ptsd_missing.append(list(missing2['subjectkey']))

ptsd_missing = [item for sublist in ptsd_missing for item in sublist] #from lists of lists to list

#ptsd t2
ptsd_t2_missing = []
for i in ['ksads_21_923_p_t2','ksads_21_924_p_t2', 'ksads_21_921_p_t2', 'ksads_21_922_p_t2']:
    missing_ptsd = combined_mask[combined_mask.loc[:, i] == True]
    ptsd_t2_missing.append(list(missing_ptsd['subjectkey']))

ptsd_t2_missing = [item for sublist in ptsd_t2_missing for item in sublist] #from lists of lists to list


#ever went to mental health professional baseline
prof_base_missing = list_missing_2(combined_mask, 'kbi_p_c_mh_sa')

#ever went to mental health professional t2
prof_t2_missing = list_missing_2(combined_mask, 'kbi_p_c_mh_sa_l')

#parental income missing
parental_income_missing = list_missing_2(combined_mask, 'demo_comb_income_v2')

#iq missing
iq_missing = list_missing_2(combined_mask, 'nihtbx_fluidcomp_agecorrected')

In [14]:
#combine all lists of participants with missing values
all_participants = (physical_abuse_missing + sexual_abuse_missing + mother_violently_missing + substance_missing + mental_missing + divorce_missing + emotional_neglect_missing + physical_neglect_missing + demo_age_missing + demo_sex_missing + ptsd_missing + ptsd_t2_missing, prof_base_missing, prof_t2_missing + parental_income_missing + iq_missing)
all_participants = [item for sublist in all_participants for item in sublist]
participants = np.unique(all_participants)

len(participants)

2404

In [15]:
combined_df

Unnamed: 0,subjectkey,ksads_ptsd_raw_762_p,ksads_ptsd_raw_763_p,ksads_ptsd_raw_767_p,ksads_ptsd_raw_768_p,ksads_ptsd_raw_766_p,fam_enviro3_p,fam_enviro6_p,fes_youth_q3,fes_youth_q6,...,cbcl_q104_p,cbcl_q105_p,cbcl_q106_p,cbcl_q107_p,cbcl_q108_p,cbcl_q109_p,cbcl_q110_p,cbcl_q111_p,cbcl_q112_p,eventname
0,NDAR_INVPNJ3GVUY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,baseline_year_1_arm_1
1,NDAR_INV5FUBUX54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
2,NDAR_INV5F786HZ9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,baseline_year_1_arm_1
3,NDAR_INVZJB6P8ZB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,baseline_year_1_arm_1
4,NDAR_INVWFEJT579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,baseline_year_1_arm_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11871,NDAR_INVFM8FB6B7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11872,NDAR_INVW36EDDBL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11873,NDAR_INVL1Y05H4Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11874,NDAR_INVAER4MX1D,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,baseline_year_1_arm_1


In [16]:
#remove participants with missing values and create new df
combined_df_2 = combined_df[~combined_df['subjectkey'].isin(participants)]

In [17]:
combined_df_2

Unnamed: 0,subjectkey,ksads_ptsd_raw_762_p,ksads_ptsd_raw_763_p,ksads_ptsd_raw_767_p,ksads_ptsd_raw_768_p,ksads_ptsd_raw_766_p,fam_enviro3_p,fam_enviro6_p,fes_youth_q3,fes_youth_q6,...,cbcl_q104_p,cbcl_q105_p,cbcl_q106_p,cbcl_q107_p,cbcl_q108_p,cbcl_q109_p,cbcl_q110_p,cbcl_q111_p,cbcl_q112_p,eventname
1,NDAR_INV5FUBUX54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
2,NDAR_INV5F786HZ9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,baseline_year_1_arm_1
3,NDAR_INVZJB6P8ZB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,baseline_year_1_arm_1
4,NDAR_INVWFEJT579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,baseline_year_1_arm_1
5,NDAR_INVG059BTMJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,baseline_year_1_arm_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11870,NDAR_INVNFVA3GM5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11871,NDAR_INVFM8FB6B7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11872,NDAR_INVW36EDDBL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11874,NDAR_INVAER4MX1D,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,baseline_year_1_arm_1


In [18]:
#drop participants with that answered 'refuse to answer' on divorce question (61 participants)
combined_df_2 = combined_df_2[combined_df_2.demo_prnt_marital_v2 != 777.0]

#drop participants that had 'not sure' to have ever seen a mental professional (50 participants)
combined_df_2 = combined_df_2[combined_df_2.kbi_p_c_mh_sa != 3.0]

#drop participants with refuse to answer to seen mental professional since last visit (10 participants)
#drop participants that had 'not sure' (99 participants)
combined_df_2 = combined_df_2[combined_df_2.kbi_p_c_mh_sa_l != 777.0]
combined_df_2 = combined_df_2[combined_df_2.kbi_p_c_mh_sa_l != 3.0]

#drop participants that had module not administered on ptsd question (1 participant)
combined_df_2 = combined_df_2[combined_df_2.ksads_21_922_p_t2 != 888.0]

In [19]:
combined_df_2

Unnamed: 0,subjectkey,ksads_ptsd_raw_762_p,ksads_ptsd_raw_763_p,ksads_ptsd_raw_767_p,ksads_ptsd_raw_768_p,ksads_ptsd_raw_766_p,fam_enviro3_p,fam_enviro6_p,fes_youth_q3,fes_youth_q6,...,cbcl_q104_p,cbcl_q105_p,cbcl_q106_p,cbcl_q107_p,cbcl_q108_p,cbcl_q109_p,cbcl_q110_p,cbcl_q111_p,cbcl_q112_p,eventname
1,NDAR_INV5FUBUX54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
2,NDAR_INV5F786HZ9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,baseline_year_1_arm_1
3,NDAR_INVZJB6P8ZB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,baseline_year_1_arm_1
4,NDAR_INVWFEJT579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,baseline_year_1_arm_1
5,NDAR_INVG059BTMJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,baseline_year_1_arm_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11870,NDAR_INVNFVA3GM5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11871,NDAR_INVFM8FB6B7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,baseline_year_1_arm_1
11872,NDAR_INVW36EDDBL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,baseline_year_1_arm_1
11874,NDAR_INVAER4MX1D,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,baseline_year_1_arm_1


# create ACE-scores

In [20]:
#calculate ace scores
def check_ace(df, colname):
    "takes dataframe and name of column as input. Checks if variables in column are > 0 (yes, 1) or not (no, 0) and appends this to list "
    check = []
    for i in df[colname]:
        if i > 0:
            check.append(1)
        else: 
            check.append(0)
    return check    

def check_divorce(df, colname):
    "takes dataframe and name of column as input. checks if variables in columns are 3 (divorced) or 4 (seperated)"
    "if divorced or seperated a 1 is added to a new list, otherwise 0 is added "
    check = []
    for i in df[colname]:
        if i == 3 or i == 4:
            check.append(1)
        else:
            check.append(0)
    return check

#works for emotional neglect
def check_emotional(df, colname):
    "takes dataframe and name column as input. Checks if variable is 1 (not like him/her) and gives 1 if this is true"
    "Other variabels get a 0"
    check = []
    for i in df[colname]:
        if i == 1:
            check.append(1)
        else:
            check.append(0)
    return check

#works for physical neglect
def check_physical(df, colname):
    "takes dataframe and column name as input. Checks if variable is 1 (never) or 2 (almost never)"
    "if so, a 1 is added to a new list, otherwise a 0 is added"
    check = []
    for i in df[colname]:
        if i == 1 or i == 2:
            check.append(1)
        else:
            check.append(0)
    return check

#physical abuse
combined_df_2 = combined_df_2.assign(physical_sum = combined_df.loc[:, physical_abuse].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(physical_a = check_ace(combined_df_2, 'physical_sum'))

#sexual abuse
combined_df_2 = combined_df_2.assign(sexual_sum = combined_df.loc[:, sexual_abuse].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(sexual_a= check_ace(combined_df_2, 'sexual_sum'))

#mother treated violently
combined_df_2 = combined_df_2.assign(mother_v_sum = combined_df.loc[:, mother_violently].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(mother_v = check_ace(combined_df_2, 'mother_v_sum'))

#substance abuse in household
combined_df_2 = combined_df_2.assign(substance_sum = combined_df_2.loc[:, substance].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(substance_a = check_ace(combined_df_2, 'substance_sum'))

#household mentall illness
combined_df_2 = combined_df_2.assign(mental_sum = combined_df_2.loc[:, mental].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(mental_i = check_ace(combined_df_2, 'mental_sum'))

#divorce/seperation
combined_df_2 = combined_df_2.assign(divorce_s = check_divorce(combined_df_2, 'demo_prnt_marital_v2'))

#emotional neglect
combined_df_2 = combined_df_2.assign(emotional_n = check_emotional(combined_df_2, 'crpbi_parent4_y'))

#physical_neglect
combined_df_2 = combined_df_2.assign(physical_1 = check_physical(combined_df_2, 'parent_monitor_q1_y'))
combined_df_2 = combined_df_2.assign(physical_2 = check_physical(combined_df_2, 'parent_monitor_q3_y'))
combined_df_2 = combined_df_2.assign(physical_neg_sum = combined_df_2.loc[:, ['physical_1', 'physical_2']].fillna(0).sum(axis=1))
combined_df_2 = combined_df_2.assign(physical_n = check_ace(combined_df_2, 'physical_neg_sum'))

#total ace score
combined_df_2 = combined_df_2.assign(ace_score = combined_df_2.loc[:, ['physical_a', 'sexual_a', 'mother_v', 'substance_a', 'mental_i', 'divorce_s', 'emotional_n', 'physical_n']].sum(axis=1))

# model 1

In [21]:
#create variables for ptsd_base and ptsd_t2 
combined_df_2 = combined_df_2.assign(ptsd_base_sum = combined_df_2.loc[:, ['ksads_21_921_p', 'ksads_21_922_p', 'ksads_21_923_p', 'ksads_21_924_p']].sum(axis=1))
combined_df_2 = combined_df_2.assign(ptsd_t2_sum = combined_df_2.loc[:, ['ksads_21_921_p_t2', 'ksads_21_922_p_t2', 'ksads_21_923_p_t2', 'ksads_21_924_p_t2']].sum(axis=1))
combined_df_2 = combined_df_2.assign(ptsd_base = check_ace(combined_df_2, 'ptsd_base_sum'))
combined_df_2 = combined_df_2.assign(ptsd_t2 = check_ace(combined_df_2, 'ptsd_t2_sum'))

In [22]:
#remove participants thad had ptsd at baseline
combined_df_2 = combined_df_2[combined_df_2.ptsd_base != 1]

#create target. healthy control if not been to mental professional at baseline AND t2.
#ptsd if not been to mental professional at baseline and developed PTSD at t2. 
for index, row in combined_df_2.iterrows():
    if row['kbi_p_c_mh_sa'] == 0 and row['kbi_p_c_mh_sa_l'] == 2.0:
        combined_df_2.loc[index, 'target'] = 0
    elif row['kbi_p_c_mh_sa'] == 0 and row['ptsd_t2'] == 1:
        combined_df_2.loc[index, 'target'] = 1

In [23]:
combined_df_2.target.value_counts()

0.0    6961
1.0      57
Name: target, dtype: int64

In [24]:
combined_df_2.shape

(8845, 187)

In [25]:
#change sex to binary coding
combined_df_2['sex'] = combined_df_2['sex'].replace({'F': 0, 'M': 1})

In [26]:
#restore row counts
combined_df_2 = combined_df_2.reset_index(drop=True)

In [27]:
#create final df
drop_columns = ['ksads_ptsd_raw_762_p',
 'ksads_ptsd_raw_763_p',
 'ksads_ptsd_raw_767_p',
 'ksads_ptsd_raw_768_p',
 'ksads_ptsd_raw_766_p',
 'fam_enviro3_p',
 'fam_enviro6_p',
 'fes_youth_q3',
 'fes_youth_q6',
 'famhx_4d_p___1',
 'famhx_4d_p___2',
 'famhx_4d_p___3',
 'famhx_4d_p___4',
 'famhx_4d_p___5',
 'famhx_4d_p___6',
 'famhx_4d_p___7',
 'famhx4a_p___1',
 'famhx4a_p___2',
 'famhx4a_p___3',
 'famhx4a_p___4',
 'famhx4a_p___5',
 'famhx4a_p___6',
 'famhx4a_p___7',
 'fam_history_6_yes_no',
 'fam_history_13_yes_no',
 'demo_prnt_marital_v2',
 'crpbi_parent4_y',
 'parent_monitor_q1_y',
 'parent_monitor_q3_y',
 'ksads_21_923_p',
 'ksads_21_924_p',
 'ksads_21_921_p',
 'ksads_21_922_p',
 'ksads_21_923_p_t2',
 'ksads_21_924_p_t2',
 'ksads_21_921_p_t2',
 'ksads_21_922_p_t2',
 'kbi_p_c_mh_sa',
 'kbi_p_c_mh_sa_l', 
'eventname',
'physical_sum',
'sexual_sum',
'mother_v_sum',
'substance_sum',
'mental_sum',
'physical_1', 
'physical_2',
'physical_neg_sum',
'ptsd_base_sum',
'ptsd_t2_sum',
'ptsd_base',
'ptsd_t2']

combined_df_3 = combined_df_2.drop(columns=drop_columns)

In [28]:
combined_df_3.target.value_counts()

0.0    6961
1.0      57
Name: target, dtype: int64

# matching model 1

In [29]:
patients = combined_df_3[combined_df_3['target'] ==  1]
controls = combined_df_3[combined_df_3['target'] ==  0]

In [30]:
#make list of neuroimaging sites that patients went to
sites = np.unique(patients['site_id_l'].to_list())
match_df = combined_df_3[combined_df_3['site_id_l'].isin(sites)]

#recount rows 
match_df = match_df.reset_index(drop=True)

In [31]:
sites

array(['site01', 'site02', 'site03', 'site04', 'site05', 'site06',
       'site07', 'site09', 'site10', 'site11', 'site12', 'site13',
       'site14', 'site15', 'site16', 'site17', 'site18', 'site21'],
      dtype='<U6')

In [32]:
#create seperate dfs for every scanning site
site_dfs = {}

for site_id in match_df['site_id_l'].unique():
    site_df = match_df.loc[match_df['site_id_l'] == site_id]
    site_dfs[site_id] = site_df
    
# iterate over each site ID and store the filtered dataframe in a variable named after site
for site_id in match_df['site_id_l'].unique():
    site_name = site_id
    site_df = site_dfs[site_name]
    
    # create separate dataframes for target == 1 and target == 0
    for target_val in [0, 1]:
        target_df = site_df.loc[site_df['target'] == target_val]
        target_name = f'{site_name}_target_{target_val}'
        globals()[target_name] = target_df

In [33]:
#matchhing for site01
targets01 = site01_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']] #three targets

# filter the site01_target_01 dataframe
site01_target_0_filtered = site01_target_0.loc[
    (site01_target_0['sex'] == 1) &
    (site01_target_0['nihtbx_fluidcomp_agecorrected'].between(87.0, 99.0)) &
    (site01_target_0['interview_age'].isin([116, 125, 129])) &
    (site01_target_0['demo_comb_income_v2'].between(8.0, 9.0))
]

matched01 = pd.concat([site01_target_1, site01_target_0_filtered])

In [34]:
#matching for site02
site02_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']] 

# filter the site02_target_0 dataframe
site02_target_0_filtered = site02_target_0.loc[    (site02_target_0['sex'].isin([0, 1]) &
    (site02_target_0['nihtbx_fluidcomp_agecorrected'].isin([78.0, 79.0, 80.0, 81.0, 82.0, 86.0, 97.0, 113.0])) &
    (site02_target_0['interview_age'].between(104, 130)) &
    (site02_target_0['demo_comb_income_v2'].between(7.0, 9.0)) &
    (site02_target_0['subjectkey'].isin(['NDAR_INV12NED96V', 'NDAR_INVUEB7ZA46', 'NDAR_INV9KF0YHH2', 'NDAR_INVGMYU4GT3']))
)]

matched02 = pd.concat([site02_target_1, site02_target_0_filtered])

In [35]:
#matching for site03
site03_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']] 

#filter site02 target 0
site03_target_0_filtered = site03_target_0[site03_target_0['subjectkey'].isin(['NDAR_INVB3L1EM9T', 'NDAR_INVGDX7FYYH', 'NDAR_INV936LMN28'])]

matched03 = pd.concat([site03_target_1, site03_target_0_filtered])



In [36]:
#matching for site04
site04_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site04_target_0_filtered = site04_target_0.loc[
    (site04_target_0['sex'] == 0) &
    (site04_target_0['nihtbx_fluidcomp_agecorrected'].between(87.0, 99.0)) &
    (site04_target_0['interview_age'] == 116) &age_std_ptsd
    (site04_target_0['demo_comb_income_v2'] == 4.0)
]

matched04 = pd.concat([site04_target_1, site04_target_0_filtered])

In [37]:
#matching for site05
site05_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site05_target_0_filtered = site05_target_0[site05_target_0['subjectkey'].isin(['NDAR_INVK9E4206N', 'NDAR_INV686DFAG4', 'NDAR_INVZC6LB4FJ', 'NDAR_INVKCZZYZB5', 'NDAR_INVR6G17MWX', 'NDAR_INVYF1F96G4'])]

matched05 = pd.concat([site05_target_1, site05_target_0_filtered])

In [38]:
#matching for site06
site06_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

#fill in the NaN value in the target group (SES) with the mode. 
mode06 = site06_target_1['demo_comb_income_v2'].mode()[0]
site06_target_1['demo_comb_income_v2'] = site06_target_1['demo_comb_income_v2'].replace(777.0, mode06)

site06_target_0_filtered = site06_target_0[site06_target_0['subjectkey'].isin(['NDAR_INV7KN1R7KK', 'NDAR_INVL41WXZ15', 'NDAR_INVB5LU9UPG', 'NDAR_INVN6R4KJPT', 'NDAR_INV4RFZF7PZ', 'NDAR_INVP3U1CUBR', 'NDAR_INVBZT4D3W2'])]

matched06 = pd.concat([site06_target_1, site06_target_0_filtered])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site06_target_1['demo_comb_income_v2'] = site06_target_1['demo_comb_income_v2'].replace(777.0, mode06)


In [39]:
#matching for site07
site07_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]


site07_target_0_filtered = site07_target_0[site07_target_0['subjectkey'].isin(['NDAR_INV58GL6F39', 'NDAR_INV808X1WYW'])]

matched07 = pd.concat([site07_target_1, site07_target_0_filtered])

In [40]:
#matching for site09
site09_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]


site09_target_0_filtered = site09_target_0.loc[
    (site09_target_0['sex'] == 0) &
    (site09_target_0['nihtbx_fluidcomp_agecorrected'].between(110.0, 135.0)) &
    (site09_target_0['interview_age'].between(115, 125)) &
    (site09_target_0['demo_comb_income_v2'] == 8.0)
]

matched09 = pd.concat([site09_target_1, site09_target_0_filtered])

In [41]:
#matching for site10
site10_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site10_target_0_filtered = site10_target_0.loc[
    (site10_target_0['sex'] == 1) &
    (site10_target_0['nihtbx_fluidcomp_agecorrected'].between(75.0, 90.0)) &
    (site10_target_0['interview_age'] == 111) &
    (site10_target_0['demo_comb_income_v2'] == 9.0)
]

matched10 = pd.concat([site10_target_1, site10_target_0_filtered])

In [42]:
#matching for site11
site11_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

#fill in the NaN value in the target group (SES) with the mode. 
mode11 = site11_target_1['demo_comb_income_v2'].mode()[0]
site11_target_1['demo_comb_income_v2'] = site11_target_1['demo_comb_income_v2'].replace(999.0, mode11)

site11_target_0_filtered = site11_target_0[site11_target_0['subjectkey']. isin(['NDAR_INVV9B0R8UM','NDAR_INV3RDMV5NE'])]

matched11 = pd.concat([site11_target_1, site11_target_0_filtered])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site11_target_1['demo_comb_income_v2'] = site11_target_1['demo_comb_income_v2'].replace(999.0, mode11)


In [43]:
#matching for site12
site12_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

#fill in the NaN value in the target group (SES) with the mode. 
mode12 = site12_target_1['demo_comb_income_v2'].mode()[0]
site12_target_1['demo_comb_income_v2'] = site12_target_1['demo_comb_income_v2'].replace(777.0, mode12)

site12_target_0_filtered = site12_target_0[site12_target_0['subjectkey'].isin(['NDAR_INVX7RUVFDV', 'NDAR_INVM5GN30FZ', 'NDAR_INVUEP9AVEL'])]
                                          
matched12 = pd.concat([site12_target_1, site12_target_0_filtered])                                  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site12_target_1['demo_comb_income_v2'] = site12_target_1['demo_comb_income_v2'].replace(777.0, mode12)


In [44]:
#matching for site13
site13_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site13_target_0_filtered = site13_target_0[site13_target_0['subjectkey'].isin(['NDAR_INVYHTJTKWD', 'NDAR_INVBJUB7H8B'])]

matched13 = pd.concat([site13_target_1, site13_target_0_filtered])

In [45]:
#matching for site14
site14_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site14_target_0_filtered = site14_target_0[site14_target_0['subjectkey'].isin(['NDAR_INV37C7BURT', 'NDAR_INVLGJH5VV9', 'NDAR_INVDMMAKV5Y'])]
                                          
matched14 = pd.concat([site14_target_1, site14_target_0_filtered])

In [46]:
#matching for site15
site15_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site15_target_0_filtered = site15_target_0[site15_target_0['subjectkey'].isin(['NDAR_INVYGA22KK4', 'NDAR_INV7MGHKUJM', 'NDAR_INVGV7AT23A'])]

matched15 = pd.concat([site15_target_1, site15_target_0_filtered])

In [47]:
#matching for site16
site16_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

#fill in the NaN value in the target group (SES) with the mode. 
mode16 = site16_target_1['demo_comb_income_v2'].mode()[0]
site16_target_1['demo_comb_income_v2'] = site16_target_1['demo_comb_income_v2'].replace(999.0, mode16)

#fill NaN in IQ column with the mean
mean16 = site16_target_1['nihtbx_fluidcomp_agecorrected'].mean()
rounded_mean16 = round(mean16, 0)
site16_target_1['nihtbx_fluidcomp_agecorrected'] = site16_target_1['nihtbx_fluidcomp_agecorrected'].replace(0.0, rounded_mean16)

site16_target_0_filtered = site16_target_0[site16_target_0['subjectkey'].isin(['NDAR_INV0UA1UW6C', 'NDAR_INVYDTVRPWV', 'NDAR_INVKXX86222', 'NDAR_INVVH54ZRW3', 'NDAR_INV37FHEA6B', 'NDAR_INV9DJ99NND', 'NDAR_INVVCLE058V'])]

matched16 = pd.concat([site16_target_1, site16_target_0_filtered])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site16_target_1['demo_comb_income_v2'] = site16_target_1['demo_comb_income_v2'].replace(999.0, mode16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site16_target_1['nihtbx_fluidcomp_agecorrected'] = site16_target_1['nihtbx_fluidcomp_agecorrected'].replace(0.0, rounded_mean16)


In [48]:
#matching for site17
site17_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site17_target_0_filtered = site17_target_0.loc[
    (site17_target_0['sex'] == 0) &
    (site17_target_0['nihtbx_fluidcomp_agecorrected'].between(90.0, 91.0)) &
    (site17_target_0['interview_age'].between(120, 125)) &
    (site17_target_0['demo_comb_income_v2'] == 8.0)
]

matched17 = pd.concat([site17_target_1, site17_target_0_filtered])

In [49]:
#matching for site18
site18_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

site18_target_0_filtered = site18_target_0[site18_target_0['subjectkey'].isin(['NDAR_INVFH4ZYJY6', 'NDAR_INVE4PWCYC0', 'NDAR_INVHP17VUZM'])]

matched18 = pd.concat([site18_target_1, site18_target_0_filtered])

In [50]:
#matching for site21
site21_target_1.loc[:, ['sex', 'interview_age', 'demo_comb_income_v2', 'nihtbx_fluidcomp_agecorrected']]

#fill in the NaN value in the target group (SES) with the mode. 
mode21 = site21_target_1['demo_comb_income_v2'].mode()[0]
site21_target_1['demo_comb_income_v2'] = site21_target_1['demo_comb_income_v2'].replace(777.0, mode21)

site21_target_0_filtered = site21_target_0[site21_target_0['subjectkey'].isin(['NDAR_INVK1YTJYND', 'NDAR_INV80Z1RNR4', 'NDAR_INVRRFZW203', 'NDAR_INVXRBWGD7W', 'NDAR_INVGD1EZA7B'])]

matched21 = pd.concat([site21_target_1, site21_target_0_filtered])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site21_target_1['demo_comb_income_v2'] = site21_target_1['demo_comb_income_v2'].replace(777.0, mode21)


In [51]:
model1 = pd.concat([matched01, matched02, matched03, matched04, matched05, matched06, matched07, matched09, matched10, matched11, matched12, matched13, matched14, matched15, matched16, matched17, matched18, matched21])

In [52]:
model1.target.value_counts()

0.0    57
1.0    57
Name: target, dtype: int64

In [53]:
model1_treatment = model1[model1['target'] ==  1]
model1_control = model1[model1['target'] ==  0]

# check the means for control and treatment
model1.groupby('target').mean()

Unnamed: 0_level_0,demo_comb_income_v2,sex,interview_age,nihtbx_fluidcomp_agecorrected,cbcl_q01_p,cbcl_q02_p,cbcl_q03_p,cbcl_q04_p,cbcl_q05_p,cbcl_q06_p,...,cbcl_q112_p,physical_a,sexual_a,mother_v,substance_a,mental_i,divorce_s,emotional_n,physical_n,ace_score
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,7.964912,0.473684,118.912281,96.877193,0.22807,0.0,0.350877,0.350877,0.070175,0.017544,...,0.333333,0.0,0.017544,0.421053,0.140351,0.684211,0.175439,0.0,0.017544,1.45614
1.0,7.719298,0.45614,118.894737,96.719298,0.280702,0.0,0.77193,0.596491,0.210526,0.035088,...,0.596491,0.017544,0.0,0.403509,0.175439,0.754386,0.105263,0.0,0.0,1.45614


In [54]:
# student's t-test for covariates
from scipy.stats import ttest_ind

#age
print('the statistics for age are:', model1_control.interview_age.mean(), model1_treatment.interview_age.mean())
_, p = ttest_ind(model1_control.interview_age, model1_treatment.interview_age)
print(f'p={p:.3f}')

#sex
print('the statistics for sex are:', model1_control.sex.mean(), model1_treatment.sex.mean())
_, p = ttest_ind(model1_control.sex, model1_treatment.sex)
print(f'p={p:.3f}')

#IQ
print('the statistics for IQ are:', model1_control.nihtbx_fluidcomp_agecorrected.mean(), model1_treatment.nihtbx_fluidcomp_agecorrected.mean())
_, p = ttest_ind(model1_control.nihtbx_fluidcomp_agecorrected, model1_treatment.nihtbx_fluidcomp_agecorrected)
print(f'p={p:.3f}')

#parental income
print('the statistics for parental income are:', model1_control.demo_comb_income_v2.mean(), model1_treatment.demo_comb_income_v2.mean())
_, p = ttest_ind(model1_control.demo_comb_income_v2, model1_treatment.demo_comb_income_v2)
print(f'p={p:.3f}')

the statistics for age are: 118.91228070175438 118.89473684210526
p=0.990
the statistics for sex are: 0.47368421052631576 0.45614035087719296
p=0.853
the statistics for IQ are: 96.87719298245614 96.71929824561404
p=0.959
the statistics for parental income are: 7.964912280701754 7.719298245614035
p=0.426


In [55]:
#export model1
model1.to_csv('model1.csv')

In [56]:
model1.target.value_counts()

0.0    57
1.0    57
Name: target, dtype: int64

In [73]:
model1_treatment.sex.value_counts()

0    31
1    26
Name: sex, dtype: int64

In [58]:
model1_control.sex.value_counts()

0    30
1    27
Name: sex, dtype: int64

# Statistical testing

In [92]:
#Gender
# Create a contingency table for gender and group
contingency_table = np.array([[sum(model1_treatment['sex'] == 0), sum(model1_treatment['sex'] == 1)],
                              [sum(model1_control['sex'] == 0), sum(model1_control['sex'] == 1)]])

# Perform the chi-square test for independence
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Print the test statistic and p-value
print("Chi-square test statistic:", chi2)
print("p-value:", p_value)


Chi-square test statistic: 0.0
p-value: 1.0


In [145]:
#Age
#Calculate total mean and SD
age_mean = model1['interview_age'].mean()
age_std = model1['interview_age'].std()

#Calculate mean and SD per group
age_mean_ptsd = model1_treatment['interview_age'].mean()
age_std_ptsd = model1_treatment['interview_age'].std()
age_mean_control = model1_control['interview_age'].mean()
age_std_control = model1_control['interview_age'].std()

#Perform independent samples t-test
t_statistic_age, p_value_age = ttest_ind(model1_treatment['interview_age'], model1_control['interview_age'])

#print results
print("t-statistic:", t_statistic_age)
print("p_value:", p_value_age)

t-statistic: -0.012939399743708535
p_value: 0.9896991616925653


In [155]:
#IQ
#calculate total mean and SD
iq_mean = model1['nihtbx_fluidcomp_agecorrected'].mean()
iq_std = model1['nihtbx_fluidcomp_agecorrected'].std()

#calculate mean and SD per group
iq_mean_ptsd = model1_treatment['nihtbx_fluidcomp_agecorrected'].mean()
iq_std_ptsd = model1_treatment['nihtbx_fluidcomp_agecorrected'].std()
iq_mean_control = model1_control['nihtbx_fluidcomp_agecorrected'].mean()
iq_std_control = model1_control['nihtbx_fluidcomp_agecorrected'].std()

#perform independent samples t-test
t_statistic_iq, p_value_iq = ttest_ind(model1_treatment['nihtbx_fluidcomp_agecorrected'], model1_control['nihtbx_fluidcomp_agecorrected'])

#print results
print("t-statistic:", t_statistic_iq)
print("p_value:", p_value_iq)

t-statistic: -0.05192633133668757
p_value: 0.9586798933374465


In [156]:
iq_std

16.161343491293188

In [122]:
#SES
t_statistic_SES, p_value_SES = ttest_ind(model1_treatment['demo_comb_income_v2'], model1_control['demo_comb_income_v2'])

#print results
print("t-statistic:", t_statistic_SES)
print("p_value:", p_value_SES)

t-statistic: -0.7992546871562387
p_value: 0.4258345171614569


In [163]:
print(model1['demo_comb_income_v2'].std())

1.6379295551883992


In [128]:
model1_treatment.columns

Index(['subjectkey', 'site_id_l', 'demo_comb_income_v2', 'sex',
       'interview_age', 'nihtbx_fluidcomp_agecorrected', 'cbcl_q01_p',
       'cbcl_q02_p', 'cbcl_q03_p', 'cbcl_q04_p',
       ...
       'physical_a', 'sexual_a', 'mother_v', 'substance_a', 'mental_i',
       'divorce_s', 'emotional_n', 'physical_n', 'ace_score', 'target'],
      dtype='object', length=135)

# model 2

In [59]:
#extract target
target = combined_df_3[['subjectkey', 'target']].copy()

In [60]:
#combine neuroimaging dfs
imaging_df = pd.merge(qc_df, networks_df, on='subjectkey')

In [61]:
#add target variable
imaging_df = pd.merge(imaging_df, target, on='subjectkey')

In [62]:
#extract participants from matched model 
matched_participants = model1['subjectkey'].tolist()

#select participants from neuroimaging df that are in subjectkey list
model2 = imaging_df[imaging_df['subjectkey'].isin(matched_participants)].copy()

In [63]:
#select participants with good imaging quality (incl)
model2 = model2.loc[imaging_df['imgincl_rsfmri_include'] == '1']

In [64]:
#check missing values in neuroimaging data
#check missing values
missing_values = model2.isna().any()
missing_values_count = model2.isna().sum()
missing_values_count.value_counts()

0    250
dtype: int64

In [65]:
#export model2
model2.to_csv('model2.csv')

In [66]:
model2.target.value_counts()

0.0    48
1.0    45
Name: target, dtype: int64

In [67]:
model2.columns.tolist()

['subjectkey',
 'imgincl_rsfmri_include',
 'rsfmri_cor_ngd_au_scs_crcxlh',
 'rsfmri_cor_ngd_au_scs_thplh',
 'rsfmri_cor_ngd_au_scs_cdelh',
 'rsfmri_cor_ngd_au_scs_ptlh',
 'rsfmri_cor_ngd_au_scs_pllh',
 'rsfmri_cor_ngd_au_scs_bs',
 'rsfmri_cor_ngd_au_scs_hplh',
 'rsfmri_cor_ngd_au_scs_aglh',
 'rsfmri_cor_ngd_au_scs_aalh',
 'rsfmri_cor_ngd_au_scs_vtdclh',
 'rsfmri_cor_ngd_au_scs_crcxrh',
 'rsfmri_cor_ngd_au_scs_thprh',
 'rsfmri_cor_ngd_au_scs_cderh',
 'rsfmri_cor_ngd_au_scs_ptrh',
 'rsfmri_cor_ngd_au_scs_plrh',
 'rsfmri_cor_ngd_au_scs_hprh',
 'rsfmri_cor_ngd_au_scs_agrh',
 'rsfmri_cor_ngd_au_scs_aarh',
 'rsfmri_cor_ngd_au_scs_vtdcrh',
 'rsfmri_cor_ngd_cerc_scs_crcxlh',
 'rsfmri_cor_ngd_cerc_scs_thplh',
 'rsfmri_cor_ngd_cerc_scs_cdelh',
 'rsfmri_cor_ngd_cerc_scs_ptlh',
 'rsfmri_cor_ngd_cerc_scs_pllh',
 'rsfmri_cor_ngd_cerc_scs_bs',
 'rsfmri_cor_ngd_cerc_scs_hplh',
 'rsfmri_cor_ngd_cerc_scs_aglh',
 'rsfmri_cor_ngd_cerc_scs_aalh',
 'rsfmri_cor_ngd_cerc_scs_vtdclh',
 'rsfmri_cor_ngd_cerc_scs

In [68]:
model2.shape

(93, 250)