### Main data cleaning

The data cleaning pipeline has five stages:
1. remove outliers using a) the interquartile range (IQR) with a cutoff of 2.5 and b) "natural" bounds from other research or experimental design
2. standardize continuous variables using StandardScaler
3. calculate interaction terms between anxiety and depression
4. stratify bad life events (top/bottom 15%) and (maybe) calculate interactions between bad and good life events
5. propagate fixed demographic variables captured at baseline across subsequent time points

The target variables for each step are specified in `data_cleaning.py` and interaction functions are defined in `interactions.py`. It is straightforward to edit either of these files to add/remove variables and interactions.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import data_cleaning
import interactions

In [2]:
output_dir = "output/"

In [3]:
todays_date = date.today().strftime('%Y%m%d')
todays_date

'20240909'

In [4]:
# Read in raw data
# NOTE: the file name embeds today's date, so the raw panel file must carry
# the same date stamp as the day this cell is run.
raw_data_file_name = f"RAW_ABCD_5.1_panel_{todays_date}.csv"
raw_data_path = output_dir + raw_data_file_name
data = pd.read_csv(raw_data_path, index_col=0, low_memory=False)

### Remove outliers

In [5]:
# IQR outlier removal (cutoff 2.5) on every column named in data_cleaning.outlier_vars
outliers = list(data_cleaning.outlier_vars.keys())
iqr_cleaned = data_cleaning.remove_outlier_IQR(data[outliers], cutoff=2.5)
data[outliers] = iqr_cleaned

In [6]:
# Logical/prior-bounds outlier removal: each column is cleaned with the
# bounds stored against its name in data_cleaning.outlier_vars.
for column, limits in data_cleaning.outlier_vars.items():
    data[column] = data_cleaning.remove_outlier_bounds(data[column], limits)

In [7]:
# One frame per assessment wave (time 0-4); each wave gets a fresh 0-based index.
t0, t1, t2, t3, t4 = (
    data.query(f'time == {wave}').reset_index(drop=True) for wave in range(5)
)

### Standardize

In [8]:
# data_cleaning.standardize is applied to each wave's frame on its own,
# and the result overwrites the same columns in place.
for wave_frame in (t0, t1, t2, t3, t4):
    wave_frame[data_cleaning.standardize_vars] = data_cleaning.standardize(wave_frame[data_cleaning.standardize_vars])

# NOTE(review): the warnings printed below come from divisions by a sample
# count inside the scaler's mean/variance update. Presumably some columns have
# no non-missing values in a given wave (rather than being all zero) - TODO confirm.

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


### Calculate interactions

In [9]:
# Add the interaction columns to every wave; .copy() gives each wave its own
# frame afterwards.
t0, t1, t2, t3, t4 = [
    interactions.add_interactions(wave_frame).copy()
    for wave_frame in (t0, t1, t2, t3, t4)
]

  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df.loc[:, 'asd_ssrs_sum'] = df.loc[:, 'avoids_eyecontact_p'] + df.loc[:, 'narrow_interests_p'] + df.loc[:, 'sensory_sensitivity_p'] + df.loc[:,'concentration_on_parts_p'] + df.loc[:, 'face_understanding']
  return _nanquantile_unchecked(
  df['asdadhd_c'] = asdadhd_interaction(df, lower, upper)
  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  return _nanquantile_unchecked(
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df.loc[:, 'asd_ssrs_sum'] = df.loc[:, 'avoids_eyecontact_p'] + df.loc[:, 'narrow_interests_p'] 

### Propagate fixed variables

In [10]:
# Baseline (t0) values of the fixed variables, keyed by subject.
fixed = t0[['subject'] + data_cleaning.fixed_vars].copy()

# Remove the per-wave copies of those columns so the baseline values can be
# merged back onto every wave.
for wave_frame in (t0, t1, t2, t3, t4):
    wave_frame.drop(columns=data_cleaning.fixed_vars, inplace=True)

In [11]:
# Left-join the baseline fixed variables onto every wave by subject; subjects
# absent from `fixed` keep their rows and get missing values in those columns.
t0, t1, t2, t3, t4 = [
    pd.merge(wave_frame, fixed, on='subject', how='left')
    for wave_frame in (t0, t1, t2, t3, t4)
]

### Join time points

In [12]:
# Stack the five waves in time order. The waves' own row labels are kept
# (no ignore_index), so index labels repeat across waves in the combined frame.
final_clean_data = pd.concat([t0, t1, t2, t3, t4], axis=0)

### Stratify ALEs and calculate parent ALE sum score

In [13]:
# Parent-reported event items that make up the ALE sum score.
parent_ale_items = [
  "car_accident_hurt_p",
  "big_accident_need_treatment_p",
  "fire_victim_p",
  "natural_disaster_victim_p",
  "terrorism_victim_p",
  "war_death_witness_p",
  "stabbing_shooting_witness_p",
  "stabbing_shooting_victim_community_p",
  "stabbing_shooting_victim_home_p",
  "beating_victim_home_p",
  "stranger_threatened_child_victim_p",
  "family_threatened_child_victim_p",
  "adult_family_fighting_victim_p",
  "domestic_child_sexually_abuse_victim_p",
  "foreign_child_sexually_abuse_victim_p",
  "peer_child_sexually_abuse_victim_p",
  "sudden_death_in_family_p"]

# Items are added one by one with `+`, so a row with any missing item gets a
# missing sum score.
ale_total = final_clean_data[parent_ale_items[0]]
for item in parent_ale_items[1:]:
  ale_total = ale_total + final_clean_data[item]

final_clean_data["ale_ksads_sum_score_p"] = ale_total

In [14]:
# Remove intersex: keep only rows whose sex code is 1 or 2
# (rows with any other code, or a missing code, are dropped)

sex_code = final_clean_data["sex"]
final_clean_data = final_clean_data[(sex_code == 1) | (sex_code == 2)]

In [15]:
final_clean_data

Unnamed: 0,subject,time,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,tb_cardsort,tb_pattern,...,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,asd_diagnosis,schizophrenia_diagnosis,ale_ksads_sum_score_p
1,NDAR_INV005V6D2C,0,,,1.0,0.894093,-1.566543,-0.037403,0.414160,0.417730,...,0.0,0.0,,,0.0,9.0,9.0,0.0,0.0,0.0
2,NDAR_INV007W6H7B,0,,,2.0,0.894093,-0.250434,0.436005,0.414160,0.055298,...,0.0,0.0,,,0.0,12.0,21.0,0.0,0.0,1.0
3,NDAR_INV00BD7VDC,0,,,2.0,-0.517482,1.504379,3.005936,-0.443090,1.051986,...,0.0,0.0,,,0.0,12.0,18.0,0.0,0.0,0.0
4,NDAR_INV00CY2MDM,0,,,1.0,-0.870376,1.065676,-1.389998,-0.179321,-0.352439,...,0.0,0.0,,,0.0,10.0,8.0,0.0,0.0,2.0
5,NDAR_INV00HEV6HB,0,,,1.0,-0.870376,-1.566543,-0.510811,-0.179321,0.417730,...,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,0.008970,-0.004988,,-0.864139,2.060190,0.383797,,1.377114,...,0.0,0.0,,,1.0,3.0,,0.0,0.0,
4750,NDAR_INVZZPKBDAC,4,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,,-1.806575,...,0.0,0.0,,,0.0,1.0,14.0,0.0,0.0,
4751,NDAR_INVZZZ2ALR6,4,-0.973298,0.979193,,0.718683,-0.740938,2.614582,,-0.027454,...,0.0,0.0,,,0.0,11.0,10.0,0.0,0.0,
4752,NDAR_INVZZZNB0XC,4,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,,0.487554,...,0.0,0.0,,1.0,1.0,2.0,12.0,0.0,0.0,


In [16]:
# rearrange columns: pull out sex and parent_income, then put them back at
# positions 2 and 3 respectively
sex, income = final_clean_data.pop("sex"), final_clean_data.pop("parent_income")

final_clean_data.insert(2, "sex", sex)
final_clean_data.insert(3, "parent_income", income)

### Exclusion

In [17]:
# Rows grouped by subject; the exclusion filters below all run on this GroupBy.
grouping = final_clean_data.groupby("subject")

In [18]:
# self-reports (b_lifeevents_ss_k): keep subjects whose score never exceeds 2

def _self_report_ale_leq_2 (time_points, first_four_only):
  """Per-subject predicate on the self-reported life-events score.

  first_four_only=True: at least 4 non-missing scores, and each of the
  subject's first four rows is <= 2 (a missing value fails the comparison).
  first_four_only=False: exactly 5 non-missing scores, all of them <= 2.
  """
  scores = time_points["b_lifeevents_ss_k"]
  n_valid = scores.notna().sum()
  if first_four_only:
    return (n_valid >= 4) and (scores.iloc[:4] <= 2).all()
  return (n_valid == 5) and (scores <= 2).all()

self_report_ale_4 = grouping.filter(lambda time_points: _self_report_ale_leq_2(time_points, True))
self_report_ale_5 = grouping.filter(lambda time_points: _self_report_ale_leq_2(time_points, False))
print(self_report_ale_4["subject"].nunique())
print(self_report_ale_5["subject"].nunique())

3996
1576


In [19]:
# parent reports (b_lifeevents_ss_p): exclude subjects with a score above 2

def _parent_ale_leq_2 (time_points, first_four_only):
  """Per-subject predicate on the parent-reported life-events score.

  first_four_only=True: at least 4 non-missing scores, and each of the
  subject's first four rows is <= 2 (a missing value fails the comparison).
  first_four_only=False: exactly 5 non-missing scores, all of them <= 2.
  """
  scores = time_points["b_lifeevents_ss_p"]
  n_valid = scores.notna().sum()
  if first_four_only:
    return (n_valid >= 4) and (scores.iloc[:4] <= 2).all()
  return (n_valid == 5) and (scores <= 2).all()

parent_report_ale_4_more_than_2 = grouping.filter(lambda time_points: _parent_ale_leq_2(time_points, True))
parent_report_ale_5_more_than_2 = grouping.filter(lambda time_points: _parent_ale_leq_2(time_points, False))

print(parent_report_ale_4_more_than_2["subject"].nunique())
print(parent_report_ale_5_more_than_2["subject"].nunique())

7734
3349


In [20]:
# parent reports (b_lifeevents_ss_p): exclude subjects with a score above 1

def _parent_ale_leq_1 (time_points, first_four_only):
  """Per-subject predicate on the parent-reported life-events score.

  first_four_only=True: at least 4 non-missing scores, and each of the
  subject's first four rows is <= 1 (a missing value fails the comparison).
  first_four_only=False: exactly 5 non-missing scores, all of them <= 1.
  """
  scores = time_points["b_lifeevents_ss_p"]
  n_valid = scores.notna().sum()
  if first_four_only:
    return (n_valid >= 4) and (scores.iloc[:4] <= 1).all()
  return (n_valid == 5) and (scores <= 1).all()

parent_report_ale_4_more_than_1 = grouping.filter(lambda time_points: _parent_ale_leq_1(time_points, True))
parent_report_ale_5_more_than_1 = grouping.filter(lambda time_points: _parent_ale_leq_1(time_points, False))

print(parent_report_ale_4_more_than_1["subject"].nunique())
print(parent_report_ale_5_more_than_1["subject"].nunique())

6069
2506


In [21]:
# parent reports (b_lifeevents_ss_p): exclude subjects with any non-zero score

def _parent_ale_all_zero (time_points, first_four_only):
  """Per-subject predicate on the parent-reported life-events score.

  first_four_only=True: at least 4 non-missing scores, and each of the
  subject's first four rows equals 0 (a missing value fails the comparison).
  first_four_only=False: exactly 5 non-missing scores, all of them equal to 0.
  """
  scores = time_points["b_lifeevents_ss_p"]
  n_valid = scores.notna().sum()
  if first_four_only:
    return (n_valid >= 4) and (scores.iloc[:4] == 0).all()
  return (n_valid == 5) and (scores == 0).all()

parent_report_ale_4_more_than_0 = grouping.filter(lambda time_points: _parent_ale_all_zero(time_points, True))
parent_report_ale_5_more_than_0 = grouping.filter(lambda time_points: _parent_ale_all_zero(time_points, False))

print(parent_report_ale_4_more_than_0["subject"].nunique())
print(parent_report_ale_5_more_than_0["subject"].nunique())

3199
1231


In [22]:
# parent reports

def f (time_points):
  """Return True when the subject's last non-missing parent ALE sum score is 0.

  Parameters
  ----------
  time_points : DataFrame
      All rows of one subject, in row order; must contain the column
      "ale_ksads_sum_score_p".

  Returns
  -------
  bool
      True if the last non-missing score (by row position) equals 0;
      False if it is non-zero or if every score is missing.
  """
  # Select by position, not by index label. Index labels repeat across waves
  # in the combined frame, so a label lookup on last_valid_index() can return
  # several rows of the same subject and test the wrong values.
  valid_scores = time_points["ale_ksads_sum_score_p"].dropna()
  if valid_scores.empty:
    return False
  return bool(valid_scores.iloc[-1] == 0)

# Keep every row of the subjects for which f returns True, then report how
# many distinct subjects remain.
parent_reports = grouping.filter(f)
print(parent_reports["subject"].nunique())

7710


In [23]:
# HIGHER ADI MEANS MORE DISADVANTAGED, so take out the top people
# national -> compute using national percentages
# otherwise compute using sample proportion

national = True    # True: filter on area_deprivation_idx_perc (g1); False: on the sample quantile (g2)
adis = None        # filled in below with the rows of the subjects that pass the ADI filter
threshold = 0.2    # fraction used for the cutoff (0.2 -> percentile cutoff of 80 in g1)

def g1 (time_points):
  """Return True when the subject's first non-missing ADI percentile is within the cutoff.

  Parameters
  ----------
  time_points : DataFrame
      All rows of one subject, in row order; must contain the column
      "area_deprivation_idx_perc".

  Returns
  -------
  bool
      True if the first non-missing percentile (by row position) is
      <= (1 - threshold) * 100; False if it is larger or if every value is
      missing.
  """
  # Select by position, not by index label. Index labels repeat across waves
  # in the combined frame, so a label lookup on first_valid_index() can return
  # several rows of the same subject (including missing ones) and wrongly
  # reject the subject.
  valid_percentiles = time_points["area_deprivation_idx_perc"].dropna()
  if valid_percentiles.empty:
    return False
  return bool(valid_percentiles.iloc[0] <= (1 - threshold) * 100)

# Sample-based cutoff: the `threshold` quantile of the time == 0 ADI values in `data`.
t0_adi_quantile = data.query("time == 0")["area_deprivation_idx"].quantile(threshold)

def g2 (time_points):
  """Return True when the subject's first non-missing ADI is at or above the sample cutoff.

  Parameters
  ----------
  time_points : DataFrame
      All rows of one subject, in row order; must contain the column
      "area_deprivation_idx".

  Returns
  -------
  bool
      True if the first non-missing ADI (by row position) is
      >= t0_adi_quantile; False if it is smaller or if every value is missing.
  """
  # NOTE(review): this keeps subjects at or above the lower quantile, i.e. it
  # drops the lowest-ADI subjects, while g1 drops the highest national
  # percentiles - confirm which direction is intended.
  # Select by position, not by index label. Index labels repeat across waves
  # in the combined frame, so a label lookup on first_valid_index() can return
  # several rows of the same subject.
  valid_adi = time_points["area_deprivation_idx"].dropna()
  if valid_adi.empty:
    return False
  return bool(valid_adi.iloc[0] >= t0_adi_quantile)

# Apply the ADI filter chosen by `national` and keep the rows of the subjects that pass.
if national:
  adis = grouping.filter(g1)
else:
  # The label is derived from `threshold`; it was previously hard-coded to
  # "10%" even though the quantile is taken at `threshold`.
  print(f"{threshold:.0%} quantile: {t0_adi_quantile}")
  adis = grouping.filter(g2)

print(adis["subject"].nunique())
adis

9658


Unnamed: 0,subject,time,sex,parent_income,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,...,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,asd_diagnosis,schizophrenia_diagnosis,ale_ksads_sum_score_p
3,NDAR_INV00BD7VDC,0,1.0,10.0,,,2.0,-0.517482,1.504379,3.005936,...,0.0,0.0,,,0.0,12.0,18.0,0.0,0.0,0.0
4,NDAR_INV00CY2MDM,0,1.0,6.0,,,1.0,-0.870376,1.065676,-1.389998,...,0.0,0.0,,,0.0,10.0,8.0,0.0,0.0,2.0
5,NDAR_INV00HEV6HB,0,1.0,,,,1.0,-0.870376,-1.566543,-0.510811,...,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,0.0,0.0
6,NDAR_INV00J52GPG,0,1.0,6.0,,,1.0,0.776462,-0.542902,0.638895,...,0.0,0.0,,1.0,0.0,0.0,,0.0,0.0,0.0
7,NDAR_INV00LH735Y,0,1.0,6.0,,,1.0,-0.517482,-0.542902,-1.254739,...,0.0,0.0,,,0.0,6.0,12.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,1.0,9.0,0.008970,-0.004988,,-0.864139,2.060190,0.383797,...,0.0,0.0,,,1.0,3.0,,0.0,0.0,
4750,NDAR_INVZZPKBDAC,4,2.0,10.0,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,...,0.0,0.0,,,0.0,1.0,14.0,0.0,0.0,
4751,NDAR_INVZZZ2ALR6,4,2.0,10.0,-0.973298,0.979193,,0.718683,-0.740938,2.614582,...,0.0,0.0,,,0.0,11.0,10.0,0.0,0.0,
4752,NDAR_INVZZZNB0XC,4,2.0,3.0,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,...,0.0,0.0,,1.0,1.0,2.0,12.0,0.0,0.0,


In [24]:
def filter_var_func (x):
  """Build a per-subject predicate that is True when none of the columns in
  `x` holds a truthy value in any of the subject's rows (missing values are
  skipped by `any`)."""
  def no_flag_raised (time_points):
    flagged = time_points[x].any(axis=None)
    return not flagged
  return no_flag_raised

In [25]:
# family conflict: keep subjects with no truthy frequent_family_conflict_p value in any row

no_family_conflict = filter_var_func(["frequent_family_conflict_p"])
fam_conflict = grouping.filter(no_family_conflict)
print(fam_conflict["subject"].nunique())
fam_conflict

8027


Unnamed: 0,subject,time,sex,parent_income,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,...,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,asd_diagnosis,schizophrenia_diagnosis,ale_ksads_sum_score_p
1,NDAR_INV005V6D2C,0,2.0,,,,1.0,0.894093,-1.566543,-0.037403,...,0.0,0.0,,,0.0,9.0,9.0,0.0,0.0,0.0
2,NDAR_INV007W6H7B,0,1.0,10.0,,,2.0,0.894093,-0.250434,0.436005,...,0.0,0.0,,,0.0,12.0,21.0,0.0,0.0,1.0
3,NDAR_INV00BD7VDC,0,1.0,10.0,,,2.0,-0.517482,1.504379,3.005936,...,0.0,0.0,,,0.0,12.0,18.0,0.0,0.0,0.0
5,NDAR_INV00HEV6HB,0,1.0,,,,1.0,-0.870376,-1.566543,-0.510811,...,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,0.0,0.0
8,NDAR_INV00LJVZK2,0,1.0,1.0,,,1.0,-1.458532,-0.250434,-0.510811,...,0.0,0.0,,3.0,0.0,12.0,11.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4748,NDAR_INVZZLZCKAY,4,2.0,9.0,0.401878,-0.398661,,-0.990765,-0.167980,-0.235865,...,0.0,0.0,,3.0,0.0,6.0,10.0,0.0,0.0,
4749,NDAR_INVZZNX6W2P,4,1.0,9.0,0.008970,-0.004988,,-0.864139,2.060190,0.383797,...,0.0,0.0,,,1.0,3.0,,0.0,0.0,
4751,NDAR_INVZZZ2ALR6,4,2.0,10.0,-0.973298,0.979193,,0.718683,-0.740938,2.614582,...,0.0,0.0,,,0.0,11.0,10.0,0.0,0.0,
4752,NDAR_INVZZZNB0XC,4,2.0,3.0,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,...,0.0,0.0,,1.0,1.0,2.0,12.0,0.0,0.0,


In [26]:
# severe mental health: keep subjects with neither flag set in any row
# ksads_1_842_p (MDD_past_parent) (binary)
# ksads_10_870_p (GAD_Past_Parent) (binary)

no_severe_mh = filter_var_func(["MDD_past_parent", "GAD_Past_Parent"])
severe_mh = grouping.filter(no_severe_mh)
print(severe_mh["subject"].nunique())
severe_mh

10725


Unnamed: 0,subject,time,sex,parent_income,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,...,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,asd_diagnosis,schizophrenia_diagnosis,ale_ksads_sum_score_p
1,NDAR_INV005V6D2C,0,2.0,,,,1.0,0.894093,-1.566543,-0.037403,...,0.0,0.0,,,0.0,9.0,9.0,0.0,0.0,0.0
3,NDAR_INV00BD7VDC,0,1.0,10.0,,,2.0,-0.517482,1.504379,3.005936,...,0.0,0.0,,,0.0,12.0,18.0,0.0,0.0,0.0
4,NDAR_INV00CY2MDM,0,1.0,6.0,,,1.0,-0.870376,1.065676,-1.389998,...,0.0,0.0,,,0.0,10.0,8.0,0.0,0.0,2.0
5,NDAR_INV00HEV6HB,0,1.0,,,,1.0,-0.870376,-1.566543,-0.510811,...,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,0.0,0.0
6,NDAR_INV00J52GPG,0,1.0,6.0,,,1.0,0.776462,-0.542902,0.638895,...,0.0,0.0,,1.0,0.0,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,1.0,9.0,0.008970,-0.004988,,-0.864139,2.060190,0.383797,...,0.0,0.0,,,1.0,3.0,,0.0,0.0,
4750,NDAR_INVZZPKBDAC,4,2.0,10.0,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,...,0.0,0.0,,,0.0,1.0,14.0,0.0,0.0,
4751,NDAR_INVZZZ2ALR6,4,2.0,10.0,-0.973298,0.979193,,0.718683,-0.740938,2.614582,...,0.0,0.0,,,0.0,11.0,10.0,0.0,0.0,
4752,NDAR_INVZZZNB0XC,4,2.0,3.0,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,...,0.0,0.0,,1.0,1.0,2.0,12.0,0.0,0.0,


In [27]:
# current drug use: keep subjects with neither parent flag set in any row
# famhx_ss_fath_prob_dg_p (father_druguse): 2 values (0-1) in t0
# famhx_ss_moth_prob_dg_p (mother_druguse): 2 values (0-1) in t0
# NOTE(review): the source variable names look like family-history items
# rather than current use - confirm the section label.

no_parent_drug_use = filter_var_func(["father_druguse", "mother_druguse"])
drug_use = grouping.filter(no_parent_drug_use)
print(drug_use["subject"].nunique())
drug_use

10593


Unnamed: 0,subject,time,sex,parent_income,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,...,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,asd_diagnosis,schizophrenia_diagnosis,ale_ksads_sum_score_p
1,NDAR_INV005V6D2C,0,2.0,,,,1.0,0.894093,-1.566543,-0.037403,...,0.0,0.0,,,0.0,9.0,9.0,0.0,0.0,0.0
2,NDAR_INV007W6H7B,0,1.0,10.0,,,2.0,0.894093,-0.250434,0.436005,...,0.0,0.0,,,0.0,12.0,21.0,0.0,0.0,1.0
3,NDAR_INV00BD7VDC,0,1.0,10.0,,,2.0,-0.517482,1.504379,3.005936,...,0.0,0.0,,,0.0,12.0,18.0,0.0,0.0,0.0
5,NDAR_INV00HEV6HB,0,1.0,,,,1.0,-0.870376,-1.566543,-0.510811,...,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,0.0,0.0
6,NDAR_INV00J52GPG,0,1.0,6.0,,,1.0,0.776462,-0.542902,0.638895,...,0.0,0.0,,1.0,0.0,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,1.0,9.0,0.008970,-0.004988,,-0.864139,2.060190,0.383797,...,0.0,0.0,,,1.0,3.0,,0.0,0.0,
4750,NDAR_INVZZPKBDAC,4,2.0,10.0,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,...,0.0,0.0,,,0.0,1.0,14.0,0.0,0.0,
4751,NDAR_INVZZZ2ALR6,4,2.0,10.0,-0.973298,0.979193,,0.718683,-0.740938,2.614582,...,0.0,0.0,,,0.0,11.0,10.0,0.0,0.0,
4752,NDAR_INVZZZNB0XC,4,2.0,3.0,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,...,0.0,0.0,,1.0,1.0,2.0,12.0,0.0,0.0,


In [28]:
# Turn each filtered frame into a 0/1 per-row flag (1 = the row's subject
# survived that filter) and store the flags as columns of final_clean_data.

def _subject_flag (kept_rows):
  """Return 1 for rows of final_clean_data whose subject occurs in `kept_rows`, else 0."""
  return final_clean_data["subject"].isin(kept_rows["subject"]).astype(int)

# self-report ALE thresholds
ale_threshold_4_included = _subject_flag(self_report_ale_4)
ale_threshold_5_included = _subject_flag(self_report_ale_5)

# parent-report ALE thresholds (<= 2)
ale_threshold_5_included_parent = _subject_flag(parent_report_ale_5_more_than_2)
ale_threshold_4_included_parent = _subject_flag(parent_report_ale_4_more_than_2)

# remaining exclusion criteria
parent_report_included = _subject_flag(parent_reports)
adi_included = _subject_flag(adis)
family_conflict_included = _subject_flag(fam_conflict)
severe_mh_included = _subject_flag(severe_mh)
drug_use_included = _subject_flag(drug_use)

# parent-report ALE thresholds (<= 1 and == 0)
ale_threshold_4_included_leq_1 = _subject_flag(parent_report_ale_4_more_than_1)
ale_threshold_5_included_leq_1 = _subject_flag(parent_report_ale_5_more_than_1)

ale_threshold_4_included_leq_0 = _subject_flag(parent_report_ale_4_more_than_0)
ale_threshold_5_included_leq_0 = _subject_flag(parent_report_ale_5_more_than_0)

# Columns are added in this order.
for flag_column, flag_values in (
  ("included_ale_self_report_threshold_4", ale_threshold_4_included),
  ("included_ale_self_report_threshold_5", ale_threshold_5_included),
  ("included_ale_parent_4_leq_2", ale_threshold_4_included_parent),
  ("included_ale_parent_5_leq_2", ale_threshold_5_included_parent),
  ("ale_leq_1_p_4", ale_threshold_4_included_leq_1),
  ("ale_leq_1_p_5", ale_threshold_5_included_leq_1),
  ("ale_leq_0_p_4", ale_threshold_4_included_leq_0),
  ("ale_leq_0_p_5", ale_threshold_5_included_leq_0),
  ("included_ale_parent_report", parent_report_included),
  ("included_adi", adi_included),
  ("included_family_conflict", family_conflict_included),
  ("included_severe_mh", severe_mh_included),
  ("included_drug_use", drug_use_included),
):
  final_clean_data[flag_column] = flag_values

In [29]:
# Flag rows of kids who pass the self-report ALE filter (threshold_4, i.e.
# scores <= 2 over the first four rows) together with the parent-report, ADI,
# severe-MH and drug-use criteria. Family conflict is not part of this flag.

low_ale_criteria = [
  "included_ale_self_report_threshold_4",
  "included_ale_parent_report",
  "included_adi",
  "included_severe_mh",
  "included_drug_use"]

final_clean_data["low_ale_children"] = final_clean_data[low_ale_criteria].eq(1).all(axis=1)

In [30]:
# FINAL SAMPLES USING THE LATEST EXCLUSION CRITERIA

# Criteria shared by all four samples.
shared_criteria = [
  "included_ale_parent_report",
  "included_adi",
  "included_severe_mh",
  "included_drug_use"]
passes_shared = final_clean_data[shared_criteria].eq(1).all(axis=1)

# sample column -> parent-report ALE flag that defines it
final_samples = (
  ("low_ale_children_p", "ale_leq_0_p_4"),             # children with <= 0 ales only using data from first 4 time points
  ("low_ale_children_p_all_time", "ale_leq_0_p_5"),    # children with <= 0 ales using data from all 5 time points
  ("low_ale_children_p_1", "ale_leq_1_p_4"),           # children with <= 1 ales only using data from first 4 time points
  ("low_ale_children_p_all_time_1", "ale_leq_1_p_5"))  # children with <= 1 ales using data from all 5 time points

for sample_column, ale_flag in final_samples:
  final_clean_data[sample_column] = (final_clean_data[ale_flag] == 1) & passes_shared

lasso
elastic net
random forest

### High ALE

### Making groups for logistic regression

In [31]:
# Columns that get a three-level "<name>_grouped" companion column in the loop below.
make_groups_vars = [
  "tb_fluid", 
  "fitbit_veryactive_mins", 
  "fitbit_steps", 
  "socialmedia_hoursperday_k", 
  "area_deprivation_idx", 
  "parent_education", 
  "parent_age", 
  "weight", 
  "feelsafe_at_school_k", 
  "bdefs_lazy_p", 
  "easily_offended_p", 
  "bad_grades"]

def split_group (low_cutoff, mid_cutoff):
  """Build a mapper that bins a value into 0, 1 or 2.

  0: value <= low_cutoff
  1: low_cutoff < value < mid_cutoff
  2: everything else (value >= mid_cutoff, or a value that fails both comparisons)
  """
  def assign_bin (x):
    if x <= low_cutoff:
      return 0
    # x is already known not to be <= low_cutoff here
    return 1 if x > low_cutoff and x < mid_cutoff else 2
  return assign_bin

# Bin each variable at its 25th and 75th percentile (computed over all rows)
# and store the bin as "<var>_grouped"; missing values are left missing.
for var in make_groups_vars:
  column = final_clean_data[var]
  lower_cutoff, middle_cutoff = column.quantile(0.25), column.quantile(0.75)
  print(f"[{lower_cutoff}, {middle_cutoff}]")
  binner = split_group(lower_cutoff, middle_cutoff)
  grouped = column.map(binner, na_action='ignore')
  final_clean_data[var + "_grouped"] = grouped

[84.0, 107.0]
[-0.6151926307079078, 0.2132448026514952]
[-0.6924284284575021, 0.5749511432930807]
[0.0, 3.0]
[87.849815, 108.39492]
[15.0, 19.0]
[35.0, 44.0]
[75.89375, 116.0]
[3.0, 4.0]
[1.0, 2.0]
[2.0, 4.0]
[2.0, 5.0]


In [32]:
# make cols for cbcl score percentiles
# new cols: high_dep_cbcl, high_anx_cbcl

# quantile used as the cutoff: scores at or above it are flagged 1, others 0
q = 0.8

def stratify (cutoff):
  """Build a mapper that returns 1 for values at or above `cutoff` and 0 otherwise."""
  def at_or_above (x):
    return 1 if x >= cutoff else 0
  return at_or_above

# Depression: flag scores at or above the q-th quantile (missing stays missing).
depress_scores = final_clean_data["depress_D_p"]
dep_cutoff = depress_scores.quantile(q)
print(f"dep: {dep_cutoff}")
final_clean_data["high_dep_cbcl"] = depress_scores.map(stratify(dep_cutoff), na_action='ignore')

# Anxiety: same rule on the anxiety score.
anxiety_scores = final_clean_data["anxdisord_D_p"]
anx_cutoff = anxiety_scores.quantile(q)
print(f"anx: {anx_cutoff}")
final_clean_data["high_anx_cbcl"] = anxiety_scores.map(stratify(anx_cutoff), na_action='ignore')

dep: 0.48910079439047693
anx: 0.40895996771471055


### Save

In [33]:
# Convert columns to the best-fitting nullable dtypes (strings left as they
# are), then write the cleaned panel to the clean/ output folder.
final_clean_data = final_clean_data.convert_dtypes(convert_string=False)
print("final data has shape: " + str(final_clean_data.shape))

fname = f'{output_dir}clean/CLEAN_ABCD_5.1_panel_{todays_date}.csv'
final_clean_data.to_csv(fname)
print(f"File saved as {fname}")

final data has shape: (49134, 633)
File saved as output/clean/CLEAN_ABCD_5.1_panel_20240909.csv


### Save as .sav (with levels annotated)

In [34]:
# NOTE: these imports are only needed for the .sav export; consider moving
# them to the import cell at the top of the notebook.
import label_levels
import pyreadstat

# Value labels for the annotated levels, written alongside the data.
labels = label_levels.get_labels()
# output_dir already ends with "/", so no extra separator is added here
# (this matches how the CSV path is built).
pyreadstat.write_sav(final_clean_data, f'{output_dir}clean/abcd_data_labeled_{todays_date}.sav', variable_value_labels=labels)

too many values to unpack (expected 2)
If none, enter "0".  If you Don't know, please choose "Don't know" in what follows below Si la respuesta es "ninguno", ingrese "0".  Si no sabe, por favor escoja "no se" en lo que sigue abajo. / famhx_1   == 1 &&  isNull(fhx_3a_dk_p___999)
too many values to unpack (expected 2)
If none, enter "0".  If you Don't know, please choose "Don't know" in what follows below Si la respuesta es "ninguno", ingrese "0".  Si no sabe, por favor escoja "no se" en lo que sigue abajo. / famhx_1   == 1 &&  isNull(fhx_3b_dk_p___999)
too many values to unpack (expected 2)
0 = No; 1 = Yes / tlfb_tob_l   == '1' ||  tlfb_tob_c_l   == '1'
too many values to unpack (expected 2)
0 = No; 1 = Yes / tlfb_mj_l   == '1' ||   tlfb_mj_c_l   == '1'
too many values to unpack (expected 2)
0 = No; 1 = Yes / tlfb_alc_l   == '1' ||  tlfb_alc_c_l   == '1'
too many values to unpack (expected 2)
If he/she was not breast fed, enter "0", 1 year = 12 months, 2 years = 24 months, 3 years = 36 

In [35]:
# sample size with all exclusion criteria

using_latest_time_point = True

# Self-report flag depends on whether all 5 time points or only the first 4 are used.
self_report_flag = "included_ale_self_report_threshold_5" if using_latest_time_point else "included_ale_self_report_threshold_4"

exclusion_filter = final_clean_data[self_report_flag] == 1
for criterion in (
  "included_ale_parent_report",
  "included_adi",
  "included_family_conflict",
  "included_severe_mh",
  "included_drug_use"):
  exclusion_filter = exclusion_filter & (final_clean_data[criterion] == 1)

subset = final_clean_data[exclusion_filter]
print(f"Sample size using all exclusion criteria is: {subset['subject'].nunique()}")

Sample size using all exclusion criteria is: 665


In [36]:
final_clean_data["b_lifeevents_ss_p"].median(skipna=True)

np.float64(0.0)

In [37]:
# Sample size for each parent-report ALE flag combined with the parent-report,
# ADI, severe-MH and drug-use criteria (family conflict is not applied here).
ale_vars = ["ale_leq_0_p_4", "ale_leq_1_p_4", "ale_leq_0_p_5", "ale_leq_1_p_5"]
other_criteria = ("included_ale_parent_report", "included_adi", "included_severe_mh", "included_drug_use")

for v in ale_vars:
  exclusion_filter = final_clean_data[v] == 1
  for criterion in other_criteria:
    exclusion_filter = exclusion_filter & (final_clean_data[criterion] == 1)

  subset = final_clean_data[exclusion_filter]
  print(f"using {v}: {subset['subject'].nunique()}")

using ale_leq_0_p_4: 2075
using ale_leq_1_p_4: 3484
using ale_leq_0_p_5: 831
using ale_leq_1_p_5: 1517


Sample sizes after removing bottom 20% ADI

exclude >2 ales:
  4 time points: 3200
  5 time points: 1420

exclude >1 ales:
  4 time points: 2697
  5 time points: 1133

exclude >0 ales:
  4 time points: 1585
  5 time points: 618

In [38]:
# Number of distinct subjects flagged as low_ale_children.
final_clean_data.loc[final_clean_data["low_ale_children"], "subject"].nunique()

2213