### Main data cleaning

The data cleaning pipeline has four stages:
1. remove outliers using a) interquartile ratio with cutoff of 2.5 and b) "natural" bounds from other research or experimental design
2. standardize continuous variables using StandardScaler
3. calculate interaction terms between anxiety and depression
4. stratify bad life events (top/bottom 15%) and (maybe) calculate interactions between bad and good life events
5. propogate fixed demographic variables captured at baseline across subsequent timelines

The target variables for each step are specified in `data_cleaning.py` and interaction functions are defined in `interactions.py`. It is straightforward to edit either of these files to add/remove variables and interactions.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import data_cleaning
import interactions

In [2]:
output_dir = "output/"

In [3]:
# Read in raw data
raw_data_file_name = "RAW_ABCD_5.1_panel_20240808.csv"
data = pd.read_csv(output_dir + raw_data_file_name, index_col=0, low_memory=False)

### Remove outliers

In [4]:
# for IQR outlier removal
outliers = list(data_cleaning.outlier_vars.keys())
data[outliers] = data_cleaning.remove_outlier_IQR(data[outliers], cutoff=2.5)

In [5]:
# for logical/prior bounds outlier removal
for var, bounds in data_cleaning.outlier_vars.items():
    data[var] = data_cleaning.remove_outlier_bounds(data[var], bounds)

In [6]:
t0 = data.query('time == 0').reset_index(drop=True)
t1 = data.query('time == 1').reset_index(drop=True)
t2 = data.query('time == 2').reset_index(drop=True)
t3 = data.query('time == 3').reset_index(drop=True)
t4 = data.query('time == 4').reset_index(drop=True)

### Standardize

In [7]:
t0[data_cleaning.standardize_vars] = data_cleaning.standardize(t0[data_cleaning.standardize_vars])
t1[data_cleaning.standardize_vars] = data_cleaning.standardize(t1[data_cleaning.standardize_vars])
t2[data_cleaning.standardize_vars] = data_cleaning.standardize(t2[data_cleaning.standardize_vars])
t3[data_cleaning.standardize_vars] = data_cleaning.standardize(t3[data_cleaning.standardize_vars])
t4[data_cleaning.standardize_vars] = data_cleaning.standardize(t4[data_cleaning.standardize_vars])

# Errors mean that certain columns are all zero?

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


### Calculate interactions

In [8]:
t0 = interactions.add_interactions(t0).copy()
t1 = interactions.add_interactions(t1).copy()
t2 = interactions.add_interactions(t2).copy()
t3 = interactions.add_interactions(t3).copy()
t4 = interactions.add_interactions(t4).copy()

  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df.loc[:, 'asd_ssrs_sum'] = df.loc[:, 'avoids_eyecontact_p'] + df.loc[:, 'narrow_interests_p'] + df.loc[:, 'sensory_sensitivity_p'] + df.loc[:,'concentration_on_parts_p'] + df.loc[:, 'face_understanding']
  return _nanquantile_unchecked(
  df['asdadhd_c'] = asdadhd_interaction(df, lower, upper)
  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  return _nanquantile_unchecked(
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df.loc[:, 'asd_ssrs_sum'] = df.loc[:, 'avoids_eyecontact_p'] + df.loc[:, 'narrow_interests_p'] 

### Propogate fixed variables

In [9]:
fixed = t0[['subject'] + data_cleaning.fixed_vars].copy()
t0.drop(columns=data_cleaning.fixed_vars, inplace=True)
t1.drop(columns=data_cleaning.fixed_vars, inplace=True)
t2.drop(columns=data_cleaning.fixed_vars, inplace=True)
t3.drop(columns=data_cleaning.fixed_vars, inplace=True)
t4.drop(columns=data_cleaning.fixed_vars, inplace=True)

In [10]:
t0 = pd.merge(t0, fixed, on='subject', how='left')
t1 = pd.merge(t1, fixed, on='subject', how='left')
t2 = pd.merge(t2, fixed, on='subject', how='left')
t3 = pd.merge(t3, fixed, on='subject', how='left')
t4 = pd.merge(t4, fixed, on='subject', how='left')

### Join time points

In [11]:
todays_date = date.today().strftime('%Y%m%d')
todays_date

'20240808'

In [12]:
final_clean_data = pd.concat([t0, t1, t2, t3, t4])

### Statify ALEs and calculate parent ale sum score

In [13]:
final_clean_data["ale_ksads_sum_score_p"] = (
  final_clean_data["car_accident_hurt_p"] +
  final_clean_data["big_accident_need_treatment_p"] +
  final_clean_data["fire_victim_p"] +
  final_clean_data["natural_disaster_victim_p"] +
  final_clean_data["terrorism_victim_p"] +
  final_clean_data["war_death_witness_p"] +
  final_clean_data["stabbing_shooting_witness_p"] +
  final_clean_data["stabbing_shooting_victim_community_p"] +
  final_clean_data["stabbing_shooting_victim_home_p"] +
  final_clean_data["beating_victim_home_p"] +
  final_clean_data["stranger_threatened_child_victim_p"] +
  final_clean_data["family_threatened_child_victim_p"] +
  final_clean_data["adult_family_fighting_victim_p"] +
  final_clean_data["domestic_child_sexually_abuse_victim_p"] +
  final_clean_data["foreign_child_sexually_abuse_victim_p"] +
  final_clean_data["peer_child_sexually_abuse_victim_p"] +
  final_clean_data["sudden_death_in_family_p"])

In [14]:
# calculate kids with <= 1 self-reported ALE

final_clean_data["low_ale_children"] = (final_clean_data["b_lifeevents_affected_ss_k"] <= 2).astype(int)

### Exclusion

In [15]:
grouping = final_clean_data.groupby("subject")

In [16]:
# self-reports

self_report_ale_4 = grouping.filter(lambda time_points: (time_points["b_lifeevents_ss_k"].notna().sum() >= 4) and (time_points["b_lifeevents_ss_k"].iloc[:4] <= 2).all())
self_report_ale_5 = grouping.filter(lambda time_points: (time_points["b_lifeevents_ss_k"].notna().sum() == 5) and (time_points["b_lifeevents_ss_k"] <= 2).all())
print(self_report_ale_4["subject"].nunique())
print(self_report_ale_5["subject"].nunique())

3996
1576


In [17]:
# parent reports

def f (time_points):
  interest = time_points["ale_ksads_sum_score_p"]
  return (interest[interest.last_valid_index()] == 0).all() if interest.last_valid_index() != None else False

parent_reports = grouping.filter(f)
print(parent_reports["subject"].nunique())

7711


In [18]:
# national -> compute using national percentages
# otherwise compute using sample proportion

national = True
adis = None
threshold = 0.1

def g1 (time_points):
  interest = time_points["area_deprivation_idx_perc"]
  return (interest[interest.first_valid_index()] >= threshold * 100).all() if interest.first_valid_index() != None else False

t0_adi_quantile = data.query("time == 0")["area_deprivation_idx"].quantile(threshold)

def g2 (time_points):
  interest = time_points["area_deprivation_idx"]
  return (interest[interest.first_valid_index()] >= t0_adi_quantile).all() if interest.first_valid_index() != None else False

if national:
  adis = grouping.filter(g1)
else:
  print(f"10% quantile: {t0_adi_quantile}")
  adis = grouping.filter(g2)

print(adis["subject"].nunique())
adis

9741


Unnamed: 0,subject,time,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,tb_cardsort,tb_pattern,...,weed_during_pregnancy_p,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,ale_ksads_sum_score_p,low_ale_children
16,NDAR_INV0191C80U,0,,,2.0,0.129490,0.115152,0.638895,0.809813,0.236514,...,0.0,0.0,0.0,,,1.0,,11.0,3.0,1
17,NDAR_INV019DXLU4,0,,,2.0,-0.635113,-1.200957,-1.119479,-1.564109,-2.255208,...,0.0,0.0,0.0,,1.0,0.0,6.0,10.0,2.0,1
18,NDAR_INV01AJ15N9,0,,,2.0,0.305937,-0.908488,0.436005,-0.179321,-1.077303,...,0.0,0.0,0.0,,,1.0,14.0,12.0,0.0,1
19,NDAR_INV01D03VR7,0,,,1.0,,,,,,...,0.0,0.0,0.0,,,0.0,12.0,10.0,1.0,1
20,NDAR_INV01ELX9L6,0,,,1.0,-1.752610,0.115152,-1.254739,-0.443090,-0.125919,...,0.0,0.0,0.0,,,0.0,0.0,12.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4748,NDAR_INVZZLZCKAY,4,0.401878,-0.398661,,-0.990765,-0.167980,-0.235865,,-1.151109,...,0.0,0.0,0.0,,3.0,0.0,6.0,10.0,,0
4749,NDAR_INVZZNX6W2P,4,0.008970,-0.004988,,-0.864139,2.060190,0.383797,,1.377114,...,0.0,0.0,0.0,,,1.0,3.0,,,1
4750,NDAR_INVZZPKBDAC,4,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,,-1.806575,...,0.0,0.0,0.0,,,0.0,1.0,14.0,,1
4752,NDAR_INVZZZNB0XC,4,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,,0.487554,...,0.0,0.0,0.0,,1.0,1.0,2.0,12.0,,0


In [19]:
def filter_var_func (x):
  return lambda time_points: not time_points[x].any(axis=None)

In [20]:
# family conflict

fam_conflict = grouping.filter(filter_var_func(["frequent_family_conflict_p"]))
print(fam_conflict["subject"].nunique())
fam_conflict

8031


Unnamed: 0,subject,time,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,tb_cardsort,tb_pattern,...,weed_during_pregnancy_p,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,ale_ksads_sum_score_p,low_ale_children
0,NDAR_INV005V6D2C,0,,,1.0,0.894093,-1.566543,-0.037403,0.414160,0.417730,...,0.0,0.0,0.0,,,0.0,9.0,9.0,0.0,1
1,NDAR_INV007W6H7B,0,,,2.0,0.894093,-0.250434,0.436005,0.414160,0.055298,...,0.0,0.0,0.0,,,0.0,12.0,21.0,1.0,1
2,NDAR_INV00BD7VDC,0,,,2.0,-0.517482,1.504379,3.005936,-0.443090,1.051986,...,0.0,0.0,0.0,,,0.0,12.0,18.0,0.0,1
4,NDAR_INV00HEV6HB,0,,,1.0,-0.870376,-1.566543,-0.510811,-0.179321,0.417730,...,0.0,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,1
7,NDAR_INV00LJVZK2,0,,,1.0,-1.458532,-0.250434,-0.510811,-1.300340,0.417730,...,0.0,0.0,0.0,,3.0,0.0,12.0,11.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4748,NDAR_INVZZLZCKAY,4,0.401878,-0.398661,,-0.990765,-0.167980,-0.235865,,-1.151109,...,0.0,0.0,0.0,,3.0,0.0,6.0,10.0,,0
4749,NDAR_INVZZNX6W2P,4,0.008970,-0.004988,,-0.864139,2.060190,0.383797,,1.377114,...,0.0,0.0,0.0,,,1.0,3.0,,,1
4751,NDAR_INVZZZ2ALR6,4,-0.973298,0.979193,,0.718683,-0.740938,2.614582,,-0.027454,...,0.0,0.0,0.0,,,0.0,11.0,10.0,,0
4752,NDAR_INVZZZNB0XC,4,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,,0.487554,...,0.0,0.0,0.0,,1.0,1.0,2.0,12.0,,0


In [21]:
# severe mental health
# ksads_1_842_p (MDD_past_parent) (binary)
# ksads_10_870_p (GAD_Past_Parent) (binary)

severe_mh = grouping.filter(filter_var_func(["MDD_past_parent", "GAD_Past_Parent"]))
print(severe_mh["subject"].nunique())
severe_mh

10729


Unnamed: 0,subject,time,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,tb_cardsort,tb_pattern,...,weed_during_pregnancy_p,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,ale_ksads_sum_score_p,low_ale_children
0,NDAR_INV005V6D2C,0,,,1.0,0.894093,-1.566543,-0.037403,0.414160,0.417730,...,0.0,0.0,0.0,,,0.0,9.0,9.0,0.0,1
2,NDAR_INV00BD7VDC,0,,,2.0,-0.517482,1.504379,3.005936,-0.443090,1.051986,...,0.0,0.0,0.0,,,0.0,12.0,18.0,0.0,1
3,NDAR_INV00CY2MDM,0,,,1.0,-0.870376,1.065676,-1.389998,-0.179321,-0.352439,...,0.0,0.0,0.0,,,0.0,10.0,8.0,2.0,1
4,NDAR_INV00HEV6HB,0,,,1.0,-0.870376,-1.566543,-0.510811,-0.179321,0.417730,...,0.0,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,1
5,NDAR_INV00J52GPG,0,,,1.0,0.776462,-0.542902,0.638895,0.216333,-0.125919,...,0.0,0.0,0.0,,1.0,0.0,0.0,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,0.008970,-0.004988,,-0.864139,2.060190,0.383797,,1.377114,...,0.0,0.0,0.0,,,1.0,3.0,,,1
4750,NDAR_INVZZPKBDAC,4,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,,-1.806575,...,0.0,0.0,0.0,,,0.0,1.0,14.0,,1
4751,NDAR_INVZZZ2ALR6,4,-0.973298,0.979193,,0.718683,-0.740938,2.614582,,-0.027454,...,0.0,0.0,0.0,,,0.0,11.0,10.0,,0
4752,NDAR_INVZZZNB0XC,4,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,,0.487554,...,0.0,0.0,0.0,,1.0,1.0,2.0,12.0,,0


In [22]:
# current drug use
# famhx_ss_fath_prob_dg_p (father_druguse): 2 values (0-1) in t0
# famhx_ss_moth_prob_dg_p (mother_druguse): 2 values (0-1) in t0

drug_use = grouping.filter(filter_var_func(["father_druguse", "mother_druguse"]))
print(drug_use["subject"].nunique())
drug_use

10597


Unnamed: 0,subject,time,gd_safebets,gd_riskybets,cct,tb_picvocab,tb_flanker,tb_list,tb_cardsort,tb_pattern,...,weed_during_pregnancy_p,cocaine_during_pregnancy_p,heroin_during_pregnancy_p,drugs_during_pregnancy_p,caffeine_during_pregnancy_p,premature_birth_p,months_breastfed_p,firstwords_months_p,ale_ksads_sum_score_p,low_ale_children
0,NDAR_INV005V6D2C,0,,,1.0,0.894093,-1.566543,-0.037403,0.414160,0.417730,...,0.0,0.0,0.0,,,0.0,9.0,9.0,0.0,1
1,NDAR_INV007W6H7B,0,,,2.0,0.894093,-0.250434,0.436005,0.414160,0.055298,...,0.0,0.0,0.0,,,0.0,12.0,21.0,1.0,1
2,NDAR_INV00BD7VDC,0,,,2.0,-0.517482,1.504379,3.005936,-0.443090,1.051986,...,0.0,0.0,0.0,,,0.0,12.0,18.0,0.0,1
4,NDAR_INV00HEV6HB,0,,,1.0,-0.870376,-1.566543,-0.510811,-0.179321,0.417730,...,0.0,0.0,0.0,,1.0,0.0,0.0,12.0,0.0,1
5,NDAR_INV00J52GPG,0,,,1.0,0.776462,-0.542902,0.638895,0.216333,-0.125919,...,0.0,0.0,0.0,,1.0,0.0,0.0,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,NDAR_INVZZNX6W2P,4,0.008970,-0.004988,,-0.864139,2.060190,0.383797,,1.377114,...,0.0,0.0,0.0,,,1.0,3.0,,,1
4750,NDAR_INVZZPKBDAC,4,1.187693,-1.186006,,-0.990765,0.341316,-0.669628,,-1.806575,...,0.0,0.0,0.0,,,0.0,1.0,14.0,,1
4751,NDAR_INVZZZ2ALR6,4,-0.973298,0.979193,,0.718683,-0.740938,2.614582,,-0.027454,...,0.0,0.0,0.0,,,0.0,11.0,10.0,,0
4752,NDAR_INVZZZNB0XC,4,0.991239,-0.989170,,-1.497268,-0.167980,-1.041426,,0.487554,...,0.0,0.0,0.0,,1.0,1.0,2.0,12.0,,0


In [23]:
# ale_threshold_4 
# ale_threshold_5

ale_threshold_4_included = final_clean_data["subject"].isin(self_report_ale_4["subject"]).astype(int)
ale_threshold_5_included = final_clean_data["subject"].isin(self_report_ale_5["subject"]).astype(int)

parent_report_included = final_clean_data["subject"].isin(parent_reports["subject"]).astype(int)
adi_included = final_clean_data["subject"].isin(adis["subject"]).astype(int)
family_conflict_included = final_clean_data["subject"].isin(fam_conflict["subject"]).astype(int)
severe_mh_included = final_clean_data["subject"].isin(severe_mh["subject"]).astype(int)
drug_use_included = final_clean_data["subject"].isin(drug_use["subject"]).astype(int)

final_clean_data["included_ale_self_report_threshold_4"] = ale_threshold_4_included
final_clean_data["included_ale_self_report_threshold_5"] = ale_threshold_5_included
final_clean_data["included_ale_parent_report"] = parent_report_included
final_clean_data["included_adi"] = adi_included
final_clean_data["included_family_conflict"] = family_conflict_included
final_clean_data["included_severe_mh"] = severe_mh_included
final_clean_data["included_drug_use"] = drug_use_included

### Save

In [24]:
final_clean_data = final_clean_data.convert_dtypes(convert_string=False)
print("final data has shape: " + str(final_clean_data.shape))
final_clean_data.to_csv(fname := f'{output_dir}CLEAN_ABCD_5.1_panel_{todays_date}.csv')
print(f"File saved as {fname}")

final data has shape: (49150, 605)
File saved as output/CLEAN_ABCD_5.1_panel_20240808.csv


In [25]:
# sample size with all exclusion criteria

using_latest_time_point = True

exclusion_filter = (
  (final_clean_data["included_ale_self_report_threshold_5" if using_latest_time_point else "included_ale_self_report_threshold_4"] == 1) &
  (final_clean_data["included_ale_parent_report"] == 1) &
  (final_clean_data["included_adi"] == 1) &
  (final_clean_data["included_family_conflict"] == 1) &
  (final_clean_data["included_severe_mh"] == 1) &
  (final_clean_data["included_drug_use"] == 1))

subset = final_clean_data[exclusion_filter]
print(f"Sample size using all exclusion criteria is: {subset['subject'].nunique()}")

Sample size using all exclusion criteria is: 586
