### Main data cleaning

The data cleaning pipeline has four stages:
1. remove outliers using a) interquartile ratio with cutoff of 2.5 and b) "natural" bounds from other research or experimental design
2. standardize continuous variables using StandardScaler
3. calculate interaction terms between anxiety and depression
4. stratify bad life events (top/bottom 15%) and (maybe) calculate interactions between bad and good life events
5. propogate fixed demographic variables captured at baseline across subsequent timelines

The target variables for each step are specified in `data_cleaning.py` and interaction functions are defined in `interactions.py`. It is straightforward to edit either of these files to add/remove variables and interactions.

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import data_cleaning
import interactions

In [28]:
output_dir = "output/"

In [29]:
# Read in raw data
raw_data_file_name = "RAW_ABCD_5.0_panel_20240805.csv"
data = pd.read_csv(output_dir + raw_data_file_name, index_col=0, low_memory=False)

### Remove outliers

In [30]:
# for IQR outlier removal
outliers = list(data_cleaning.outlier_vars.keys())
data[outliers] = data_cleaning.remove_outlier_IQR(data[outliers], cutoff=2.5)

In [31]:
# for logical/prior bounds outlier removal
for var, bounds in data_cleaning.outlier_vars.items():
    data[var] = data_cleaning.remove_outlier_bounds(data[var], bounds)

In [32]:
t0 = data.query('time == 0').reset_index(drop=True)
t1 = data.query('time == 1').reset_index(drop=True)
t2 = data.query('time == 2').reset_index(drop=True)
t3 = data.query('time == 3').reset_index(drop=True)
t4 = data.query('time == 4').reset_index(drop=True)

### Standardize

In [33]:
t0[data_cleaning.standardize_vars] = data_cleaning.standardize(t0[data_cleaning.standardize_vars])
t1[data_cleaning.standardize_vars] = data_cleaning.standardize(t1[data_cleaning.standardize_vars])
t2[data_cleaning.standardize_vars] = data_cleaning.standardize(t2[data_cleaning.standardize_vars])
t3[data_cleaning.standardize_vars] = data_cleaning.standardize(t3[data_cleaning.standardize_vars])
t4[data_cleaning.standardize_vars] = data_cleaning.standardize(t4[data_cleaning.standardize_vars])

# Errors mean that certain columns are all zero?

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


### Calculate interactions

In [34]:
t0 = interactions.add_interactions(t0).copy()
t1 = interactions.add_interactions(t1).copy()
t2 = interactions.add_interactions(t2).copy()
t3 = interactions.add_interactions(t3).copy()
t4 = interactions.add_interactions(t4).copy()

  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  return _nanquantile_unchecked(
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, upper)
  df['anxadhd_c'] = anxadhd_interaction(df, lower, upper)
  df['anxocd_c'] = anxocd_interaction(df, lower, upper)
  df['crysflu_c'] = crysflu_interaction(df, lower, upper)
  df['depanx_c'] = depanx_interaction(df, lower, upper)
  df['depadhd_c'] = depadhd_interaction(df, lower, up

### Propogate fixed variables

In [35]:
fixed = t0[['subject'] + data_cleaning.fixed_vars].copy()
t0.drop(columns=data_cleaning.fixed_vars, inplace=True)
t1.drop(columns=data_cleaning.fixed_vars, inplace=True)
t2.drop(columns=data_cleaning.fixed_vars, inplace=True)
t3.drop(columns=data_cleaning.fixed_vars, inplace=True)
t4.drop(columns=data_cleaning.fixed_vars, inplace=True)

In [36]:
t0 = pd.merge(t0, fixed, on='subject', how='outer')
t1 = pd.merge(t1, fixed, on='subject', how='outer')
t2 = pd.merge(t2, fixed, on='subject', how='outer')
t3 = pd.merge(t3, fixed, on='subject', how='outer')
t4 = pd.merge(t4, fixed, on='subject', how='outer')

### Join time points and save

In [37]:
todays_date = date.today().strftime('%Y%m%d')

In [38]:
final_clean_data = pd.concat([t0, t1, t2, t3, t4])
final_clean_data = final_clean_data.convert_dtypes(convert_string=False)

In [39]:
print("final data has shape: " + str(final_clean_data.shape))
final_clean_data.to_csv(f'{output_dir}CLEAN_ABCD_5.1_panel_{todays_date}.csv')
print("File saved")

final data has shape: (59338, 590)
File saved
