### Main code to extract data

Before running the code, condense all necessary files into the "data" folder (running condense_files_from_folder.py will do that automatically, but you need to have the abcd data folder in this same directory)

In [1]:
import extraction
import json
import csv
import pandas as pd
import datetime

In [2]:
# load data
spec_data = None

with open("specifications/New Specifications 5.1.json") as f:
  spec_data = json.load(f)

In [3]:
# OPTIONAL: make sure that all cols are there

group_names = ["CognitiveSpecs", "DemoSpecs", "NeuralDataSpecs", "OutcomeSpecs"]

non_missing_cols = 0
missing_cols = 0

for name in group_names:
  for spec in spec_data[name]:
    with open("data/" + spec["filename"]) as f:
      file_data = csv.reader(f)
      col_names = next(file_data)
      for c in spec["cols"]:
        if c not in col_names:
          print(f"missing column: {c} not found in {spec['filename']}")
          missing_cols += 1
        else:
          non_missing_cols += 1

print(f"missing: {missing_cols}, not missing: {non_missing_cols}")

missing: 0, not missing: 586


In [4]:
output_dir = "output/"

time_periods = ['baseline', '1_year', '2_year', '3_year', '4_year']

baseline = None
one_year = None
two_year = None
three_year = None
four_year = None

for period in time_periods:
    tasks = extraction.get_cognitive_tasks(period=period, version="", spec_list=spec_data["CognitiveSpecs"])
    demo = extraction.get_demographics(period=period, version="", spec_list=spec_data["DemoSpecs"])
    neural = extraction.get_neuraldata(period=period, version="", spec_list=spec_data["NeuralDataSpecs"])
    outcomes = extraction.get_outcomes(period=period, version="", spec_list=spec_data["OutcomeSpecs"])
    
    data = pd.DataFrame([], columns=['subject']).astype("category")
    data = pd.merge(data, tasks, on='subject', how='outer')
    data = pd.merge(data, demo, on='subject', how='outer')
    data = pd.merge(data, neural, on='subject', how='outer')
    data = pd.merge(data, outcomes, on='subject', how='outer')

    filename = f'{output_dir}data_{period}.csv'
    data.to_csv(filename)

    print(f"Successfully collected data for time point: {period}, with shape {data.shape}, with filename {filename}")
    
    if period == "baseline":
        baseline = data.copy()
        baseline.insert(1, 'time', 0)
    elif period == '1_year':
        one_year = data.copy()
        one_year.insert(1, 'time', 1)
    elif period == '2_year':
        two_year = data.copy()
        two_year.insert(1, 'time', 2)  
    elif period == '3_year':
        three_year = data.copy()
        three_year.insert(1, 'time', 3)
    else:
        four_year = data.copy()
        four_year.insert(1, 'time', 4)

ERROR converting back from str to float in ph_p_meds.csv in column med1_rxnorm_p: could not convert string to float: '435 Albuterol'
ERROR converting back from str to float in ph_p_meds.csv in column med2_rxnorm_p: could not convert string to float: '1172147 Asmanex Inhalant Product'
Successfully collected data for time point: baseline, with shape (11868, 592), with filename output/data_baseline.csv
ERROR converting back from str to float in ph_p_meds.csv in column med1_rxnorm_p: could not convert string to float: '1152901 Amoxicillin Pill'
ERROR converting back from str to float in ph_p_meds.csv in column med2_rxnorm_p: could not convert string to float: '58930 Zyrtec'
Successfully collected data for time point: 1_year, with shape (11220, 592), with filename output/data_1_year.csv
ERROR converting back from str to float in ph_p_meds.csv in column med1_rxnorm_p: could not convert string to float: '711043 Vyvanse'
ERROR converting back from str to float in ph_p_meds.csv in column med2_r

  df = df.replace({555: np.nan, 777: np.nan, 888: np.nan, 999: np.nan})


Successfully collected data for time point: 4_year, with shape (4754, 592), with filename output/data_4_year.csv


In [5]:
todays_date = datetime.date.today().strftime('%Y%m%d')

### Save Data

In [6]:
# Save panel data
merged = pd.concat([baseline, one_year, two_year, three_year, four_year])
merged = merged.sort_values(by=['time', 'subject'])

print(f"Panel data has shape: {merged.shape}")
merged.to_csv(filename := f'{output_dir}RAW_ABCD_5.1_panel_{todays_date}.csv')
print(f"Saved to: {filename}")

Panel data has shape: (49151, 593)
Saved to: output/RAW_ABCD_5.1_panel_20240819.csv


In [7]:
merged["asd_diagnosis"]

0          0
1          0
2          0
3          0
4          0
        ... 
4749    <NA>
4750    <NA>
4751    <NA>
4752    <NA>
4753    <NA>
Name: asd_diagnosis, Length: 49151, dtype: Int64

In [8]:
# Save aggregated data (currently errors because there are non-numeric cols)
aggregated = merged.groupby(['subject']).mean().reset_index(drop=False)
aggregated.drop(columns=['time'], inplace=True)

print(f"Aggregated data has shape: {aggregated.shape}")
aggregated.to_csv(filename := f'{output_dir}RAW_task_demo_outcomes_aggregated_{todays_date}.csv')
print(f"Saved to: {filename}")

TypeError: agg function failed [how->mean,dtype->object]