# LEAP Waves 1, 2 and 3 - sorting through missing data

### Globals

In [1]:
import warnings
warnings.filterwarnings('ignore')
from __future__ import print_function
import sys
import os
import re
import glob
import subprocess
import shutil
import pandas as pd
import shutil
import xmltodict
import io
import numpy as np

# Root data directory of the project; the per-wave folders (e.g. LEAP_wave1)
# are joined onto this path when scanning for subjects.
maindir = '/project_cephfs/3022035.06/'

### Overview of DTI data across all 3 waves

In [2]:
# Enumerate leap wave directory names
leap_waves = ['LEAP_wave1', 'LEAP_wave2', 'LEAP_wave3']

# Maps each wave name to the list of subject IDs (directory names, as strings)
# that have a dtifit output directory.
dtifit_presence = {}

# Loop through each LEAP wave
for wave in leap_waves:
    rootdir = os.path.join(maindir, wave)

    # Candidate subject directories. The glob pattern only constrains the first
    # character, so additionally require an actual directory whose name is
    # entirely numeric: the IDs are converted with astype(int) further down and
    # a name such as "123_old" would make that conversion fail.
    # Sorted because glob gives no ordering guarantee, and the subject order
    # should be reproducible between runs.
    sub_dirs = sorted(
        d for d in glob.glob(os.path.join(rootdir, '[0-9]*'))
        if os.path.isdir(d) and os.path.basename(d).isdigit()
    )
    sub_ids = [os.path.basename(d) for d in sub_dirs]

    # Keep the subjects whose <subject>/DWI/preprocessing/dtifit is a directory
    # (a stray file with that name does not count as dtifit output).
    dtifit_presence[wave] = [
        subid for subid, subpath in zip(sub_ids, sub_dirs)
        if os.path.isdir(os.path.join(subpath, 'DWI', 'preprocessing', 'dtifit'))
    ]

    # Print summary
    print(f'Found {len(dtifit_presence[wave])} subjects with dtifit directory present in {wave}.')


Found 351 subjects with dtifit directory present in LEAP_wave1.
Found 213 subjects with dtifit directory present in LEAP_wave2.
Found 156 subjects with dtifit directory present in LEAP_wave3.


In [3]:
# Per-wave DataFrames (single column "subjects"), keyed by wave directory name
dtifit_dfs = {}

for wave in leap_waves:
    wave_subjects = pd.DataFrame({"subjects": dtifit_presence[wave]})
    # The IDs were collected as directory-name strings; store them as integers
    wave_subjects["subjects"] = wave_subjects["subjects"].astype(int)
    dtifit_dfs[wave] = wave_subjects

# Short aliases for the individual waves (same objects as stored in dtifit_dfs)
df_dtifit_wave1 = dtifit_dfs["LEAP_wave1"]
df_dtifit_wave2 = dtifit_dfs["LEAP_wave2"]
df_dtifit_wave3 = dtifit_dfs["LEAP_wave3"]

### Overview of clinical data across all 3 waves

In [5]:
# Clinical measures of interest, grouped by instrument.
# Names are listed without the wave prefix.
selected_symptoms = [
    # --- ADI-R ---
    # Social interaction deficits
    "adi_social_total",
    # Communication deficits
    "adi_communication_total",
    # Restricted & repetitive behaviors
    "adi_rrb_total",

    # --- Calibrated Severity Scores ---
    # Overall autism severity
    "css_total",
    # Social Affect severity
    "sa_css",
    # Restricted & Repetitive Behaviors severity
    "rrb_css",

    # --- SRS ---
    # Social responsiveness difficulties (total score)
    "srs_tscore_combined",

    # --- ADHD (parent report) ---
    # Inattention symptoms
    "adhd_inattentiv_parent",
    # Hyperactivity/impulsivity symptoms
    "adhd_hyperimpul_parent",

    # --- Repetitive Behavior Scale-Revised (RBS-R) ---
    # Total repetitive behaviors
    "rbs_total",

    # --- Strengths & Difficulties Questionnaire (SDQ) ---
    # Total behavioral difficulties
    "sdq_total_difficulties_p",

    # --- Beck inventories (self report) ---
    # Anxiety (BAI)
    "beck_anx_adulta_self",
    # Depression (BDI)
    "beck_dep_adulta_self",

    # --- Short Sensory Profile (SSP) ---
    # Total sensory processing difficulties
    "ssp_total",
    # Hyperresponsiveness to sensory stimuli
    "ssp_hype",
]


In [6]:
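# NOTE: legacy per-wave implementation, superseded by the loop-based cells below; kept for reference only.
# # Load clinical data wave 1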
# leap1_clin_path = '/project_cephfs/3022035.06/LEAP_clinical/LEAP_t1_Core clinical variables_03-09-19-withvalues.xlsx'
# df1_clinical = pd.read_excel(leap1_clin_path)
# df1_clinical.replace([999, 777, '999', '777', np.nan], pd.NA, inplace=True)
# df1_clinical['subjects'] = df1_clinical['subjects'].astype(int)
# df1_clinical = df1_clinical[df1_clinical['subjects'].isin(df_dtifit_wave1['subjects'])]
# variables_df1 = df1_clinical.columns.tolist()

# # Load clinical data wave 2
# leap2_clin_path = '/project_cephfs/3022035.06/LEAP_clinical/LEAP_t2_Core clinical variables_03-09-19-withvalues.xlsx'
# df2_clinical = pd.read_excel(leap2_clin_path)
# df2_clinical.replace([999, 777, '999', '777', np.nan], pd.NA, inplace=True)
# df2_clinical = df2_clinical[df2_clinical['t2_absence'].isna()]
# df2_clinical['subjects'] = df2_clinical['subjects'].astype(int)
# df2_clinical = df2_clinical[df2_clinical['subjects'].isin(df_dtifit_wave2['subjects'])]
# variables_df2 = df2_clinical.columns.tolist()

# # Load clinical data wave 3
# leap3_clin_path = '/home/preclineu/ramcir/Desktop/Clinical/data/LEAP_T3_CoreClinicalVariables_fixed.tsv'
# df3_clinical = pd.read_csv(leap3_clin_path, sep='\t')
# df3_clinical.replace([999, 777, '999', '777', np.nan], pd.NA, inplace=True)
# df3_clinical = df3_clinical[df3_clinical['t3_absence'].isna()]
# df3_clinical['subjects'] = df3_clinical['subjects'].astype(int)
# df3_clinical = df3_clinical[df3_clinical['subjects'].isin(df_dtifit_wave3['subjects'])]
# variables_df3 = df3_clinical.columns.tolist()

# # Join waves
# df12_clinical= df1_clinical.set_index("subjects").join(df2_clinical.set_index("subjects"), how="inner")
# df123_clinical= df12_clinical.join(df3_clinical.set_index("subjects"), how="inner")
# # Calculate time lapse between visits
# df123_clinical["t1_t2_lapse"] = df123_clinical["t2_ageyrs"] - df123_clinical["t1_ageyrs"]
# df123_clinical["t2_t3_lapse"] = df123_clinical["t3_ageyrs"] - df123_clinical["t2_ageyrs"]
# df123_clinical["t1_t3_lapse"] = df123_clinical["t3_ageyrs"] - df123_clinical["t1_ageyrs"]
# # Calculate the mean lapse for each time window
# mean_t1_t2_lapse = df123_clinical["t1_t2_lapse"].mean()
# mean_t2_t3_lapse = df123_clinical["t2_t3_lapse"].mean()
# mean_t1_t3_lapse = df123_clinical["t1_t3_lapse"].mean()
# # Display the results
# print(f"Mean T1-T2 Lapse: {mean_t1_t2_lapse:.2f} years")
# print(f"Mean T2-T3 Lapse: {mean_t2_t3_lapse:.2f} years")
# print(f"Mean T1-T3 Lapse: {mean_t1_t3_lapse:.2f} years")

In [26]:
# Clinical data file for each wave
# (Excel workbooks for waves 1 and 2, a tab-separated file for wave 3)
clinical_paths = dict(
    wave1="/project_cephfs/3022035.06/LEAP_clinical/LEAP_t1_Core clinical variables_03-09-19-withvalues.xlsx",
    wave2="/project_cephfs/3022035.06/LEAP_clinical/LEAP_t2_Core clinical variables_03-09-19-withvalues.xlsx",
    wave3="/home/preclineu/ramcir/Desktop/Clinical/data/LEAP_T3_CoreClinicalVariables_fixed.tsv",
)

# Subjects with DTI output per wave. The three DataFrames must already exist,
# i.e. the DTI overview cells have to be run before this one.
dtifit_waves = dict(
    wave1=df_dtifit_wave1,
    wave2=df_dtifit_wave2,
    wave3=df_dtifit_wave3,
)

# Name of the absence column used for filtering in each wave
absence_cols = dict(wave1="t1_absence", wave2="t2_absence", wave3="t3_absence")

# Containers filled while loading, all keyed by wave
clinical_data = {}                  # cleaned clinical DataFrame
variables = {}                      # column names of the cleaned DataFrame
initial_participant_counts = {}     # row count before any filtering
filtered_participant_counts = {}    # row count after filtering
absence_reasons = {}                # absence value counts (or a message if the column is missing)

In [26]:
# Load and process each wave: read the file, normalise missing-value codes,
# drop absent participants and keep only subjects that also have DTI output.
for wave, path in clinical_paths.items():
    # Paths ending in .xlsx are read as Excel, everything else as tab-separated text
    raw_df = pd.read_excel(path) if path.endswith(".xlsx") else pd.read_csv(path, sep="\t")

    # Replace the codes 999 / 777 (numeric or string) and NaN with pd.NA.
    # Done without inplace=True so the frame as read from disk is not mutated.
    df = raw_df.replace([999, 777, "999", "777", np.nan], pd.NA)

    # Store original participant count before filtering
    initial_participant_counts[wave] = len(df)

    # Retrieve and store absence reasons if applicable
    absence_col = absence_cols[wave]
    if absence_col in df.columns:
        absence_counts = df[absence_col].value_counts(dropna=True)
        absence_reasons[wave] = absence_counts.head(10).to_dict()  # Keep only top 10
        # Remove absent subjects (rows with any absence entry)
        df = df[df[absence_col].isna()]
    else:
        absence_reasons[wave] = "No absence column for this wave"

    # Convert 'subjects' to int. assign() returns a new frame, so the column is
    # not written onto the filtered slice created above; the original
    # df["subjects"] = ... triggered a SettingWithCopyWarning that was hidden
    # by the global warnings filter.
    df = df.assign(subjects=df["subjects"].astype(int))

    # Keep only subjects that have DTI output for this wave
    df = df[df["subjects"].isin(dtifit_waves[wave]["subjects"])]

    # Store cleaned dataframe and variable names
    clinical_data[wave] = df
    variables[wave] = df.columns.tolist()

    # Store final participant count after filtering
    filtered_participant_counts[wave] = len(df)

In [26]:
# Inner-join the three waves on the subject ID, so only participants present
# in every wave remain.
wave1_indexed = clinical_data["wave1"].set_index("subjects")
wave2_indexed = clinical_data["wave2"].set_index("subjects")
wave3_indexed = clinical_data["wave3"].set_index("subjects")
df_clinical_combined = wave1_indexed.join(wave2_indexed, how="inner")
df_clinical_combined = df_clinical_combined.join(wave3_indexed, how="inner")

# (new lapse column, earlier age column, later age column, label of the mean)
lapse_specs = (
    ("t1_t2_lapse", "t1_ageyrs", "t2_ageyrs", "Mean T1-T2 Lapse"),
    ("t2_t3_lapse", "t2_ageyrs", "t3_ageyrs", "Mean T2-T3 Lapse"),
    ("t1_t3_lapse", "t1_ageyrs", "t3_ageyrs", "Mean T1-T3 Lapse"),
)

# Time between visits = difference of the age columns of the two visits
for lapse_col, earlier_age, later_age, _ in lapse_specs:
    df_clinical_combined[lapse_col] = df_clinical_combined[later_age] - df_clinical_combined[earlier_age]

# Mean of each lapse column, keyed by a printable label
mean_lapses = {}
for lapse_col, _, _, label in lapse_specs:
    mean_lapses[label] = df_clinical_combined[lapse_col].mean()

In [26]:
# Participant counts per wave, first before and then after filtering
count_reports = (
    ("\nInitial Participant Counts (Before Filtering for Absence & DTI):", initial_participant_counts),
    ("\nFiltered Participant Counts (After Absence Removal & DTI Filtering):", filtered_participant_counts),
)
for heading, counts_by_wave in count_reports:
    print(heading)
    for wave in counts_by_wave:
        count = counts_by_wave[wave]
        print(f"Participants in {wave.capitalize()}: {count}")

# Up to ten most frequent absence values per wave, as recorded before filtering
print("\nTop 10 Reasons for Absence (before filtering):")
for wave in absence_reasons:
    reasons = absence_reasons[wave]
    print(f"\n{wave.capitalize()}:")
    if not isinstance(reasons, dict):
        # Waves without an absence column carry a plain message instead of counts
        print(f"  {reasons}")
        continue
    for reason in reasons:
        count = reasons[reason]
        print(f"  {reason}: {count}")

# Mean time between visits
print("\nMean Lapse Times:")
for key in mean_lapses:
    value = mean_lapses[key]
    print(f"{key}: {value:.2f} years")


Initial Participant Counts (Before Filtering for Absence & DTI):
Participants in Wave1: 763
Participants in Wave2: 763
Participants in Wave3: 309

Filtered Participant Counts (After Absence Removal & DTI Filtering):
Participants in Wave1: 351
Participants in Wave2: 213
Participants in Wave3: 153

Top 10 Reasons for Absence (before filtering):

Wave1:
  No absence column for this wave

Wave2:
  Not invited for T2: Rome participant: 42
  At BL: Late schedule D recruits: BL only: 30
  Unable to make contact with participant: 28
  Burden is too high: 22
  No specific reason given or recorded: 19
  Personal circumstances: 17
  Not liking (an aspect of) the study: 17
  Not invited back: 9
  Busy with job/school: 6
  At BL: Only some records (may be kept): 4

Wave3:
  11: 1
  10: 1

Mean Lapse Times:
Mean T1-T2 Lapse: 1.47 years
Mean T2-T3 Lapse: 5.71 years
Mean T1-T3 Lapse: 7.18 years


In [23]:
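# NOTE: legacy version of the variable comparison, superseded by the dictionary-based cell below; kept for reference only.
# # Remove prefixes (t1_, t2_, t3_) from variable names for comparison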
# variables_df1_cleaned = {var.split('_', 1)[1] for var in variables_df1 if '_' in var}
# variables_df2_cleaned = {var.split('_', 1)[1] for var in variables_df2 if '_' in var}
# variables_df3_cleaned = {var.split('_', 1)[1] for var in variables_df3 if '_' in var}
# # Find common variables present in all three waves
# common_variables12 = variables_df1_cleaned & variables_df2_cleaned
# common_variables23 = variables_df2_cleaned & variables_df3_cleaned
# common_variables13 = variables_df1_cleaned & variables_df3_cleaned 
# common_variables123 = variables_df1_cleaned & variables_df2_cleaned & variables_df3_cleaned
# # Find unique variables in each wave by subtracting the common ones
# unique_variables1 = variables_df1_cleaned - (variables_df2_cleaned | variables_df3_cleaned)
# unique_variables2 = variables_df2_cleaned - (variables_df1_cleaned | variables_df3_cleaned)
# unique_variables3 = variables_df3_cleaned - (variables_df1_cleaned | variables_df2_cleaned)
# # Display counts of variables
# print(f"Number of overlapping variables W1-W2: {len(common_variables12)}")
# print(f"Number of overlapping variables W2-W3: {len(common_variables23)}")
# print(f"Number of overlapping variables W1-W3: {len(common_variables13)}")
# print(f"Number of overlapping variables in all three waves: {len(common_variables123)}")
# print(f"Number of unique variables in W1: {len(unique_variables1)}")
# print(f"Number of unique variables in W2: {len(unique_variables2)}")
# print(f"Number of unique variables in W3: {len(unique_variables3)}")

In [22]:
# Make variable names comparable across waves by dropping everything up to and
# including the first underscore (intended to remove the t1_/t2_/t3_ prefix).
# Columns without any underscore (e.g. 'subjects') are left out of the comparison.
# NOTE(review): this strips whatever precedes the first underscore, not only a
# wave prefix - a column lacking the prefix would be truncated. Confirm that all
# clinical columns carry the t1_/t2_/t3_ prefix.
variables_cleaned = {
    wave: {var.split('_', 1)[1] for var in variables[wave] if '_' in var}
    for wave in variables
}

# Find common variables between pairs of waves
common_variables12 = variables_cleaned["wave1"] & variables_cleaned["wave2"]
common_variables23 = variables_cleaned["wave2"] & variables_cleaned["wave3"]
common_variables13 = variables_cleaned["wave1"] & variables_cleaned["wave3"]

# Find variables common across all three waves
common_variables123 = variables_cleaned["wave1"] & variables_cleaned["wave2"] & variables_cleaned["wave3"]

# Find variables that occur in exactly one wave
unique_variables1 = variables_cleaned["wave1"] - (variables_cleaned["wave2"] | variables_cleaned["wave3"])
unique_variables2 = variables_cleaned["wave2"] - (variables_cleaned["wave1"] | variables_cleaned["wave3"])
unique_variables3 = variables_cleaned["wave3"] - (variables_cleaned["wave1"] | variables_cleaned["wave2"])

# Row label -> variable set, in the order the rows are displayed
summary_sets = {
    "W1-W2": common_variables12,
    "W2-W3": common_variables23,
    "W1-W3": common_variables13,
    "W1-W2-W3": common_variables123,
    "Unique W1": unique_variables1,
    "Unique W2": unique_variables2,
    "Unique W3": unique_variables3,
}

# Store results in a structured DataFrame for better visualization.
# The names are sorted before joining: set iteration order is not stable
# between kernel sessions, so an unsorted join gives a different "Variables"
# text on every run.
variable_summary = pd.DataFrame({
    "Comparison": list(summary_sets),
    "Variable Count": [len(names) for names in summary_sets.values()],
    "Variables": [", ".join(sorted(names)) for names in summary_sets.values()],
})

# Display information (full column width, left-aligned cells)
pd.set_option('display.max_colwidth', None)
styled_df = variable_summary.style.set_table_styles([{"selector": "td", "props": [("text-align", "left")]}])
display(styled_df)


Unnamed: 0,Comparison,Variable Count,Variables
0,W1-W2,58,"sdq_emotional_p, iqwechrawscores_vo_raw, iqwechrawscores_si_t, ssp_audfilt, iqwechrawscores_vo_t, rbs_total, iqwechrawscores_bd_t, ssp_tactile, site, vabsdscoresc_dss, drugclass_3, ssp_total, iqtype, adhd_hyperimpul_parent, iqwechrawscores_mr_t, iqwechrawscores_si_raw, vabsabcabc_standard, vabsdscoresd_dss, sdq_internalising_p, iqwechrawscores_pc_raw, css_total, drugclass_1, srs_tscore_self, ssp_hype, drugclass_2, srs_tscore, ssp_lowenergy, sdq_externalising_p, ssp_prorated, age, srs_tscore_combined, srs_rawscore_self, sdq_prosocial_p, sa_css, sdq_peer_p, sdq_total_difficulties_p, ageyrs, iqwechrawscores_mr_raw, sdq_impact_p, ssp_underrespo_mod, ssp_visualaudisens, rrb_css, med_use, ssp_taste, sdq_conduct_p, iqwechrawscores_pc_t, vabsdscoress_dss, srs_rawscore, iqwechrawscores_bd_raw, iqwechrawscores_vp_raw, ssp_underresp, srs_rawscore_combined, adhd_inattentiv_parent, sdq_hyperactivity_p, schedule_enrol, ssp_move, iqwechrawscores_vp_t, sex"
1,W2-W3,13,"ssp_underresp, css_total, ageyrs, ssp_audfilt, ssp_lowenergy, ssp_tactile, rbs_total, site, ssp_total, ssp_move, ssp_taste, sex, absence"
2,W1-W3,13,"ssp_underresp, css_total, ageyrs, ssp_audfilt, ssp_lowenergy, ssp_tactile, rbs_total, site, diagnosis, ssp_total, ssp_move, ssp_taste, sex"
3,W1-W2-W3,12,"ssp_underresp, css_total, ageyrs, ssp_audfilt, ssp_lowenergy, ssp_tactile, rbs_total, site, ssp_total, ssp_move, ssp_taste, sex"
4,Unique W1,50,"csbq_total_score, handedness, beck_dep_adultd_parent, beck_dep_youthcd, sa_css_all, aq_child_total, aq_adol_total, asbq_total_adult_parent, dawba_ext, nat_speaker, schedule_adj, group, fsiq, beck_anx_adultd_parent, aq_adult_total, piq, adhd_cat_parent, beck_dep_youthb_self_tscore, adi_rrb_total, asd_thresh, css_total_all, dawba_adhd, dawba_int, adi_communication_total, dawba_anx, ethnicity, adhd_inattentiv_self, beck_dep_adulta_self, asbq_total_adult_self, beck_anx_youthb_self_tscore, adi_social_total, css_total_imputed, beck_anx_youthb_self, viq, education_moth, adhd_hyperimpul_self, sa_css_imputed, beck_anx_youthcd_parent_t, rrb_css_all, rrb_css_imputed, dawba_behavdis, beck_anx_youthcd_parent, adhd_cat_combined, handedness_score, education_fath, beck_anx_adulta_self, adhd_cat_self, dawba_dep, beck_dep_youthb, beck_dep_youthcd_parent_t"
5,Unique W2,10,"cri_prorated, adhd_prorated, seq_hyperep, cri_ris, seq_hyposirs, cri_mbc, seq_total, drugclass_4, cri_total, drugclass_5"
6,Unique W3,31,"ssp_visualaudisense, ssp_move_prorated, rbs_sterotyped, ssp_lowenergy_prorated, srs_tscore_rrb, rbs_sameness, ssp_total_prorated, ssp_visualaudisense_prorated, gender, srs_rawscore_total_self, rbs_compulsivity, srs_rawscore_total, rbs_selfinjury, srs_rawscore_sci_self, ssp_audfilt_prorated, srs_tscore_total_self, ssp_tactile_prorated, srs_tscore_rrb_self, srs_rawscore_rrb_self, ssp_underresp_prorated, ados_module, rbs_rb, rbs_ritual, ssp_taste_prorated, srs_rawscore_sci, srs_tscore_total, css_rrb, srs_rawscore_rrb, css_sa, srs_tscore_sci, srs_tscore_sci_self"
