In [1]:
import pandas as pd

df = pd.read_csv('../data/mh_y_ksads_dep.csv')

In [2]:
ksads = df[df['eventname'] == '2_year_follow_up_y_arm_1']
unique_subject_ids = ksads['src_subject_id'].unique()
print(f"Number of unique subject IDs: {len(unique_subject_ids)}")

Number of unique subject IDs: 10814


In [None]:
print(f"Dataset shape: {ksads.shape}")
print(f"Total columns: {len(ksads.columns)}")

ksads_columns = [col for col in ksads.columns if col.startswith('ksads_')]
print(f"Number of KSADS columns: {len(ksads_columns)}")

# Core depression indicators (from screening items)
core_symptoms = ['ksads_dp_raw_95_t', 'ksads_dp_raw_98_t', 'ksads_dp_raw_101_t']

# First, let's check data availability for core symptoms
print(f"\nCore symptoms data availability:")
for item in core_symptoms:
    if item in ksads.columns:
        missing_pct = (ksads[item].isnull().sum() / len(ksads)) * 100
        valid_count = ksads[item].notna().sum()
        print(f"{item}: {valid_count} valid ({100-missing_pct:.1f}% complete)")

        # Show response distribution
        if valid_count > 0:
            responses = ksads[item].value_counts().sort_index()
            print(f"   Response distribution: {dict(responses)}")
    else:
        print(f"{item}: Not found in dataset")

# Fix: Create outcomes from the filtered ksads dataset, not the full df
print(f"\nCreating depression outcomes from filtered dataset...")

# Binary outcome: Any clinically significant depression symptoms (≥2 = subthreshold or above)
depression_outcome = (ksads[core_symptoms] >= 2).any(axis=1)

# Severity outcome: Total depression severity
depression_severity = ksads[core_symptoms].sum(axis=1)

# Check how many subjects have valid outcome data
has_any_core_data = ksads[core_symptoms].notna().any(axis=1)
print(f"Subjects with any core symptom data: {has_any_core_data.sum()}")

# Depression outcome statistics
depression_count = depression_outcome.sum()
total_with_data = has_any_core_data.sum()
print(f"\nDepression outcomes:")
print(f"Subjects with depression (≥2 on any core symptom): {depression_count}/{total_with_data} ({depression_count/total_with_data*100:.1f}%)")

print(f"\nDepression severity distribution:")
severity_dist = depression_severity[has_any_core_data].value_counts().sort_index()
for score, count in severity_dist.items():
    pct = (count / total_with_data) * 100
    print(f"   Score {score}: {count} subjects ({pct:.1f}%)")

Dataset shape: (10814, 93)
Total columns: 93
Number of KSADS columns: 91

Core symptoms data availability:
ksads_dp_raw_95_t: 10814 valid (100.0% complete)
   Response distribution: {0: np.int64(7281), 1: np.int64(2788), 2: np.int64(514), 3: np.int64(134), 4: np.int64(97)}
ksads_dp_raw_98_t: 10814 valid (100.0% complete)
   Response distribution: {0: np.int64(6178), 1: np.int64(3715), 2: np.int64(710), 3: np.int64(115), 4: np.int64(96)}
ksads_dp_raw_101_t: 10814 valid (100.0% complete)
   Response distribution: {0: np.int64(5253), 1: np.int64(3900), 2: np.int64(1214), 3: np.int64(247), 4: np.int64(200)}

Creating depression outcomes from filtered dataset...
Subjects with any core symptom data: 10814

Depression outcomes:
Subjects with depression (≥2 on any core symptom): 2352/10814 (21.7%)

Depression severity distribution:
   Score 0: 3494 subjects (32.3%)
   Score 1: 2527 subjects (23.4%)
   Score 2: 1962 subjects (18.1%)
   Score 3: 1280 subjects (11.8%)
   Score 4: 707 subjects (6.

In [4]:
# Create final clean dataset for ML
print("Creating final ML-ready dataset...")

# Add depression outcomes to your main dataset
ksads_ml = ksads.copy()
ksads_ml['depression_binary'] = depression_outcome
ksads_ml['depression_severity'] = depression_severity

# Keep essential columns for merging with sleep data
essential_cols = [
    'src_subject_id',           # For merging with sleep data                  # Demographics
    'depression_binary',        # Your main ML target
    'depression_severity'       # Alternative target
]

# Create clean ML dataset
ml_ready = ksads_ml[essential_cols].copy()

# Convert age from months to years for interpretability
if 'interview_age' in ml_ready.columns:
    ml_ready['age_years'] = ml_ready['interview_age'] / 12

print(f"Final ML dataset shape: {ml_ready.shape}")
print(f"Depression outcome: {ml_ready['depression_binary'].sum()} positive cases ({ml_ready['depression_binary'].mean()*100:.1f}%)")

# Quick summary
print(f"\nDataset ready for sleep data merge:")
print(f"- Unique subjects: {ml_ready['src_subject_id'].nunique()}")
print(f"- No missing depression outcomes: ✓")
print(f"- Balanced classes: 78.3% vs 21.7% ✓")

Creating final ML-ready dataset...
Final ML dataset shape: (10814, 3)
Depression outcome: 2352 positive cases (21.7%)

Dataset ready for sleep data merge:
- Unique subjects: 10814
- No missing depression outcomes: ✓
- Balanced classes: 78.3% vs 21.7% ✓


In [5]:
import pandas as pd

df = pd.read_csv('../data/nt_y_fitb_slp_d.csv')
sleep = df[df['eventname'] == '2_year_follow_up_y_arm_1']

print(f"Dataset shape: {sleep.shape}")
print(f"Number of unique subjects: {sleep['src_subject_id'].nunique()}")

# Focus on key sleep metrics you want to use
key_sleep_vars = [
    'src_subject_id',           # For merging
    'fit_ss_sleepperiod_minutes',  # Total sleep period
    'fit_ss_wake_minutes',         # Wake time during sleep
    'fit_ss_light_minutes',        # Light sleep
    'fit_ss_deep_minutes',         # Deep sleep
    'fit_ss_rem_minutes',          # REM sleep
    'fit_ss_wake_count'            # Number of awakenings
]

# Check data availability for key sleep variables
print(f"\nKey sleep variables data availability:")
for var in key_sleep_vars:
    if var in sleep.columns:
        missing_count = sleep[var].isnull().sum()
        missing_pct = (missing_count / len(sleep)) * 100
        valid_count = len(sleep) - missing_count
        print(f"{var}: {valid_count} valid ({100-missing_pct:.1f}% complete)")
    else:
        print(f"{var}: Not found in dataset")

# Look at subjects with complete sleep data
sleep_subset = sleep[key_sleep_vars].copy()
complete_sleep_data = sleep_subset.dropna()
print(f"\nSubjects with complete key sleep data: {len(complete_sleep_data)}")

# Basic statistics for key sleep metrics
if len(complete_sleep_data) > 0:
    print(f"\nSleep metrics summary (complete cases only):")
    sleep_metrics = ['fit_ss_sleepperiod_minutes', 'fit_ss_wake_minutes',
                    'fit_ss_light_minutes', 'fit_ss_deep_minutes', 'fit_ss_rem_minutes']

    for metric in sleep_metrics:
        if metric in complete_sleep_data.columns:
            values = complete_sleep_data[metric]
            print(f"{metric}:")
            print(f"  Mean: {values.mean():.1f} min, Std: {values.std():.1f} min")
            print(f"  Range: {values.min():.1f} - {values.max():.1f} min")

Dataset shape: (83194, 22)
Number of unique subjects: 6078

Key sleep variables data availability:
src_subject_id: 83194 valid (100.0% complete)
fit_ss_sleepperiod_minutes: 83194 valid (100.0% complete)
fit_ss_wake_minutes: 83194 valid (100.0% complete)
fit_ss_light_minutes: 83194 valid (100.0% complete)
fit_ss_deep_minutes: 83194 valid (100.0% complete)
fit_ss_rem_minutes: 83194 valid (100.0% complete)
fit_ss_wake_count: 83194 valid (100.0% complete)

Subjects with complete key sleep data: 83194

Sleep metrics summary (complete cases only):
fit_ss_sleepperiod_minutes:
  Mean: 449.1 min, Std: 82.6 min
  Range: 0.5 - 4018.0 min
fit_ss_wake_minutes:
  Mean: 57.7 min, Std: 19.0 min
  Range: 0.0 - 701.5 min
fit_ss_light_minutes:
  Mean: 261.1 min, Std: 58.6 min
  Range: 0.5 - 2657.5 min
fit_ss_deep_minutes:
  Mean: 89.8 min, Std: 27.0 min
  Range: 0.0 - 572.0 min
fit_ss_rem_minutes:
  Mean: 98.3 min, Std: 32.4 min
  Range: 0.0 - 1025.5 min


In [6]:
print("Creating subject-level sleep features...")

# Group by subject and create aggregated sleep features
sleep_features = sleep.groupby('src_subject_id').agg({
    # Basic sleep metrics - means
    'fit_ss_sleepperiod_minutes': ['mean', 'std', 'count'],
    'fit_ss_wake_minutes': ['mean', 'std'],
    'fit_ss_light_minutes': ['mean', 'std'],
    'fit_ss_deep_minutes': ['mean', 'std'],
    'fit_ss_rem_minutes': ['mean', 'std'],
    'fit_ss_wake_count': ['mean', 'std']
}).round(2)

# Flatten column names
sleep_features.columns = [f"{col[0]}_{col[1]}" for col in sleep_features.columns]

# Reset index to make src_subject_id a column
sleep_features = sleep_features.reset_index()

print(f"Sleep features dataset shape: {sleep_features.shape}")
print(f"Unique subjects with sleep data: {len(sleep_features)}")

# Create some derived features
sleep_features['total_sleep_mean'] = (sleep_features['fit_ss_light_minutes_mean'] +
                                     sleep_features['fit_ss_deep_minutes_mean'] +
                                     sleep_features['fit_ss_rem_minutes_mean'])

sleep_features['sleep_efficiency_mean'] = (sleep_features['total_sleep_mean'] /
                                          sleep_features['fit_ss_sleepperiod_minutes_mean'] * 100)

sleep_features['deep_sleep_pct'] = (sleep_features['fit_ss_deep_minutes_mean'] /
                                   sleep_features['total_sleep_mean'] * 100)

sleep_features['rem_sleep_pct'] = (sleep_features['fit_ss_rem_minutes_mean'] /
                                  sleep_features['total_sleep_mean'] * 100)

print(f"\nKey derived features:")
print(f"Average total sleep: {sleep_features['total_sleep_mean'].mean():.1f} min")
print(f"Average sleep efficiency: {sleep_features['sleep_efficiency_mean'].mean():.1f}%")
print(f"Average deep sleep %: {sleep_features['deep_sleep_pct'].mean():.1f}%")
print(f"Average REM sleep %: {sleep_features['rem_sleep_pct'].mean():.1f}%")

# Show first few subjects
print(f"\nFirst 3 subjects' sleep features:")
display_cols = ['src_subject_id', 'fit_ss_sleepperiod_minutes_count', 'total_sleep_mean',
                'sleep_efficiency_mean', 'fit_ss_wake_minutes_mean']
print(sleep_features[display_cols].head(3))

Creating subject-level sleep features...
Sleep features dataset shape: (6078, 14)
Unique subjects with sleep data: 6078

Key derived features:
Average total sleep: 443.7 min
Average sleep efficiency: 100.0%
Average deep sleep %: 20.0%
Average REM sleep %: 21.8%

First 3 subjects' sleep features:
     src_subject_id  fit_ss_sleepperiod_minutes_count  total_sleep_mean  \
0  NDAR_INV00HEV6HB                                14            450.47   
1  NDAR_INV00LH735Y                                 3            487.33   
2  NDAR_INV00LJVZK2                                 3            472.99   

   sleep_efficiency_mean  fit_ss_wake_minutes_mean  
0             100.002220                     66.32  
1             100.000000                     80.17  
2              99.997886                     59.67  


In [9]:
# Merge depression outcomes with sleep features
print("Merging depression and sleep data...")

# Your depression data (from earlier)
depression_data = ksads_ml[['src_subject_id', 'depression_binary', 'depression_severity']].copy()

# Merge with sleep features
ml_dataset = depression_data.merge(sleep_features, on='src_subject_id', how='inner')

print(f"After merging:")
print(f"  Depression data: {len(depression_data)} subjects")
print(f"  Sleep data: {len(sleep_features)} subjects")
print(f"  Merged dataset: {len(ml_dataset)} subjects")
print(f"  Depression rate in merged data: {ml_dataset['depression_binary'].mean()*100:.1f}%")

Merging depression and sleep data...
After merging:
  Depression data: 10814 subjects
  Sleep data: 6078 subjects
  Merged dataset: 6010 subjects
  Depression rate in merged data: 19.5%


In [None]:
# Select key sleep features for your model
sleep_feature_cols = [
    'total_sleep_mean',
    'sleep_efficiency_mean',
    'fit_ss_wake_minutes_mean',
    'fit_ss_wake_minutes_std',
    'fit_ss_deep_minutes_mean',
    'fit_ss_rem_minutes_mean',
    'deep_sleep_pct',
    'rem_sleep_pct',
    'fit_ss_wake_count_mean',
    'fit_ss_sleepperiod_minutes_std'  # Sleep variability
]

# Check for any missing values
print("Missing values in sleep features:")
for col in sleep_feature_cols:
    if col in ml_dataset.columns:
        missing = ml_dataset[col].isnull().sum()
        print(f"  {col}: {missing} missing")

# Create final feature matrix
X = ml_dataset[sleep_feature_cols].copy()
y = ml_dataset['depression_binary'].copy()

print(f"\nFinal ML dataset:")
print(f"  Features (X): {X.shape}")
print(f"  Target (y): {y.shape}")
print(f"  Depression rate: {y.mean()*100:.1f}%")

Missing values in sleep features:
  total_sleep_mean: 0 missing
  sleep_efficiency_mean: 0 missing
  fit_ss_wake_minutes_mean: 0 missing
  fit_ss_wake_minutes_std: 209 missing
  fit_ss_deep_minutes_mean: 0 missing
  fit_ss_rem_minutes_mean: 0 missing
  deep_sleep_pct: 0 missing
  rem_sleep_pct: 0 missing
  fit_ss_wake_count_mean: 0 missing
  fit_ss_sleepperiod_minutes_std: 209 missing

Final ML dataset:
  Features (X): (6010, 10)
  Target (y): (6010,)
  Depression rate: 19.5%
