In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `vars` helper module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
import matplotlib.pyplot as plt
import seaborn as sns
import vars

In [4]:
# Load the cleaned panel CSV; its first column is used as the row index.
data = pd.read_csv(
    "./CLEAN_ABCD_5.1_panel_20240917.csv",
    index_col=0,
    low_memory=False,
)
# `sample` is a second name for the same DataFrame object (no copy is made),
# so any column later added through `sample` is also visible through `data`.
sample = data

In [5]:
# Combine the two sex-specific puberty columns into a single `puberty_k` column:
# use `female_puberty` where it is present, otherwise fall back to `male_puberty`.
# This is the vectorized, position-wise equivalent of a row-wise
# `apply(lambda row: ..., axis=1)` and avoids one Python call per row.
sample['puberty_k'] = np.where(
    sample['female_puberty'].notna(),
    sample['female_puberty'],
    sample['male_puberty'],
)

# Time point analysed by the model cells below (matched against the `time` column).
tp = 2

In [None]:
# TODO: decide on the cross-validation scheme (the models below currently use cv=15 in ElasticNetCV)

# Within Categories (Full Sample)

In [6]:

# Fit one cross-validated LASSO per predictor category on the full sample.
# `vars.within_categories` is iterated as (category name, list of predictor columns) pairs.
for name, predictors in vars.within_categories:
  all_predictors = ["depress_D_p", "time"] + predictors
  all_x = sample[all_predictors]

  # Filter the data for the current time point (tp)
  t = all_x[all_x['time'] == tp]
  # Keep only rows with an observed outcome. Without this, the mean imputation
  # below would also fill in missing `depress_D_p` values, so the model would be
  # trained on targets that were never measured.
  t = t.dropna(subset=["depress_D_p"])
  t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

  # Impute the remaining missing (predictor) values using the column mean
  imputer = SimpleImputer(strategy='mean')
  imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

  # Standardize every column (outcome included)
  scaler = StandardScaler()
  imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

  # Prepare X and y for the model; `time` is constant after the filter above
  X = imputed_scaled.drop(columns=["depress_D_p", "time"])
  y = imputed_scaled["depress_D_p"]

  # Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
  # the penalty strength is selected by 15-fold cross-validation.
  # NOTE(review): the imputer and scaler are fit on all rows before the CV split,
  # so their statistics are shared between training and validation folds.
  model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
  model.fit(X, y)

  # Hand the fitted model and its data to the project helper
  # (presumably renders/saves a plot labelled with `name` and `tp` — confirm in vars.py).
  vars.save_plot(model, name, X, y, tp)

# Across Categories (Full Sample)

In [7]:
# Cross-validated LASSO of the depression score on the combined
# `vars.across_categories` predictor list, using the full sample.
all_predictors = ["depress_D_p", "time"] + vars.across_categories
all_x = sample[all_predictors]

# Filter the data for the current time point (tp)
t = all_x[all_x['time'] == tp]
# Keep only rows with an observed outcome. Without this, the mean imputation
# below would also fill in missing `depress_D_p` values, so the model would be
# trained on targets that were never measured.
t = t.dropna(subset=["depress_D_p"])
t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

# Impute the remaining missing (predictor) values using the column mean
imputer = SimpleImputer(strategy='mean')
imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

# Standardize every column (outcome included)
scaler = StandardScaler()
imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

# Prepare X and y for the model; `time` is constant after the filter above
X = imputed_scaled.drop(columns=["depress_D_p", "time"])
y = imputed_scaled["depress_D_p"]

# Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
# the penalty strength is selected by 15-fold cross-validation.
# NOTE(review): the imputer and scaler are fit on all rows before the CV split,
# so their statistics are shared between training and validation folds.
model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
model.fit(X, y)

# Hand the fitted model and its data to the project helper
# (presumably renders/saves a plot labelled with the given name and `tp` — confirm in vars.py).
vars.save_plot(model, "across categories", X, y, tp)

# Low ALE

In [20]:
# Restrict the panel to the rows selected by the `low_ale_children_p` column,
# which is used directly as a boolean row mask, then report how many distinct
# subjects remain.
low_ale_sample = sample.loc[sample['low_ale_children_p']]
print(f"sample size: {low_ale_sample['subject'].nunique()}")

sample size: 2075


In [21]:
# within categories
# Fit one cross-validated LASSO per predictor category on the low-ALE subsample.
# `vars.within_categories` is iterated as (category name, list of predictor columns) pairs.

for name, predictors in vars.within_categories:
  all_predictors = ["depress_D_p", "time"] + predictors
  all_x = low_ale_sample[all_predictors]

  # Filter the data for the current time point (tp)
  t = all_x[all_x['time'] == tp]
  # Keep only rows with an observed outcome. Without this, the mean imputation
  # below would also fill in missing `depress_D_p` values, so the model would be
  # trained on targets that were never measured.
  t = t.dropna(subset=["depress_D_p"])
  t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

  # Impute the remaining missing (predictor) values using the column mean
  imputer = SimpleImputer(strategy='mean')
  imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

  # Standardize every column (outcome included)
  scaler = StandardScaler()
  imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

  # Prepare X and y for the model; `time` is constant after the filter above
  X = imputed_scaled.drop(columns=["depress_D_p", "time"])
  y = imputed_scaled["depress_D_p"]

  # Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
  # the penalty strength is selected by 15-fold cross-validation.
  # NOTE(review): the imputer and scaler are fit on all rows before the CV split,
  # so their statistics are shared between training and validation folds.
  model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
  model.fit(X, y)

  # Hand the fitted model and its data to the project helper
  # (presumably renders/saves a plot labelled with the given name and `tp` — confirm in vars.py).
  vars.save_plot(model, "low ale " + name, X, y, tp)

In [22]:
# across
# Cross-validated LASSO of the depression score on the combined
# `vars.across_categories` predictor list, using the low-ALE subsample.

all_predictors = ["depress_D_p", "time"] + vars.across_categories
all_x = low_ale_sample[all_predictors]

# Filter the data for the current time point (tp)
t = all_x[all_x['time'] == tp]
# Keep only rows with an observed outcome. Without this, the mean imputation
# below would also fill in missing `depress_D_p` values, so the model would be
# trained on targets that were never measured.
t = t.dropna(subset=["depress_D_p"])
t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

# Impute the remaining missing (predictor) values using the column mean
imputer = SimpleImputer(strategy='mean')
imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

# Standardize every column (outcome included)
scaler = StandardScaler()
imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

# Prepare X and y for the model; `time` is constant after the filter above
X = imputed_scaled.drop(columns=["depress_D_p", "time"])
y = imputed_scaled["depress_D_p"]

# Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
# the penalty strength is selected by 15-fold cross-validation.
# NOTE(review): the imputer and scaler are fit on all rows before the CV split,
# so their statistics are shared between training and validation folds.
model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
model.fit(X, y)

# Hand the fitted model and its data to the project helper
# (presumably renders/saves a plot labelled with the given name and `tp` — confirm in vars.py).
vars.save_plot(model, "low ale across categories", X, y, tp)

# High ALE

In [23]:
# Restrict the panel to the rows selected by the `high_ale` column, which is
# used directly as a boolean row mask, then report how many distinct subjects
# remain.
high_ale_sample = sample.loc[sample['high_ale']]
print(f"sample size: {high_ale_sample['subject'].nunique()}")

sample size: 2882


In [24]:
# within categories
# Fit one cross-validated LASSO per predictor category on the high-ALE subsample.
# `vars.within_categories` is iterated as (category name, list of predictor columns) pairs.

for name, predictors in vars.within_categories:
  all_predictors = ["depress_D_p", "time"] + predictors
  all_x = high_ale_sample[all_predictors]

  # Filter the data for the current time point (tp)
  t = all_x[all_x['time'] == tp]
  # Keep only rows with an observed outcome. Without this, the mean imputation
  # below would also fill in missing `depress_D_p` values, so the model would be
  # trained on targets that were never measured.
  t = t.dropna(subset=["depress_D_p"])
  t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

  # Impute the remaining missing (predictor) values using the column mean
  imputer = SimpleImputer(strategy='mean')
  imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

  # Standardize every column (outcome included)
  scaler = StandardScaler()
  imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

  # Prepare X and y for the model; `time` is constant after the filter above
  X = imputed_scaled.drop(columns=["depress_D_p", "time"])
  y = imputed_scaled["depress_D_p"]

  # Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
  # the penalty strength is selected by 15-fold cross-validation.
  # NOTE(review): the imputer and scaler are fit on all rows before the CV split,
  # so their statistics are shared between training and validation folds.
  model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
  model.fit(X, y)

  # Hand the fitted model and its data to the project helper
  # (presumably renders/saves a plot labelled with the given name and `tp` — confirm in vars.py).
  vars.save_plot(model, "high ale " + name, X, y, tp)

In [25]:
# across
# Cross-validated LASSO of the depression score on the combined
# `vars.across_categories` predictor list, using the high-ALE subsample.

all_predictors = ["depress_D_p", "time"] + vars.across_categories
all_x = high_ale_sample[all_predictors]

# Filter the data for the current time point (tp)
t = all_x[all_x['time'] == tp]
# Keep only rows with an observed outcome. Without this, the mean imputation
# below would also fill in missing `depress_D_p` values, so the model would be
# trained on targets that were never measured.
t = t.dropna(subset=["depress_D_p"])
t_available = t.dropna(axis=1, how='all')  # Keep columns with at least some non-NaN values

# Impute the remaining missing (predictor) values using the column mean
imputer = SimpleImputer(strategy='mean')
imputed = pd.DataFrame(imputer.fit_transform(t_available), columns=t_available.columns)

# Standardize every column (outcome included)
scaler = StandardScaler()
imputed_scaled = pd.DataFrame(scaler.fit_transform(imputed), columns=imputed.columns)

# Prepare X and y for the model; `time` is constant after the filter above
X = imputed_scaled.drop(columns=["depress_D_p", "time"])
y = imputed_scaled["depress_D_p"]

# Fit LASSO model (ElasticNet with l1_ratio=1 corresponds to LASSO);
# the penalty strength is selected by 15-fold cross-validation.
# NOTE(review): the imputer and scaler are fit on all rows before the CV split,
# so their statistics are shared between training and validation folds.
model = ElasticNetCV(l1_ratio=1, cv=15, random_state=0)
model.fit(X, y)

# Hand the fitted model and its data to the project helper
# (presumably renders/saves a plot labelled with the given name and `tp` — confirm in vars.py).
vars.save_plot(model, "high ale across categories", X, y, tp)