In [32]:
from sklearn.linear_model import Ridge
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Ensemble model

Set data input folder

In [33]:
# Root folder for every file this notebook reads or writes (relative to the notebook location)
data_input_folder = "../data"

Import actual labels

In [34]:
# Ground-truth labels, ordered by participant id (the same key the
# training predictions are sorted on).
y_train = pd.read_excel(f"{data_input_folder}/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")
y_train = y_train.sort_values(by='participant_id')

# Independent 1-D label arrays, one per prediction target
y_train_sex = y_train['Sex_F'].to_numpy(copy=True)
y_train_adhd = y_train['ADHD_Outcome'].to_numpy(copy=True)

Import prediction probabilities on the training data (with k-fold cross-validation)

In [35]:
# Read in probabilities of predictions on the training data
catboost_pred_train = pd.read_csv(f"{data_input_folder}/Interim/CatBoost_train_pred_prob.csv")
lgbm_pred_train     = pd.read_csv(f"{data_input_folder}/Interim/LGBM_train_pred_prob.csv")
gnn_pred_train_sex  = pd.read_csv(f"{data_input_folder}/Interim/gnn_train_pred_prob_sex.csv")
gnn_pred_train_adhd = pd.read_csv(f"{data_input_folder}/Interim/gnn_train_pred_prob_adhd.csv")
cnn_pred_train_sex  = pd.read_csv(f"{data_input_folder}/Interim/cnn_train_pred_prob_sex.csv")
cnn_pred_train_adhd = pd.read_csv(f"{data_input_folder}/Interim/cnn_train_pred_prob_adhd.csv")

# Sort by participant id AND renumber the rows 0..n-1 (ignore_index=True).
# Without renumbering, each frame keeps its original row labels; pandas aligns
# on those labels when columns from different frames are combined, which would
# pair rows by their position in the CSV files instead of by participant.
catboost_pred_train = catboost_pred_train.sort_values(by='participant_id', ignore_index=True)
lgbm_pred_train     = lgbm_pred_train.sort_values(by='participant_id', ignore_index=True)
gnn_pred_train_sex  = gnn_pred_train_sex.sort_values(by='participant_id', ignore_index=True)
gnn_pred_train_adhd = gnn_pred_train_adhd.sort_values(by='participant_id', ignore_index=True)
cnn_pred_train_sex  = cnn_pred_train_sex.sort_values(by='participant_id', ignore_index=True)
cnn_pred_train_adhd = cnn_pred_train_adhd.sort_values(by='participant_id', ignore_index=True)

Import prediction probabilities on the test data

In [36]:
# Read test predictions (matches structure of train predictions)
catboost_test = pd.read_csv(f"{data_input_folder}/Interim/CatBoost_test_pred_prob.csv")
lgbm_test     = pd.read_csv(f"{data_input_folder}/Interim/LGBM_test_pred_prob.csv")
gnn_test_sex  = pd.read_csv(f"{data_input_folder}/Interim/gnn_test_pred_prob_sex.csv")
cnn_test_sex  = pd.read_csv(f"{data_input_folder}/Interim/cnn_test_pred_prob_sex.csv")
gnn_test_adhd = pd.read_csv(f"{data_input_folder}/Interim/gnn_test_pred_prob_adhd.csv")
cnn_test_adhd = pd.read_csv(f"{data_input_folder}/Interim/cnn_test_pred_prob_adhd.csv")

# Sort by participant id and renumber rows 0..n-1, as is done for the training
# predictions. The test frames are later combined row by row, so they must all
# share one participant order.
catboost_test = catboost_test.sort_values(by='participant_id', ignore_index=True)
lgbm_test     = lgbm_test.sort_values(by='participant_id', ignore_index=True)
gnn_test_sex  = gnn_test_sex.sort_values(by='participant_id', ignore_index=True)
cnn_test_sex  = cnn_test_sex.sort_values(by='participant_id', ignore_index=True)
gnn_test_adhd = gnn_test_adhd.sort_values(by='participant_id', ignore_index=True)
cnn_test_adhd = cnn_test_adhd.sort_values(by='participant_id', ignore_index=True)

Include probabilities and predictions in dataframes

In [37]:
def _collect_by_participant(sources, participant_ids):
    """Gather one column per base model into a frame whose rows follow ``participant_ids``.

    Parameters
    ----------
    sources : dict
        Maps a model name to a ``(frame, column)`` pair; every ``frame`` must
        contain a ``participant_id`` column.
    participant_ids : array-like
        Participant ids in the desired row order.

    Returns
    -------
    pandas.DataFrame
        One column per model (in ``sources`` order), indexed by ``participant_id``.

    Raises
    ------
    ValueError
        If a frame has duplicate participant ids, or its set of ids differs
        from ``participant_ids``.
    """
    row_order = pd.Index(participant_ids, name='participant_id')
    collected = {}
    for model_name, (frame, column) in sources.items():
        values = frame.set_index('participant_id')[column]
        if values.index.has_duplicates:
            raise ValueError(f"{model_name}: duplicate participant_id values in '{column}' source")
        if len(values) != len(row_order) or not values.index.isin(row_order).all():
            raise ValueError(f"{model_name}: participant ids do not match the label file")
        # Align on the participant id itself, never on leftover row labels
        collected[model_name] = values.reindex(row_order)
    return pd.DataFrame(collected, index=row_order)


# Row order of the label arrays (y_train is sorted by participant_id)
train_ids = y_train['participant_id']

# Sex
y_probs_sex = _collect_by_participant({
    'catboost': (catboost_pred_train, 'prob_sex'),
    'lgbm':     (lgbm_pred_train, 'prob_sex'),
    'gnn':      (gnn_pred_train_sex, 'probability'),
    'cnn':      (cnn_pred_train_sex, 'probability'),
}, train_ids)

y_preds_sex = _collect_by_participant({
    'catboost': (catboost_pred_train, 'pred_sex'),
    'lgbm':     (lgbm_pred_train, 'pred_sex'),
    'gnn':      (gnn_pred_train_sex, 'predicted_label'),
    'cnn':      (cnn_pred_train_sex, 'predicted_label'),
}, train_ids)


# ADHD
y_probs_adhd = _collect_by_participant({
    'catboost': (catboost_pred_train, 'prob_adhd'),
    'lgbm':     (lgbm_pred_train, 'prob_adhd'),
    'gnn':      (gnn_pred_train_adhd, 'probability'),
    'cnn':      (cnn_pred_train_adhd, 'probability'),
}, train_ids)

y_preds_adhd = _collect_by_participant({
    'catboost': (catboost_pred_train, 'pred_adhd'),
    'lgbm':     (lgbm_pred_train, 'pred_adhd'),
    'gnn':      (gnn_pred_train_adhd, 'predicted_label'),
    'cnn':      (cnn_pred_train_adhd, 'predicted_label'),
}, train_ids)


Convert DataFrame to NumPy array (optional, Ridge handles DataFrames too)

In [38]:
# Meta-feature matrices for Sex and ADHD; columns are the base models
# (catboost, lgbm, gnn, cnn) in the order of the probability frames.
X_meta_sex, X_meta_adhd = y_probs_sex.to_numpy(), y_probs_adhd.to_numpy()

Fit Ridge ensemble model (positive weights only to avoid negative contributions)

In [39]:
# Sex: Ridge meta-learner constrained to non-negative base-model weights
ridge_sex = Ridge(positive=True)
ridge_sex.fit(X_meta_sex, y_train_sex)

# ADHD: same setup, fitted on the ADHD probabilities and labels
ridge_adhd = Ridge(positive=True)
ridge_adhd.fit(X_meta_adhd, y_train_adhd)

Get ensemble predictions (continuous Ridge scores, treated as probabilities)

In [40]:
# Score the same meta-features each Ridge model was fitted on (Sex first, then ADHD)
y_meta_prob_sex, y_meta_prob_adhd = (
    meta_model.predict(meta_features)
    for meta_model, meta_features in ((ridge_sex, X_meta_sex), (ridge_adhd, X_meta_adhd))
)

Get predictions from probabilities

In [41]:
# Decision cut-off: scores strictly above it become class 1, everything else class 0
threshold = 0.5

# Sex
y_meta_pred_binary_sex = np.greater(y_meta_prob_sex, threshold).astype(int)

# ADHD
y_meta_pred_binary_adhd = np.greater(y_meta_prob_adhd, threshold).astype(int)


Evaluate F1 score on the training data

In [42]:
# NOTE: the ensemble F1 below is computed on the same rows the Ridge models were
# fitted on, so it is an in-sample figure for the meta-learner.

# Sex
f1_catboost_sex, f1_lgbm_sex, f1_gnn_sex, f1_cnn_sex = (
    f1_score(y_train_sex, y_preds_sex[model_name].values)
    for model_name in ('catboost', 'lgbm', 'gnn', 'cnn')
)
print(
    "F1 scores of the original Catboost, LGBM, GNN and CNN Sex models: "
    f"{f1_catboost_sex}, {f1_lgbm_sex}, {f1_gnn_sex} and {f1_cnn_sex}"
)
f1_ensemble_sex = f1_score(y_train_sex, y_meta_pred_binary_sex)
print("F1 Score of ensemble Sex prediction model (on training data):", f1_ensemble_sex)
print("Ridge Sex Coefficients:", ridge_sex.coef_)

# ADHD
f1_catboost_adhd, f1_lgbm_adhd, f1_gnn_adhd, f1_cnn_adhd = (
    f1_score(y_train_adhd, y_preds_adhd[model_name].values)
    for model_name in ('catboost', 'lgbm', 'gnn', 'cnn')
)
print(
    "F1 scores of the original Catboost, LGBM, GNN and CNN ADHD models: "
    f"{f1_catboost_adhd}, {f1_lgbm_adhd}, {f1_gnn_adhd} and {f1_cnn_adhd}"
)
f1_ensemble_adhd = f1_score(y_train_adhd, y_meta_pred_binary_adhd)
print("F1 Score of ensemble ADHD prediction model (on training data):", f1_ensemble_adhd)
print("Ridge ADHD Coefficients:", ridge_adhd.coef_)

F1 scores of the original Catboost, LGBM, GNN and CNN models:  0.6061224489795919 ,  0.7822410147991543  0.8005115089514067  and  0.9207100591715977
F1 Score of ensemble Sex prediction model (on training data): 0.9296116504854369
Ridge Coefficients: [6.57448739e-04 4.54829851e-01 2.85436601e-01 6.77777004e-01]
F1 scores of the original Catboost, LGBM, GNN and CNN models:  0.975609756097561 ,  0.9057928613224108 ,  0.8688801350590883 and  0.9473081328751431
F1 Score of ensemble ADHD prediction model (on training data): 0.9768270944741533
Ridge ADHD Coefficients: [0.75551886 0.02064241 0.17907582 0.28510844]


Prediction for the test data

In [43]:
# The submission takes its ids from catboost_test, so every feature column is
# aligned to that frame's participant order before stacking.
_test_ids = catboost_test[['participant_id']]


def _aligned_test_column(frame, column):
    """Return ``frame[column]`` as an array ordered like ``catboost_test``'s participants.

    Parameters
    ----------
    frame : pandas.DataFrame
        Test prediction frame with a ``participant_id`` column.
    column : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        Values of ``column``, one per row of ``catboost_test``, in its row order.

    Raises
    ------
    ValueError
        If ``frame`` has a different number of rows, lacks a participant that is
        in ``catboost_test``, or either side has duplicate participant ids
        (``pandas.errors.MergeError``, a ``ValueError`` subclass).
    """
    if len(frame) != len(_test_ids):
        raise ValueError(f"'{column}' source has {len(frame)} rows, expected {len(_test_ids)}")
    # how='left' keeps catboost_test's row order; validate rejects duplicate ids
    aligned = _test_ids.merge(
        frame[['participant_id', column]],
        on='participant_id',
        how='left',
        validate='one_to_one',
        indicator=True,
    )
    if (aligned['_merge'] != 'both').any():
        raise ValueError(f"'{column}' source is missing participants present in catboost_test")
    return aligned[column].to_numpy()


# Sex
X_meta_test_sex = pd.DataFrame({
    'catboost': _aligned_test_column(catboost_test, 'prob_sex'),
    'lgbm':     _aligned_test_column(lgbm_test, 'prob_sex'),
    'gnn':      _aligned_test_column(gnn_test_sex, 'probability'),
    'cnn':      _aligned_test_column(cnn_test_sex, 'probability')
}).values

# ADHD
X_meta_test_adhd = pd.DataFrame({
    'catboost': _aligned_test_column(catboost_test, 'prob_adhd'),
    'lgbm':     _aligned_test_column(lgbm_test, 'prob_adhd'),
    'gnn':      _aligned_test_column(gnn_test_adhd, 'probability'),
    'cnn':      _aligned_test_column(cnn_test_adhd, 'probability')
}).values

# Predict ensemble probabilities
y_meta_prob_test_sex  = ridge_sex.predict(X_meta_test_sex)
y_meta_prob_test_adhd = ridge_adhd.predict(X_meta_test_adhd)

# Convert to binary predictions, reusing the cut-off already applied to the
# training predictions so the two cannot drift apart.
y_meta_pred_test_sex  = (y_meta_prob_test_sex > threshold).astype(int)
y_meta_pred_test_adhd = (y_meta_prob_test_adhd > threshold).astype(int)


Create submission file

In [44]:
# Ids come from catboost_test, so the ensemble predictions must be in
# catboost_test's row order for each id to be paired with its own prediction.
participant_ids = catboost_test['participant_id'].to_numpy()

# Assemble the submission: id column first, then the two predicted targets
submission_df = pd.DataFrame({'participant_id': participant_ids}).assign(
    ADHD_Outcome=y_meta_pred_test_adhd,
    Sex_F=y_meta_pred_test_sex,
)

# Write the submission next to the other interim files, without the row index
submission_path = f"{data_input_folder}/Interim/submission_ensemble.csv"
submission_df.to_csv(submission_path, index=False)