In [1127]:
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Ensemble model

Set data input folder

In [1128]:
data_input_folder = '../data'

Import actual labels

In [1129]:
y_true      = pd.read_excel(f"{data_input_folder}/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")
y_true.sort_values(by='participant_id', inplace=True)
y_true_sex  = y_true['Sex_F'].values.flatten()
y_true_adhd = y_true['ADHD_Outcome'].values.flatten()

Import prediction probabilities on the training data (with k-fold cross-validation)

In [1130]:
catboost_pred_prob_train     = pd.read_csv(f"{data_input_folder}/Interim/CatBoost_thresh_0_3_0_7_(100)_feat_mod7b_train_pred_prob.csv")
lgbm_pred_prob_train         = pd.read_csv(f"{data_input_folder}/Interim/LGBM_thresh_0_30_0_70_(100)_feat_mod7a_train_pred_prob.csv")
logreg_pred_prob_train       = pd.read_csv(f"{data_input_folder}/Interim/LogReg_thresh_0_3_0_7_(100)_feat_mod7b_train_pred_prob.csv")
gnn_pred_prob_train_sex      = pd.read_csv(f"{data_input_folder}/Interim/gnn_train_pred_prob_sex.csv")
gnn_pred_prob_train_adhd     = pd.read_csv(f"{data_input_folder}/Interim/gnn_train_pred_prob_adhd.csv")
cnn_pred_prob_train_sex      = pd.read_csv(f"{data_input_folder}/Interim/cnn_train_pred_prob_sex.csv")
cnn_pred_prob_train_adhd     = pd.read_csv(f"{data_input_folder}/Interim/cnn_train_pred_prob_adhd - before opt.csv")

#print(cnn_pred_prob_train_sex.head())

Import test data

In [1131]:
# Read test predictions (matches structure of train predictions)
catboost_pred_prob_test      = pd.read_csv(f"{data_input_folder}/Interim/CatBoost_thresh_0_3_0_7_(100)_feat_mod7b_test_pred_prob.csv")
lgbm_pred_prob_test          = pd.read_csv(f"{data_input_folder}/Interim/LGBM_thresh_0_30_0_70_(100)_feat_mod7a_test_pred_prob.csv")
logreg_pred_prob_test        = pd.read_csv(f"{data_input_folder}/Interim/LogReg_thresh_0_3_0_7_(100)_feat_mod7b_test_pred_prob.csv")
gnn_pred_prob_test_sex       = pd.read_csv(f"{data_input_folder}/Interim/gnn_test_pred_prob_sex.csv")
cnn_pred_prob_test_sex       = pd.read_csv(f"{data_input_folder}/Interim/cnn_test_pred_prob_sex.csv")
gnn_pred_prob_test_adhd      = pd.read_csv(f"{data_input_folder}/Interim/gnn_test_pred_prob_adhd.csv")
cnn_pred_prob_test_adhd      = pd.read_csv(f"{data_input_folder}/Interim/cnn_test_pred_prob_adhd - before opt.csv")

# Set predictions to integers:
catboost_pred_prob_test['pred_sex']         = catboost_pred_prob_test['pred_sex'].astype(int)
catboost_pred_prob_test['pred_adhd']        = catboost_pred_prob_test['pred_adhd'].astype(int)
lgbm_pred_prob_test['pred_sex']             = lgbm_pred_prob_test['pred_sex'].astype(int)
lgbm_pred_prob_test['pred_adhd']            = lgbm_pred_prob_test['pred_adhd'].astype(int)
logreg_pred_prob_test['pred_sex']           = logreg_pred_prob_test['pred_sex'].astype(int) 
logreg_pred_prob_test['pred_adhd']          = logreg_pred_prob_test['pred_adhd'].astype(int) 
gnn_pred_prob_test_sex['predicted_label']   = gnn_pred_prob_test_sex['predicted_label'].astype(int)  
gnn_pred_prob_test_adhd['predicted_label']  = gnn_pred_prob_test_adhd['predicted_label'].astype(int)
cnn_pred_prob_test_sex['predicted_label']   = cnn_pred_prob_test_sex['predicted_label'].astype(int) 
cnn_pred_prob_test_adhd['predicted_label']  = cnn_pred_prob_test_adhd['predicted_label'].astype(int)

Include probabilities and predictions in dataframes

In [1132]:
# Sex
y_probs_train_sex             = pd.DataFrame(columns = ['catboost', 'lgbm', 'logreg', 'gnn', 'cnn'])
y_preds_train_sex             = pd.DataFrame(columns = ['catboost', 'lgbm', 'logreg', 'gnn', 'cnn'])

y_probs_train_sex['catboost'] = catboost_pred_prob_train['prob_sex']
y_probs_train_sex['lgbm']     = lgbm_pred_prob_train['prob_sex']
y_probs_train_sex['logreg']   = logreg_pred_prob_train['prob_sex']
y_probs_train_sex['gnn']      = gnn_pred_prob_train_sex['probability']
y_probs_train_sex['cnn']      = cnn_pred_prob_train_sex ['probability']

y_preds_train_sex['catboost'] = catboost_pred_prob_train['pred_sex'].astype(int)
y_preds_train_sex['lgbm']     = lgbm_pred_prob_train['pred_sex'].astype(int)
y_preds_train_sex['logreg']   = logreg_pred_prob_train['pred_sex'].astype(int)
y_preds_train_sex['gnn']      = gnn_pred_prob_train_sex['predicted_label'].astype(int)
y_preds_train_sex['cnn']      = cnn_pred_prob_train_sex ['predicted_label'].astype(int)


# ADHD
y_probs_train_adhd             = pd.DataFrame(columns = ['catboost', 'lgbm', 'logreg', 'gnn', 'cnn'])
y_preds_train_adhd             = pd.DataFrame(columns = ['catboost', 'lgbm', 'logreg', 'gnn', 'cnn'])                                       
                                        
y_probs_train_adhd['catboost'] = catboost_pred_prob_train['prob_adhd']
y_probs_train_adhd['lgbm']     = lgbm_pred_prob_train['prob_adhd']
y_probs_train_adhd['logreg']   = logreg_pred_prob_train['prob_adhd']
y_probs_train_adhd['gnn']      = gnn_pred_prob_train_adhd['probability']
y_probs_train_adhd['cnn']      = cnn_pred_prob_train_adhd['probability']

y_preds_train_adhd['catboost'] = catboost_pred_prob_train['pred_adhd'].astype(int)
y_preds_train_adhd['lgbm']     = lgbm_pred_prob_train['pred_adhd'].astype(int)
y_preds_train_adhd['logreg']   = logreg_pred_prob_train['pred_adhd'].astype(int)
y_preds_train_adhd['gnn']      = gnn_pred_prob_train_adhd['predicted_label'].astype(int)
y_preds_train_adhd['cnn']      = cnn_pred_prob_train_adhd ['predicted_label'].astype(int)


### Inspect training seet - actuals, predictions and probabilities

In [1133]:
print(y_true.head())

    participant_id  ADHD_Outcome  Sex_F
26    00aIpNTbG5uh             1      0
668   00fV0OyyoLfw             1      0
850   04X1eiS79T4B             0      1
972   05ocQutkURd6             0      1
703   06YUNBA9ZRLq             1      0


In [1134]:
# CatBoost, LGBM and Logistic regression
print('CatBoost - head: ', catboost_pred_prob_train.head())
print('LGBM - head: ', lgbm_pred_prob_train.head())
print('Logistic Regression - head: ',logreg_pred_prob_train.head())

CatBoost - head:     participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0              26         1  0.371678          1   0.945520
1             668         1  0.480290          1   0.947815
2             850         0  0.077468          1   0.941741
3             972         1  0.402840          0   0.358736
4             703         0  0.156785          1   0.960568
LGBM - head:     participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0              26         1  0.418269          1   0.908085
1             668         1  0.483076          1   0.951754
2             850         1  0.304343          1   0.931524
3             972         1  0.376036          0   0.286332
4             703         1  0.345151          1   0.924369
Logistic Regression - head:     participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0              26         0  0.233080          1   0.967997
1             668         0  0.277645          1   0.938851
2             850         1  0.459461  

In [1135]:
# CNN and GNN:
print('CNN - sex - head: ', cnn_pred_prob_train_sex.head())
print('CNN - adhd - head: ', cnn_pred_prob_train_adhd.head())
print('GNN - adhd - head: ', gnn_pred_prob_train_sex.head())
print('GNN - adhd - head: ', gnn_pred_prob_train_adhd.head())

CNN - sex - head:    participant_id  predicted_label  probability
0   00aIpNTbG5uh                0     0.005227
1   00fV0OyyoLfw                0     0.002063
2   04X1eiS79T4B                1     0.995286
3   05ocQutkURd6                1     0.999951
4   06YUNBA9ZRLq                0     0.004608
CNN - adhd - head:    participant_id  predicted_label  probability
0   00aIpNTbG5uh                1     0.999836
1   00fV0OyyoLfw                1     0.999314
2   04X1eiS79T4B                0     0.024080
3   05ocQutkURd6                0     0.036821
4   06YUNBA9ZRLq                1     0.998820
GNN - adhd - head:    participant_id  predicted_label  probability
0   00aIpNTbG5uh              0.0     0.290706
1   00fV0OyyoLfw              0.0     0.005263
2   04X1eiS79T4B              1.0     0.555447
3   05ocQutkURd6              1.0     0.671185
4   06YUNBA9ZRLq              0.0     0.062872
GNN - adhd - head:    participant_id  predicted_label  probability
0   00aIpNTbG5uh            

In [1136]:
print('Sums Sex: ', y_preds_train_sex.sum())
print('Ratios Sex:', y_preds_train_sex.sum()/len(y_preds_train_sex), 2)
print(y_preds_train_sex.head())

Sums Sex:  catboost    582
lgbm        756
logreg      586
gnn         366
cnn         429
dtype: int64
Ratios Sex: catboost    0.479802
lgbm        0.623248
logreg      0.483100
gnn         0.301731
cnn         0.353669
dtype: float64 2
   catboost  lgbm  logreg  gnn  cnn
0         1     1       0    0    0
1         1     1       0    0    0
2         0     1       1    1    1
3         1     1       1    1    1
4         0     1       1    0    0


In [1137]:
# Probabilities - Sex
print('Averages Sex:', y_probs_train_sex.sum()/len(y_probs_train_sex), 2)
print(y_probs_train_sex.head())

Averages Sex: catboost    0.322381
lgbm        0.339087
logreg      0.344791
gnn         0.334102
cnn         0.337501
dtype: float64 2
   catboost      lgbm    logreg       gnn       cnn
0  0.371678  0.418269  0.233080  0.290706  0.005227
1  0.480290  0.483076  0.277645  0.005263  0.002063
2  0.077468  0.304343  0.459461  0.555447  0.995286
3  0.402840  0.376036  0.404952  0.671185  0.999951
4  0.156785  0.345151  0.428437  0.062872  0.004608


In [1138]:
print('Sums ADHD: ', y_preds_train_adhd.sum())
print('Ratios ADHD:', y_preds_train_adhd.sum()/len(y_preds_train_adhd), 2)
print(y_preds_train_adhd.head())

Sums ADHD:  catboost    937
lgbm        937
logreg      908
gnn         946
cnn         915
dtype: int64
Ratios ADHD: catboost    0.772465
lgbm        0.772465
logreg      0.748557
gnn         0.779885
cnn         0.754328
dtype: float64 2
   catboost  lgbm  logreg  gnn  cnn
0         1     1       1    1    1
1         1     1       1    1    1
2         1     1       1    0    0
3         0     0       0    0    0
4         1     1       1    1    1


In [1139]:
# Probabilities - ADHD
print('Averages ADHD:', y_probs_train_adhd.sum()/len(y_probs_train_adhd), 2)
print(y_probs_train_adhd.head())

Averages ADHD: catboost    0.705087
lgbm        0.689378
logreg      0.685702
gnn         0.708630
cnn         0.744217
dtype: float64 2
   catboost      lgbm    logreg       gnn       cnn
0  0.945520  0.908085  0.967997  0.729686  0.999836
1  0.947815  0.951754  0.938851  0.931447  0.999314
2  0.941741  0.931524  0.967378  0.431156  0.024080
3  0.358736  0.286332  0.291292  0.300768  0.036821
4  0.960568  0.924369  0.937478  0.864426  0.998820


### Inspect test dataset - predictions and liabilities (by model)

In [1140]:
# CatBoost, LGBM and Logistic regression - heads 
print('CatBoost - head: ', catboost_pred_prob_test.head())
print('LGBM - head: ', lgbm_pred_prob_test.head())
print('Logistic Regression - head: ',logreg_pred_prob_test.head())

CatBoost - head:    participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0   Cfwaf5FX7jWK         0  0.217678          1   0.816938
1   vhGrzmvA3Hjq         1  0.832338          1   0.711085
2   ULliyEXjy4OV         1  0.766293          1   0.676594
3   LZfeAb1xMtql         0  0.291242          1   0.949200
4   EnFOUv0YK1RG         0  0.084076          1   0.939391
LGBM - head:    participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0   Cfwaf5FX7jWK         0  0.220663          1   0.738505
1   vhGrzmvA3Hjq         1  0.594839          1   0.771759
2   ULliyEXjy4OV         1  0.434987          1   0.686023
3   LZfeAb1xMtql         1  0.303908          1   0.902864
4   EnFOUv0YK1RG         0  0.274619          1   0.940999
Logistic Regression - head:    participant_id  pred_sex  prob_sex  pred_adhd  prob_adhd
0   Cfwaf5FX7jWK         0  0.221684          1   0.799824
1   vhGrzmvA3Hjq         1  0.752063          1   0.666145
2   ULliyEXjy4OV         1  0.423283          1   0.62

In [1141]:
# CatBoost, LGBM and Logistic Regression - stats:
print('CatBoost - stats: ', catboost_pred_prob_test.describe())
print('LGBM - stats: ', lgbm_pred_prob_test.describe())
print('Logistic Regression - stats: ', logreg_pred_prob_test.describe())

CatBoost - stats:           pred_sex    prob_sex   pred_adhd   prob_adhd
count  304.000000  304.000000  304.000000  304.000000
mean     0.536184    0.353699    0.805921    0.716176
std      0.499511    0.209396    0.396142    0.252279
min      0.000000    0.032280    0.000000    0.037443
25%      0.000000    0.180953    1.000000    0.633322
50%      1.000000    0.326111    1.000000    0.822441
75%      1.000000    0.471892    1.000000    0.899807
max      1.000000    0.927453    1.000000    0.968338
LGBM - stats:           pred_sex    prob_sex   pred_adhd   prob_adhd
count  304.000000  304.000000  304.000000  304.000000
mean     0.677632    0.356538    0.819079    0.688543
std      0.468154    0.111389    0.385587    0.238114
min      0.000000    0.135740    0.000000    0.094530
25%      0.000000    0.269061    1.000000    0.584264
50%      1.000000    0.352041    1.000000    0.755004
75%      1.000000    0.429743    1.000000    0.869885
max      1.000000    0.626213    1.000000    0.9

In [1142]:
# CNN and GNN - heads
print('CNN - sex - head: ', cnn_pred_prob_test_sex.head())
print('CNN - adhd - head: ', cnn_pred_prob_test_adhd.head())
print('GNN - adhd - head: ', gnn_pred_prob_test_sex.head())
print('GNN - adhd - head: ', gnn_pred_prob_train_adhd.head())

CNN - sex - head:    participant_id  predicted_label  probability
0   Cfwaf5FX7jWK                0     0.025782
1   vhGrzmvA3Hjq                0     0.016927
2   ULliyEXjy4OV                1     0.584016
3   LZfeAb1xMtql                1     0.501813
4   EnFOUv0YK1RG                0     0.004733
CNN - adhd - head:    participant_id  predicted_label  probability
0   Cfwaf5FX7jWK                1     0.994249
1   vhGrzmvA3Hjq                1     0.969364
2   ULliyEXjy4OV                1     0.919596
3   LZfeAb1xMtql                1     0.963592
4   EnFOUv0YK1RG                1     0.997853
GNN - adhd - head:    participant_id  predicted_label  probability
0   Cfwaf5FX7jWK                0     0.298635
1   vhGrzmvA3Hjq                0     0.114017
2   ULliyEXjy4OV                0     0.060964
3   LZfeAb1xMtql                0     0.388152
4   EnFOUv0YK1RG                0     0.148169
GNN - adhd - head:    participant_id  predicted_label  probability
0   00aIpNTbG5uh            

In [1143]:
# CNN and GNN - stats
print('CNN - sex - stats: ', cnn_pred_prob_test_sex.describe())
print('CNN - adhd - stats: ', cnn_pred_prob_test_adhd.describe())
print('GNN - sex - stats: ', gnn_pred_prob_test_sex.describe())
print('GNN - adhd - stats: ', gnn_pred_prob_train_adhd.describe())

CNN - sex - stats:         predicted_label  probability
count       304.000000   304.000000
mean          0.296053     0.207823
std           0.457267     0.330410
min           0.000000     0.000445
25%           0.000000     0.005346
50%           0.000000     0.019836
75%           1.000000     0.233734
max           1.000000     0.998261
CNN - adhd - stats:         predicted_label  probability
count       304.000000   304.000000
mean          0.868421     0.843506
std           0.338590     0.276454
min           0.000000     0.023616
25%           1.000000     0.860555
50%           1.000000     0.982372
75%           1.000000     0.996108
max           1.000000     0.999793
GNN - sex - stats:         predicted_label  probability
count       304.000000   304.000000
mean          0.220395     0.289679
std           0.415196     0.254193
min           0.000000     0.000037
25%           0.000000     0.074951
50%           0.000000     0.210772
75%           0.000000     0.456233
max

## Submissions - preparations

Prepare dataframe for test data for submissions

In [1144]:
# Sex
X_meta_test_sex = pd.DataFrame({
    'catboost': catboost_pred_prob_test['prob_sex'],
    'lgbm':     lgbm_pred_prob_test['prob_sex'],
    'logreg':   logreg_pred_prob_test['prob_sex'],
    'gnn':      gnn_pred_prob_test_sex['probability'],
    'cnn':      cnn_pred_prob_test_sex['probability']
}).values

# ADHD
X_meta_test_adhd = pd.DataFrame({
    'catboost': catboost_pred_prob_test['prob_adhd'],
    'lgbm':     lgbm_pred_prob_test['prob_adhd'],
    'logreg':   logreg_pred_prob_test['prob_adhd'],
    'gnn':      gnn_pred_prob_test_adhd['probability'],
    'cnn':      cnn_pred_prob_test_adhd['probability']
}).values

Convert DataFrame to NumPy array (optional, Ridge handles DataFrames too)

In [1145]:
# Sex
X_meta_train_sex = y_probs_train_sex.values 

# ADHD
X_meta_train_adhd = y_probs_train_adhd.values 

Define metric (weighted F1 score for ADHD prediction as per competition rules)

In [1146]:
def get_weighted_f1(y_true, y_pred, sex, female_weight = 2.0, male_weight = 1.0):

    sample_weight = np.where(sex == 1, female_weight, male_weight)
    return f1_score(y_true, y_pred, average='binary', sample_weight=sample_weight)

Select best threshold (for Submission 3)

In [1147]:
def find_best_threshold(y_true, y_probs, target, y_true_sex=None, 
                        threshold_range = np.linspace(0, 1, 101)):
    
    thresholds = threshold_range
    metrics = []

    for t in thresholds:
        y_pred = (y_probs > t).astype(int)
        metric = (
            f1_score(y_true, y_pred) if target == 'sex'
            else get_weighted_f1(y_true, y_pred, y_true_sex)
        )
        metrics.append(metric)

    best_idx = np.argmax(metrics)
    return thresholds[best_idx], metrics[best_idx]

Custom F1 score (for Submission 4)

In [1148]:
def find_best_threshold_with_constraint(y_true_train, y_probs_train, y_probs_test, 
                                        target, y_true_sex_train=None,
                                        threshold_range=np.linspace(0, 1, 101),
                                        female_bounds=(0.28, 0.38),
                                        adhd_bounds=(0.60, 0.85)):

    best_score = -np.inf
    best_threshold = None

    for t in threshold_range:
        y_pred_train = (y_probs_train > t).astype(int)
        y_pred_test = (y_probs_test > t).astype(int)

        test_pred_ratio = y_pred_test.mean()  # use TEST set predictions for constraint

        # Enforce constraint on TEST prediction ratios
        if target == 'sex':
            if not (female_bounds[0] <= test_pred_ratio <= female_bounds[1]):
                continue
            metric = f1_score(y_true_train, y_pred_train)
        elif target == 'adhd':
            if not (adhd_bounds[0] <= test_pred_ratio <= adhd_bounds[1]):
                continue
            metric = get_weighted_f1(y_true_train, y_pred_train, y_true_sex_train)
        else:
            raise ValueError("Invalid target. Must be 'sex' or 'adhd'.")

        if metric > best_score:
            best_score = metric
            best_threshold = t

    if best_threshold is None:
        raise ValueError("No threshold satisfies the prediction bounds constraint.")

    return best_threshold, best_score

### Submission 1 - Logistic regression (not optimised)

Convert DataFrame to NumPy array (optional, Ridge handles DataFrames too)

In [1149]:
# Define Logistic Regression Model
clf_sex  = LogisticRegression()
clf_adhd = LogisticRegression()

# Fit model
clf_sex.fit(X_meta_train_sex, y_true_sex)
clf_adhd.fit(X_meta_train_adhd, y_true_adhd)

# Calculate probabilities
y_meta_prob_train_sex = clf_sex.predict_proba(X_meta_train_sex)[:, 1]
y_meta_prob_train_adhd = clf_adhd.predict_proba(X_meta_train_adhd)[:, 1]

In [1150]:
# Set fixed threshold
fixed_threshold_sex  = 0.3
fixed_threshold_adhd = 0.5 

# Sex
y_meta_pred_train_sex = (y_meta_prob_train_sex > fixed_threshold_sex).astype(int)

# ADHD
y_meta_pred_train_adhd = (y_meta_prob_train_adhd > fixed_threshold_adhd).astype(int)

print('Nr of females predictions in the training dataset: ', y_meta_pred_train_sex.sum())
print('Nr of ADHD predictions in the training dataset: ', y_meta_pred_train_adhd.sum())
print('Proportion of females predictions in the training dataset: ', round(y_meta_pred_train_sex.sum()/ len(y_meta_pred_train_sex), 4))
print('Proportion of ADHD predictions in the training dataset: ', round(y_meta_pred_train_adhd.sum()/ len(y_meta_pred_train_adhd), 4))# Set fixed threshold

Nr of females predictions in the training dataset:  423
Nr of ADHD predictions in the training dataset:  863
Proportion of females predictions in the training dataset:  0.3487
Proportion of ADHD predictions in the training dataset:  0.7115


Evaluate F1 score on the training data

In [1151]:
# Sex 
f1_catboost_sex = round(f1_score(y_true_sex, y_preds_train_sex['catboost'].values),4)
f1_lgbm_sex = round(f1_score(y_true_sex, y_preds_train_sex['lgbm'].values),4)
f1_gnn_sex = round(f1_score(y_true_sex, y_preds_train_sex['gnn'].values),4)
f1_cnn_sex = round(f1_score(y_true_sex, y_preds_train_sex['cnn'].values),4)
print('F1 scores for Sex prediction of the original Catboost, LGBM, GNN and CNN models: ', f1_catboost_sex, ', ', f1_lgbm_sex, '', f1_gnn_sex, ' and ', f1_cnn_sex)
f1_ensemble_sex = round(f1_score(y_true_sex, y_meta_pred_train_sex),4)
print("F1 Score of ensemble Sex prediction model (on training data):", f1_ensemble_sex)

# ADHD
f1_catboost_adhd = round(get_weighted_f1(y_true_adhd, y_preds_train_adhd['catboost'].values, y_true_sex),4)
f1_lgbm_adhd = round(get_weighted_f1(y_true_adhd, y_preds_train_adhd['lgbm'].values, y_true_sex),4)
f1_gnn_adhd = round(get_weighted_f1(y_true_adhd, y_preds_train_adhd['gnn'].values,y_true_sex), 4)
f1_cnn_adhd = round(get_weighted_f1(y_true_adhd, y_preds_train_adhd['cnn'].values, y_true_sex),4)
print('F1 scores for ADHD prediction of the original Catboost, LGBM, GNN and CNN models: ', f1_catboost_adhd, ', ', f1_lgbm_adhd, ', ', f1_gnn_adhd, 'and ', f1_cnn_adhd)
f1_ensemble_adhd = round(get_weighted_f1(y_true_adhd, y_meta_pred_train_adhd, y_true_sex), 4)
print("F1 Score of ensemble ADHD prediction model (on training data):", f1_ensemble_adhd)
print("Overall F1 score of the ensemble model (on training data):", (f1_ensemble_sex + f1_ensemble_adhd)/2)

F1 scores for Sex prediction of the original Catboost, LGBM, GNN and CNN models:  0.491 ,  0.5307  0.8005  and  0.9207
F1 Score of ensemble Sex prediction model (on training data): 0.9297
F1 scores for ADHD prediction of the original Catboost, LGBM, GNN and CNN models:  0.857 ,  0.8583 ,  0.8601 and  0.9446
F1 Score of ensemble ADHD prediction model (on training data): 0.9628
Overall F1 score of the ensemble model (on training data): 0.94625


In [1152]:
# Predict ensemble probabilities
y_meta_prob_test_sex  = clf_sex.predict_proba(X_meta_test_sex)[:, 1]
y_meta_prob_test_adhd = clf_sex.predict_proba(X_meta_test_adhd)[:, 1]


# Convert to binary predictions
y_meta_pred_test_sex_logreg  = (y_meta_prob_test_sex > fixed_threshold_sex).astype(int)
y_meta_pred_test_adhd_logreg = (y_meta_prob_test_adhd > fixed_threshold_adhd).astype(int)

In [1153]:
# Assuming all prediction DataFrames were sorted by 'participant_id'
participant_ids = cnn_pred_prob_test_adhd['participant_id'].values

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'participant_id': participant_ids,
    'ADHD_Outcome': y_meta_pred_test_adhd_logreg,
    'Sex_F': y_meta_pred_test_sex_logreg
})

# Save to CSV
submission_df.to_csv(f"{data_input_folder}/Interim/submission_ensemble_logistic_regr_preopt.csv", index=False)

### Submission 2 - Ensemble with Logistic Regression (optimised test demographics)

Optimise threshold so that female and ADHD proportions in the test dataset match the training dataset

In [1154]:
# Select threshold to make sure proportion of females and males in the test dataset matches the 
# proportion in training dataset
best_threshold_sex = 0
best_threshold_adhd = 0
min_total_diff_sex = float('inf')
min_total_diff_adhd = float('inf')
threshold_range = np.arange(0, 1.01, 0.01)

target_nr_of_females = round(len(y_meta_prob_test_sex)* y_true_sex.sum()/len(y_true_sex))
target_nr_of_adhd = round(len(y_meta_prob_test_adhd)* y_true_adhd.sum()/len(y_true_adhd))
print('target nr fem ', target_nr_of_females)
print('target nr adhd ', target_nr_of_adhd)

for threshold in threshold_range:
    y_pred_sex = (y_meta_prob_test_sex > threshold).astype(int)
    
    diff_sex = abs(y_pred_sex.sum() - target_nr_of_females)
    
    if diff_sex < min_total_diff_sex:
        min_total_diff_sex      = diff_sex 
        best_threshold_sex      = threshold

for threshold in threshold_range:
    y_pred_adhd = (y_meta_prob_test_adhd > threshold).astype(int)
    
    diff_adhd = abs(y_pred_adhd.sum() - target_nr_of_adhd)
    
    if diff_adhd < min_total_diff_adhd:
        min_total_diff_adhd     = diff_adhd
        best_threshold_adhd     = threshold

# Optimal thresholds:
print(f"Optimal threshold sex: {best_threshold_sex:.2f}")
print(f"Optimal threshold adhd: {best_threshold_adhd:.2f}")
print(f"Minimum total difference sex: { min_total_diff_sex }")
print(f"Minimum total difference adhd: { min_total_diff_adhd }")

# Convert to binary predictions
y_meta_pred_test_sex_2  = (y_meta_prob_test_sex > best_threshold_sex).astype(int)
y_meta_pred_test_adhd_2 = (y_meta_prob_test_adhd > best_threshold_adhd).astype(int)

print('Number of females predictions in the test dataset: ', y_meta_pred_test_sex_2.sum())
print('Number of ADHD predictions in the test dataset: ', y_meta_pred_test_adhd_2.sum())
print('Proportion of females predictions in the test dataset: ', round(y_meta_pred_test_sex_2.sum()/ len(y_meta_pred_test_sex_2), 4))
print('Proportion of ADHD predictions in the test dataset: ', round(y_meta_pred_test_adhd_2.sum()/ len(y_meta_pred_test_adhd_2), 4))

# Update predictions on the training dataset:
y_meta_pred_train_sex_2  = (y_meta_prob_train_sex > best_threshold_sex).astype(int)
y_meta_pred_train_adhd_2 = (y_meta_prob_train_adhd > best_threshold_adhd).astype(int)

# Optimal F1 scores:
f1_ensemble_sex_2 = round(f1_score(y_true_sex, y_meta_pred_train_sex_2),4)
print("F1 Score of ensemble Sex prediction model (on training data):", f1_ensemble_sex_2)
f1_ensemble_adhd_2 = round(get_weighted_f1(y_true_adhd, y_meta_pred_train_adhd_2, y_true_sex), 4)
print("F1 Score of ensemble ADHD prediction model (on training data):", f1_ensemble_adhd_2)
print("Overall F1 score of the ensemble model (on training data):", (f1_ensemble_sex_2 + f1_ensemble_adhd_2)/2)



target nr fem  104
target nr adhd  208
Optimal threshold sex: 0.13
Optimal threshold adhd: 0.97
Minimum total difference sex: 0
Minimum total difference adhd: 12
Number of females predictions in the test dataset:  104
Number of ADHD predictions in the test dataset:  220
Proportion of females predictions in the test dataset:  0.3421
Proportion of ADHD predictions in the test dataset:  0.7237
F1 Score of ensemble Sex prediction model (on training data): 0.9074
F1 Score of ensemble ADHD prediction model (on training data): 0.6963
Overall F1 score of the ensemble model (on training data): 0.80185


Recalculate the F1 score with the optimal threshold:

Create submission file

In [1155]:
# Create the submission DataFrame
submission_df = pd.DataFrame({
    'participant_id': participant_ids,
    'ADHD_Outcome': y_meta_pred_test_adhd_2,
    'Sex_F': y_meta_pred_test_sex_2
})

# Save to CSV
submission_df.to_csv(f"{data_input_folder}/Interim/submission_ensemble_logistic_regr_opt_prop.csv", index=False)

### Submission 3 - Logistic regression (optimised for threshold)

Optimise threshold to reach maximum F1 score

In [1156]:
# Select threshold to make sure proportion of females and males in the test dataset matches the 
# proportion in training dataset
best_threshold_sex = 0
best_threshold_adhd = 0
min_f1_sex = float('inf')
min_f1_adhd = float('inf')
threshold_range = np.arange(0, 1.01, 0.01)


best_threshold_sex_3, best_f1_sex_3 = find_best_threshold(y_true_sex, y_meta_prob_train_sex , 'sex', 
                                                      y_true_sex=None, threshold_range = threshold_range)
best_threshold_adhd_3, best_f1_adhd_3 =find_best_threshold(y_true_adhd, y_meta_prob_train_adhd, 'adhd', 
                                                      y_true_sex=y_true_sex, threshold_range = threshold_range)

print(f"Optimal threshold sex: {best_threshold_sex_3:.2f}")
print(f"Optimal threshold adhd: {best_threshold_adhd_3:.2f}")

print(f"Optimal (weighted) F1 sex: {best_f1_sex_3:.4}")
print(f"Optimal (weighted) F1 ADHD: {best_f1_adhd_3:.4f}")
best_f1_overall_3 = (best_f1_sex_3 + best_f1_adhd_3)/2
print(f"Optimal overall F1: {best_f1_overall_3:.4f}")


# Convert to binary predictions
y_meta_pred_test_sex_3  = (y_meta_prob_test_sex > best_threshold_sex_3).astype(int)
y_meta_pred_test_adhd_3 = (y_meta_prob_test_adhd > best_threshold_adhd_3).astype(int)

print('Number of females predictions in the test dataset: ', y_meta_pred_test_sex_3.sum())
print('Number of ADHD predictions in the test dataset: ', y_meta_pred_test_adhd_3.sum())
print('Proportion of females predictions in the test dataset: ', round(y_meta_pred_test_sex_3.sum()/ len(y_meta_pred_test_sex_3), 4))
print('Proportion of ADHD predictions in the test dataset: ', round(y_meta_pred_test_adhd_3.sum()/ len(y_meta_pred_test_adhd_3), 4))

# Update predictions on the training dataset:
y_meta_pred_train_sex_3  = (y_meta_prob_train_sex > best_threshold_sex_3).astype(int)
y_meta_pred_train_adhd_3 = (y_meta_prob_train_adhd > best_threshold_adhd_3).astype(int)

Optimal threshold sex: 0.50
Optimal threshold adhd: 0.53
Optimal (weighted) F1 sex: 0.9356
Optimal (weighted) F1 ADHD: 0.9644
Optimal overall F1: 0.9500
Number of females predictions in the test dataset:  59
Number of ADHD predictions in the test dataset:  283
Proportion of females predictions in the test dataset:  0.1941
Proportion of ADHD predictions in the test dataset:  0.9309


Create submission file

In [1157]:
# Create the submission DataFrame
submission_df = pd.DataFrame({
    'participant_id': participant_ids,
    'ADHD_Outcome': y_meta_pred_test_adhd_3,
    'Sex_F': y_meta_pred_test_sex_3
})

# Save to CSV
submission_df.to_csv(f"{data_input_folder}/Interim/submission_ensemble_logistic_regr_opt_thres.csv", index=False)

### Submission 4 - Logistic regression (optimal F1 score with demographics constraints)

Optimise threshold to reach maximum F1 score with penalty

In [1158]:
# Select threshold to make sure proportion of females and males in the test dataset matches the 
# proportion in training dataset
best_threshold_sex = 0
best_threshold_adhd = 0
min_f1_sex = float('inf')
min_f1_adhd = float('inf')
threshold_range = np.arange(0, 1.01, 0.0001)
female_bonds = (0.78, 1.0)        # Proportion of females in the test set
adhd_bonds = (0.96, 1.0)           # Proportion of ADHD diagnosis in the test
 
best_threshold_sex_4, best_f1_sex_4   = find_best_threshold_with_constraint(y_true_sex, y_meta_prob_train_sex, y_meta_prob_test_sex, 'sex',                                           
                                                                            y_true_sex_train = None, threshold_range = threshold_range,
                                                                            female_bounds = female_bonds)
best_threshold_adhd_4, best_f1_adhd_4 = find_best_threshold_with_constraint(y_true_adhd, y_meta_prob_train_adhd, y_meta_prob_test_adhd, 'adhd', 
                                                                            y_true_sex_train = y_true_sex, threshold_range = threshold_range,
                                                                            adhd_bounds = adhd_bonds)

print(f"Optimal threshold sex: {best_threshold_sex_4:.2f}")
print(f"Optimal threshold adhd: {best_threshold_adhd_4:.2f}")

print(f"Optimal (weighted) F1 sex: {best_f1_sex_4:.4}")
print(f"Optimal (weighted) F1 ADHD: {best_f1_adhd_4:.4f}")
best_f1_overall_4 = (best_f1_sex_4 + best_f1_adhd_4)/2
print(f"Optimal overall F1: {best_f1_overall_4:.4f}")


# Convert to binary predictions
y_meta_pred_test_sex_4  = (y_meta_prob_test_sex > best_threshold_sex_4).astype(int)
y_meta_pred_test_adhd_4 = (y_meta_prob_test_adhd > best_threshold_adhd_4).astype(int)


print('Number of females predictions in the test dataset: ', y_meta_pred_test_sex_4.sum())
print('Number of ADHD predictions in the test dataset: ', y_meta_pred_test_adhd_4.sum())
print('Proportion of females predictions in the test dataset: ', round(y_meta_pred_test_sex_4.sum()/ len(y_meta_pred_test_sex_4), 4))
print('Proportion of ADHD predictions in the test dataset: ', round(y_meta_pred_test_adhd_4.sum()/ len(y_meta_pred_test_adhd_4), 4))


Optimal threshold sex: 0.02
Optimal threshold adhd: 0.36
Optimal (weighted) F1 sex: 0.6331
Optimal (weighted) F1 ADHD: 0.9575
Optimal overall F1: 0.7953
Number of females predictions in the test dataset:  238
Number of ADHD predictions in the test dataset:  293
Proportion of females predictions in the test dataset:  0.7829
Proportion of ADHD predictions in the test dataset:  0.9638


Create submission file

In [1159]:
# Create the submission DataFrame
submission_df = pd.DataFrame({
    'participant_id': participant_ids,
    'ADHD_Outcome': y_meta_pred_test_adhd_4,
    'Sex_F': y_meta_pred_test_sex_4
})

# Save to CSV
submission_df.to_csv(f"{data_input_folder}/Interim/submission_ensemble_logreg_f_{female_bonds}_a_{adhd_bonds}.csv", index=False)