In [None]:
#Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier, early_stopping
from datetime import datetime


# Root folder that holds the competition data; the loaders below read from
# sub-folders of this path. Replace your input data path here.
INPUT_DATA_PATH = "H:/widsdatathon2025"

# -------------------------
# Load data
# -------------------------
# Columns kept from the categorical metadata sheet. 'participant_id' is the
# merge key; the remaining entries are the categorical features.
cat_fea = (
    ['participant_id']
    + ['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site']
    + ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race']
    + ['MRI_Track_Scan_Location']
    + [
        'Barratt_Barratt_P1_Edu',
        'Barratt_Barratt_P1_Occ',
        'Barratt_Barratt_P2_Edu',
        'Barratt_Barratt_P2_Occ',
    ]
)

def get_feats(mode='train'):
    """Load and merge the metadata and connectome tables for one data split.

    Reads the quantitative metadata, the categorical metadata and the
    functional connectome matrix from ``INPUT_DATA_PATH/<mode>/`` and
    left-joins them on ``participant_id``. For ``mode == 'TRAIN_NEW'`` the
    training solutions (labels) are merged in as well.

    Parameters
    ----------
    mode : str
        Name of the split folder. ``'TRAIN_NEW'`` selects the ``*_new`` training
        file names; any other value is interpolated into the metadata file
        names (``<mode>_QUANTITATIVE_METADATA.xlsx``, ``<mode>_CATEGORICAL.xlsx``).
        ``'TEST'`` additionally selects the test connectome file.

    Returns
    -------
    pandas.DataFrame
        One row per row of the quantitative metadata sheet, with the selected
        categorical columns, the connectome columns and (for ``'TRAIN_NEW'``)
        the solution columns appended.

    Side effects
    ------------
    Rebinds the module-level ``func_fea`` to the list of connectome column
    names (everything in the connectome file except ``participant_id``) of the
    split loaded last.
    """
    global func_fea

    split_dir = f"{INPUT_DATA_PATH}/{mode}"
    is_train_new = mode == 'TRAIN_NEW'

    if is_train_new:
        feats = pd.read_excel(f"{split_dir}/TRAIN_QUANTITATIVE_METADATA_new.xlsx")
        cate = pd.read_excel(f"{split_dir}/TRAIN_CATEGORICAL_METADATA_new.xlsx")
    else:
        feats = pd.read_excel(f"{split_dir}/{mode}_QUANTITATIVE_METADATA.xlsx")
        cate = pd.read_excel(f"{split_dir}/{mode}_CATEGORICAL.xlsx")

    # Always keep the merge key, even if 'participant_id' has already been
    # removed from the shared cat_fea list by later notebook code.
    keep_cols = ['participant_id'] + [c for c in cat_fea if c != 'participant_id']
    cate = cate.loc[:, keep_cols]
    feats = feats.merge(cate, on='participant_id', how='left')

    if mode == 'TEST':
        func = pd.read_csv(f"{split_dir}/{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv")
    else:
        func = pd.read_csv(f"{split_dir}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")

    func_fea = [c for c in func.columns if c != 'participant_id']
    feats = feats.merge(func, on='participant_id', how='left')

    if is_train_new:
        # Fixed: the original string lacked the f-prefix, so the literal text
        # "{INPUT_DATA_PATH}" was used as the path and the file was never found.
        solution = pd.read_excel(f"{split_dir}/TRAINING_SOLUTIONS.xlsx")
        feats = feats.merge(solution, on='participant_id', how='left')

    return feats

train = get_feats(mode='TRAIN_NEW')
test = get_feats(mode='TEST')

# -------------------------
# One-hot encode categoricals
# -------------------------
# 'participant_id' is only the merge key, so it must not be one-hot encoded.
# The guard keeps these statements re-runnable once the key has been removed.
if 'participant_id' in cat_fea:
    cat_fea.remove('participant_id')
train = pd.get_dummies(train, columns=cat_fea)
test = pd.get_dummies(test, columns=cat_fea)
# join='left' keeps train's column set: columns missing from test are added
# and filled with 0, and columns that exist only in test are dropped.
train, test = train.align(test, join='left', axis=1, fill_value=0)

# -------------------------
# Prepare features
# -------------------------
# Everything that is not a connectome column, the id, or a target is metadata.
exclude_cols = func_fea + ['participant_id', 'ADHD_Outcome', 'Sex_F']
# Membership is tested once per train column; a set avoids rescanning the
# (potentially very long) exclude list each time. exclude_cols stays a list.
_exclude_lookup = set(exclude_cols)
metadata_features = [col for col in train.columns if col not in _exclude_lookup]




## train_ensemble_with_cv

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from datetime import datetime

# Threshold tuner
def find_best_threshold(y_true, y_probs):
    """Pick the probability cut-off that maximises F1 on the given labels.

    Scans 81 evenly spaced cut-offs from 0.1 to 0.9 (inclusive), turns
    ``y_probs`` into 0/1 predictions at each one and scores them with
    ``f1_score``. The earliest cut-off with the highest F1 is returned; if no
    cut-off scores above 0, the default 0.5 is returned.
    """
    candidate_cutoffs = np.linspace(0.1, 0.9, 81)
    scores = [
        f1_score(y_true, (y_probs >= cutoff).astype(int))
        for cutoff in candidate_cutoffs
    ]
    # argmax returns the first maximal entry, matching a strict ">" scan.
    top = int(np.argmax(scores))
    if scores[top] > 0:
        return candidate_cutoffs[top]
    return 0.5

# PCA selector
def select_pca_components(X, explained_var_threshold=0.90):
    """Return how many principal components are needed to reach a variance target.

    Fits a full PCA on ``X`` and returns the smallest number of leading
    components whose cumulative explained-variance ratio is at least
    ``explained_var_threshold``.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``PCA.fit``.
    explained_var_threshold : float, default 0.90
        Target cumulative explained-variance ratio.

    Returns
    -------
    int
        Number of components. If the target is never reached (for example a
        threshold of 1.0 missed by floating-point rounding), all components
        are returned.
    """
    pca = PCA(n_components=None, random_state=42)
    pca.fit(X)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    reached = cumulative_variance >= explained_var_threshold
    if not reached.any():
        # np.argmax on an all-False mask returns 0, which previously made this
        # function silently return a single component.
        return int(cumulative_variance.size)
    return int(np.argmax(reached)) + 1

# Main ensemble CV trainer
def train_ensemble_with_cv(train, test, target_col, func_fea, metadata_features,
                           n_splits=5, random_state=42, explained_var_threshold=0.90):
    """Train a soft-voting ensemble with stratified CV and predict the test set.

    For every fold the connectome columns are reduced with a PCA fitted on the
    training part of the fold, the metadata columns are mean-imputed, both are
    concatenated, and a ``VotingClassifier`` (LightGBM, XGBoost, CatBoost,
    LogisticRegression) is fitted. A decision threshold is tuned on the fold's
    validation part with ``find_best_threshold``. Test probabilities are
    averaged over folds and cut at the mean of the per-fold thresholds.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``func_fea`` and ``metadata_features`` columns;
        ``train`` must also contain ``target_col``.
    target_col : str
        Name of the binary target column in ``train``.
    func_fea : list of str
        Connectome column names (PCA input).
    metadata_features : list of str
        Metadata column names (imputed and used as-is).
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed used for the fold split, the PCA and the tree models.
    explained_var_threshold : float, default 0.90
        Cumulative explained variance used to choose the PCA size per fold.

    Returns
    -------
    test_preds : numpy.ndarray
        0/1 predictions for ``test``.
    val_preds : numpy.ndarray
        Out-of-fold 0/1 predictions for ``train`` (float array).

    Notes
    -----
    Each fold's threshold is tuned on the same validation rows it is then
    scored on, so the printed fold/CV metrics are optimistic.
    The tree models are configured for GPU execution.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    val_preds = np.zeros(train.shape[0])
    test_preds_proba = np.zeros(test.shape[0])
    thresholds = []

    # Fold-independent slices, taken once.
    train_func = train[func_fea]
    train_meta = train[metadata_features]
    y_all = train[target_col].values
    X_test_func = test[func_fea]
    X_test_meta_raw = test[metadata_features].values

    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train[target_col])):
        print(f"\n PCA Fold {fold}")
        # skf.split yields positional indices, so rows are selected with .iloc;
        # label-based .loc is only correct for a default RangeIndex.
        X_train_func = train_func.iloc[train_idx]
        X_val_func = train_func.iloc[val_idx]

        n_components = select_pca_components(
            X_train_func, explained_var_threshold=explained_var_threshold
        )
        print("Number of components selected:", n_components)

        # PCA is fitted on the training part only and applied to val/test.
        pca = PCA(n_components=n_components, random_state=random_state)
        X_train_pca = pca.fit_transform(X_train_func)
        X_val_pca = pca.transform(X_val_func)
        X_test_pca = pca.transform(X_test_func)

        X_train_meta = train_meta.iloc[train_idx].values
        X_val_meta = train_meta.iloc[val_idx].values

        # Impute metadata NaNs (LogisticRegression does not accept missing values).
        imputer = SimpleImputer(strategy='mean')
        X_train_meta = imputer.fit_transform(X_train_meta)
        X_val_meta = imputer.transform(X_val_meta)
        X_test_meta = imputer.transform(X_test_meta_raw)

        # Combine metadata + PCA features
        X_train_final = np.hstack([X_train_meta, X_train_pca])
        X_val_final = np.hstack([X_val_meta, X_val_pca])
        X_test_final = np.hstack([X_test_meta, X_test_pca])

        y_train = y_all[train_idx]
        y_val = y_all[val_idx]

        # === Models ===
        # NOTE(review): n_jobs=-1 fits the GPU-backed members in parallel;
        # confirm this is intended when only one GPU is available.
        model = VotingClassifier(estimators=[
            ('lgbm', LGBMClassifier(device='gpu', n_estimators=800, learning_rate=0.03, class_weight='balanced', random_state=random_state)),
            ('xgb', XGBClassifier(device='cuda', predictor='gpu_predictor',tree_method='hist', use_label_encoder=False,
                                  eval_metric='logloss', learning_rate=0.03, n_estimators=800, scale_pos_weight=1.5, random_state=random_state)),
            ('cat', CatBoostClassifier(iterations=800, learning_rate=0.03, depth=6, task_type="GPU", devices="0",
                                       class_weights=[1, 2], verbose=0, random_seed=random_state)),
            ('logreg', LogisticRegression(class_weight='balanced', max_iter=1000, solver='liblinear'))],
          voting='soft', n_jobs=-1)

        model.fit(X_train_final, y_train)

        # Predict proba and tune threshold
        val_probs = model.predict_proba(X_val_final)[:, 1]
        best_thresh = find_best_threshold(y_val, val_probs)
        thresholds.append(best_thresh)
        val_pred = (val_probs >= best_thresh).astype(int)
        val_preds[val_idx] = val_pred

        # Accumulate test probabilities; averaged over folds after the loop.
        test_preds_proba += model.predict_proba(X_test_final)[:, 1]

        f1_val = f1_score(y_val, val_pred)
        print(f"Fold {fold} - Val F1: {f1_val:.4f}")

    # Final threshold (mean of the best per-fold thresholds)
    final_threshold = np.mean(thresholds)
    print(f"\n Final Threshold: {final_threshold:.4f}")

    test_preds_proba /= skf.n_splits
    test_preds = (test_preds_proba >= final_threshold).astype(int)

    # Final CV evaluation
    y_true = train[target_col]
    print("\n Final CV Results")
    print(f"Accuracy: {accuracy_score(y_true, val_preds):.4f}")
    print(f"F1 Score: {f1_score(y_true, val_preds):.4f}")
    print(classification_report(y_true, val_preds))

    return test_preds, val_preds


In [9]:
# Cross-validated ensemble for the ADHD target: test predictions + out-of-fold predictions.
adhd_preds, adhd_val = train_ensemble_with_cv(
    train=train,
    test=test,
    target_col='ADHD_Outcome',
    func_fea=func_fea,
    metadata_features=metadata_features,
)


✅ PCA Fold 0


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Fold 0 - Val F1: 0.9101

✅ PCA Fold 1
Fold 1 - Val F1: 0.8667

✅ PCA Fold 2
Fold 2 - Val F1: 0.8659

✅ PCA Fold 3
Fold 3 - Val F1: 0.8835

✅ PCA Fold 4
Fold 4 - Val F1: 0.8412

🎯 Final Threshold: 0.2700

📊 Final CV Results
Accuracy: 0.8120
F1 Score: 0.8735
              precision    recall  f1-score   support

           0       0.82      0.52      0.63       382
           1       0.81      0.95      0.87       831

    accuracy                           0.81      1213
   macro avg       0.81      0.73      0.75      1213
weighted avg       0.81      0.81      0.80      1213



In [10]:
# Cross-validated ensemble for the Sex_F target: test predictions + out-of-fold predictions.
sex_preds, sex_val = train_ensemble_with_cv(
    train=train,
    test=test,
    target_col='Sex_F',
    func_fea=func_fea,
    metadata_features=metadata_features,
)


✅ PCA Fold 0
Fold 0 - Val F1: 0.5968

✅ PCA Fold 1
Fold 1 - Val F1: 0.6102

✅ PCA Fold 2
Fold 2 - Val F1: 0.6848

✅ PCA Fold 3
Fold 3 - Val F1: 0.6073

✅ PCA Fold 4
Fold 4 - Val F1: 0.6667

🎯 Final Threshold: 0.2560

📊 Final CV Results
Accuracy: 0.6810
F1 Score: 0.6297
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       797
           1       0.52      0.79      0.63       416

    accuracy                           0.68      1213
   macro avg       0.69      0.71      0.67      1213
weighted avg       0.74      0.68      0.69      1213



In [11]:



# Write both sets of test predictions to a timestamped CSV in the working directory.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"submission_{run_stamp}.csv"

submission = pd.DataFrame(
    {
        "participant_id": test["participant_id"],
        "ADHD_Outcome": adhd_preds,
        "Sex_F": sex_preds,
    }
)
submission.to_csv(filename, index=False)
print(f"\n✅ Saved: {filename}")



✅ Saved: submission_20250503_224252.csv
