In [1]:
#stacking.csv gave score 48
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

In [3]:
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})
train_clean = train.dropna(subset=['age_group'])
X = train_clean.drop(['SEQN', 'age_group'], axis=1)
y = train_clean['age_group']
X_test = test.drop(['SEQN'], axis=1)

In [4]:
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

In [5]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
scale_pos_weight = class_weights[1] / class_weights[0]

In [6]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_imputed, y)

In [7]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_preds = np.zeros(X_test_imputed.shape[0])
cat_preds = np.zeros(X_test_imputed.shape[0])
lgbm_preds = np.zeros(X_test_imputed.shape[0])
cv_f1_scores = []
val_probs = []
val_targets = []

In [8]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X_balanced, y_balanced)):
    print(f"\n🔹 Fold {fold+1}")
    X_train, X_val = X_balanced[train_idx], X_balanced[val_idx]
    y_train, y_val = y_balanced[train_idx], y_balanced[val_idx]

    # XGBoost
    xgb = XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric='logloss',
        early_stopping_rounds=30,
        random_state=42
    )
    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    val_pred = xgb.predict(X_val)
    f1 = f1_score(y_val, val_pred)
    cv_f1_scores.append(f1)
    print(f"XGBoost Fold F1: {f1:.4f}")
    xgb_preds += xgb.predict_proba(X_test_imputed)[:, 1] / kf.n_splits

    # CatBoost
    cat = CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.03,
        loss_function='Logloss',
        auto_class_weights='Balanced',
        early_stopping_rounds=30,
        verbose=0,
        random_seed=42
    )
    cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
    val_pred_cat = cat.predict(X_val)
    f1_cat = f1_score(y_val, val_pred_cat)
    print(f"CatBoost Fold F1: {f1_cat:.4f}")
    cat_preds += cat.predict_proba(X_test_imputed)[:, 1] / kf.n_splits
    
    # LightGBM
    lgbm = LGBMClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        class_weight='balanced',
        random_state=42
    )
    lgbm.fit(X_train, y_train)
    val_pred_lgbm = lgbm.predict(X_val)
    f1_lgbm = f1_score(y_val, val_pred_lgbm)
    print(f"LightGBM Fold F1: {f1_lgbm:.4f}")
    lgbm_preds += lgbm.predict_proba(X_test_imputed)[:, 1] / kf.n_splits
    
    val_probs.extend(lgbm.predict_proba(X_val)[:, 1])
    val_targets.extend(y_val)


🔹 Fold 1


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGBoost Fold F1: 0.8313
CatBoost Fold F1: 0.8463
[LightGBM] [Info] Number of positive: 1310, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 2620, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Fold F1: 0.8372

🔹 Fold 2


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGBoost Fold F1: 0.8469
CatBoost Fold F1: 0.8652
[LightGBM] [Info] Number of positive: 1311, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1385
[LightGBM] [Info] Number of data points in the train set: 2621, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


LightGBM Fold F1: 0.8838

🔹 Fold 3
XGBoost Fold F1: 0.8357
CatBoost Fold F1: 0.8411
[LightGBM] [Info] Number of positive: 1311, number of negative: 1310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 2621, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


LightGBM Fold F1: 0.8567

🔹 Fold 4
XGBoost Fold F1: 0.8548
CatBoost Fold F1: 0.8593
[LightGBM] [Info] Number of positive: 1310, number of negative: 1311
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1387
[LightGBM] [Info] Number of data points in the train set: 2621, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Fold F1: 0.8802

🔹 Fold 5


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGBoost Fold F1: 0.8575
CatBoost Fold F1: 0.8657
[LightGBM] [Info] Number of positive: 1310, number of negative: 1311
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1386
[LightGBM] [Info] Number of data points in the train set: 2621, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Fold F1: 0.8814




In [9]:
val_probs = np.array(val_probs)
val_targets = np.array(val_targets)
best_f1 = 0
best_thresh = 0.5
for thresh in np.arange(0.3, 0.7, 0.01):
    preds = (val_probs > thresh).astype(int)
    score = f1_score(val_targets, preds)
    if score > best_f1:
        best_f1 = score
        best_thresh = thresh
print(f"\n Best threshold: {best_thresh:.2f} | Best F1: {best_f1:.4f}")


 Best threshold: 0.52 | Best F1: 0.8717


In [10]:
base_models = [
    ('xgb', XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.03,
        scale_pos_weight=scale_pos_weight,
        random_state=42
    )),
    ('cat', CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.03,
        auto_class_weights='Balanced',
        verbose=0,
        random_seed=42
    )),
    ('lgbm', LGBMClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.03,
        class_weight='balanced',
        random_state=42
    ))
]

stack = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

In [11]:
stack.fit(X_balanced, y_balanced)
stack_preds = stack.predict(X_test_imputed)
stack_submission = sample_submission.copy()
stack_submission['age_group'] = stack_preds
stack_submission.to_csv("stacking_submission.csv", index=False)

