In [34]:
#Init
import os, gc, random
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier

import joblib
import lightgbm as lgb

RND = 42
random.seed(RND)
np.random.seed(RND)

In [27]:
#Data 
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

print(train.shape, test.shape)
display(train.head())
display(train.info())
display(train.describe())

(700000, 26) (300000, 25)


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   id                                  700000 non-null  int64  
 1   age                                 700000 non-null  int64  
 2   alcohol_consumption_per_week        700000 non-null  int64  
 3   physical_activity_minutes_per_week  700000 non-null  int64  
 4   diet_score                          700000 non-null  float64
 5   sleep_hours_per_day                 700000 non-null  float64
 6   screen_time_hours_per_day           700000 non-null  float64
 7   bmi                                 700000 non-null  float64
 8   waist_to_hip_ratio                  700000 non-null  float64
 9   systolic_bp                         700000 non-null  int64  
 10  diastolic_bp                        700000 non-null  int64  
 11  heart_rate                

None

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,349999.5,50.359734,2.072411,80.230803,5.963695,7.0022,6.012733,25.874684,0.858766,116.294193,75.440924,70.167749,186.818801,53.823214,102.905854,123.08185,0.149401,0.18199,0.030324,0.623296
std,202072.738554,11.65552,1.048189,51.195071,1.463336,0.901907,2.022707,2.860705,0.03798,11.01039,6.825775,6.938722,16.730832,8.266545,19.022416,24.739397,0.356484,0.385837,0.171478,0.48456
min,0.0,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.68,91.0,51.0,42.0,117.0,21.0,51.0,31.0,0.0,0.0,0.0,0.0
25%,174999.75,42.0,1.0,49.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,175.0,48.0,89.0,106.0,0.0,0.0,0.0,0.0
50%,349999.5,50.0,2.0,71.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0,1.0
75%,524999.25,58.0,3.0,96.0,7.0,7.6,7.4,27.8,0.88,124.0,80.0,75.0,199.0,59.0,116.0,139.0,0.0,0.0,0.0,1.0
max,699999.0,89.0,9.0,747.0,9.9,9.9,16.5,38.4,1.05,163.0,104.0,101.0,289.0,90.0,205.0,290.0,1.0,1.0,1.0,1.0


In [28]:
# Cell 3 - edit names if different
ID = "id"                    # change if column name differs
TARGET = "diagnosed_diabetes"          # set actual target column name from train
features = [c for c in train.columns if c not in [ID, TARGET]]
cat_features = [
    "hypertension_history","family_history_diabetes","employment_status",
    "smoking_status","income_level","education_level","ethnicity","gender"
]
# keep only those that actually exist in dataset
cat_features = [c for c in cat_features if c in features]

# numeric = everything else
num_features = [c for c in features if c not in cat_features]

y = train[TARGET]                                # labels
x = train.drop([ID, TARGET], axis=1)             # features
x_test = test.drop(ID, axis=1)                   # test features

In [29]:
# Cell 4
print("Class balance:\n", train[TARGET].value_counts(normalize=True))
print("\nMissing per column:")
display(train[features].isna().mean().sort_values(ascending=False).head(20))

Class balance:
 diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64

Missing per column:


age                             0.0
alcohol_consumption_per_week    0.0
hypertension_history            0.0
family_history_diabetes         0.0
employment_status               0.0
smoking_status                  0.0
income_level                    0.0
education_level                 0.0
ethnicity                       0.0
gender                          0.0
triglycerides                   0.0
ldl_cholesterol                 0.0
hdl_cholesterol                 0.0
cholesterol_total               0.0
heart_rate                      0.0
diastolic_bp                    0.0
systolic_bp                     0.0
waist_to_hip_ratio              0.0
bmi                             0.0
screen_time_hours_per_day       0.0
dtype: float64

In [30]:
train_cb = train.copy()
test_cb  = test.copy()

# LightGBM label-encoded copies
train_lgb = train.copy()
test_lgb  = test.copy()

lbe_dict = {}

# Label encode categoricals
for c in cat_features:
    le = LabelEncoder()
    allv = pd.concat([train_lgb[c].astype(str), test_lgb[c].astype(str)], axis=0)
    le.fit(allv)

    train_lgb[c] = le.transform(train_lgb[c].astype(str))
    test_lgb[c]  = le.transform(test_lgb[c].astype(str))

    lbe_dict[c] = le

# Numeric NaN fill
for c in features:
    if train[c].dtype.kind in "iuf":     # numeric types
        med = train[c].median()

        train_cb[c] = train_cb[c].fillna(med)
        test_cb[c]  = test_cb[c].fillna(med)

        train_lgb[c] = train_lgb[c].fillna(med)
        test_lgb[c]  = test_lgb[c].fillna(med)

In [31]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

X_lgb = train_lgb.drop([ID, TARGET], axis=1)
X_test_lgb = test_lgb.drop(ID, axis=1)

X_cb = train_cb.drop([ID, TARGET], axis=1)
X_test_cb = test_cb.drop(ID, axis=1)

oof_lgb = np.zeros(len(X_lgb))
preds_lgb = np.zeros(len(X_test_lgb))

In [20]:
oof_lgb = np.zeros(len(X_lgb))
preds_lgb = np.zeros(len(X_test_lgb))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_lgb, y)):
    X_tr, X_val = X_lgb.iloc[train_idx], X_lgb.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model_lgb = lgb.LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        verbose=-1   # silence training output (constructor param)
    )

    model_lgb.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=0)   # no per-iteration logging
        ]
    )

    oof_lgb[val_idx] = model_lgb.predict_proba(X_val)[:, 1]
    preds_lgb += model_lgb.predict_proba(X_test_lgb)[:, 1] / skf.n_splits

print("LightGBM OOF AUC:", roc_auc_score(y, oof_lgb))

Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1999]	valid_0's auc: 0.726934	valid_0's binary_logloss: 0.582906
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1997]	valid_0's auc: 0.724937	valid_0's binary_logloss: 0.584229
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1992]	valid_0's auc: 0.726101	valid_0's binary_logloss: 0.583532
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.726955	valid_0's binary_logloss: 0.582764
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.726877	valid_0's binary_logloss: 0.583066
LightGBM OOF AUC: 0.726356506119018


In [35]:
oof_cb = np.zeros(len(X_cb))
preds_cb = np.zeros(len(X_test_cb))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_cb, y)):
    print(f"CatBoost Fold {fold} start")
    X_tr, X_val = X_cb.iloc[train_idx], X_cb.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    cb = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.03,
        depth=6,
        loss_function="Logloss",
        eval_metric="AUC",
        random_seed=42,
        od_type="Iter",
        od_wait=200,
        verbose=200  # change to 0 or False to silence
    )

    cb.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=cat_features,   # names are OK with pandas DataFrame
    )

    # use best_iteration_ for stable preds
    best_it = cb.get_best_iteration()
    if best_it is None:
        # fallback if not set
        best_it = cb.get_param('iterations')  # should be 2000

    oof_cb[val_idx] = cb.predict_proba(X_val)[:, 1]
    preds_cb += cb.predict_proba(X_test_cb)[:, 1] / skf.n_splits

    fold_auc = roc_auc_score(y_val, oof_cb[val_idx])
    fold_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.5f}")

print("CatBoost CV fold AUCs:", fold_scores)
print("CatBoost OOF AUC:", roc_auc_score(y, oof_cb))

# ---------- Simple ensemble (grid search weights) ----------
# Try weights for CatBoost from 0.0 to 1.0 step 0.01
best_w = None
best_auc = -1
for w in np.linspace(0, 1, 101):
    oof_mix = w * oof_cb + (1 - w) * oof_lgb
    auc = roc_auc_score(y, oof_mix)
    if auc > best_auc:
        best_auc = auc
        best_w = w

print(f"Best ensemble weight for CatBoost: {best_w:.2f}  (CatBoost weight)")
print(f"Best ensemble OOF AUC: {best_auc:.6f}")

# create final predictions with that weight
w_cb = best_w
w_lgb = 1 - w_cb
preds_final = w_cb * preds_cb + w_lgb * preds_lgb

# ---------- Optional: save simple submission ----------
submission = sample.copy()
submission[TARGET] = preds_final
submission.to_csv("submission_ensemble.csv", index=False)
print("Saved submission_ensemble.csv — head:")
print(submission.head())

CatBoost Fold 0 start
0:	test: 0.6768924	best: 0.6768924 (0)	total: 194ms	remaining: 6m 28s
200:	test: 0.7067794	best: 0.7067794 (200)	total: 22.7s	remaining: 3m 23s
400:	test: 0.7126570	best: 0.7126579 (399)	total: 46.8s	remaining: 3m 6s
600:	test: 0.7185075	best: 0.7185075 (600)	total: 1m 12s	remaining: 2m 49s
800:	test: 0.7211639	best: 0.7211639 (800)	total: 1m 37s	remaining: 2m 25s
1000:	test: 0.7227688	best: 0.7227688 (1000)	total: 2m 1s	remaining: 2m 1s
1200:	test: 0.7239961	best: 0.7239961 (1200)	total: 2m 26s	remaining: 1m 37s
1400:	test: 0.7246034	best: 0.7246034 (1400)	total: 2m 51s	remaining: 1m 13s
1600:	test: 0.7253485	best: 0.7253485 (1600)	total: 3m 17s	remaining: 49.2s
1800:	test: 0.7257884	best: 0.7257889 (1799)	total: 3m 42s	remaining: 24.6s
1999:	test: 0.7262577	best: 0.7262615 (1998)	total: 4m 7s	remaining: 0us

bestTest = 0.7262615312
bestIteration = 1998

Shrink model to first 1999 iterations.
Fold 0 AUC: 0.72626
CatBoost Fold 1 start
0:	test: 0.6779152	best: 0.67

In [None]:
x