In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [3]:
# --- 1. Load Data ---
# Define the file paths based on your Kaggle notebook structure
TRAIN_FILE = "/kaggle/input/playground-series-s5e12/train.csv"
TEST_FILE = "/kaggle/input/playground-series-s5e12/test.csv"
SUB_FILE = "/kaggle/input/playground-series-s5e12/sample_submission.csv"

In [4]:
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
submission_df = pd.read_csv(SUB_FILE)

In [18]:
train_df.columns


Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [19]:
test_df.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history'],
      dtype='object')

In [6]:
# Separate features (X) and target (y)
TARGET = 'diagnosed_diabetes'
X = train_df.drop(['id', TARGET], axis=1)
y = train_df[TARGET]
X_test = test_df.drop('id', axis=1)

# Identify categorical columns (usually 'object' dtype in pandas)
cat_cols = X.select_dtypes(include=['object']).columns


# Simple Label Encoding for LightGBM to handle categories natively
# We must fit the encoder on the combined train/test data to avoid unseen labels in test
for col in cat_cols:
    le = LabelEncoder()
    # Combine columns to ensure all unique categories are encoded consistently
    combined = pd.concat([X[col], X_test[col]]).astype(str).fillna('missing')
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str).fillna('missing'))
    X_test[col] = le.transform(X_test[col].astype(str).fillna('missing'))
    # Set the dtype back to 'category' for LightGBM to utilize its native handling
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [12]:
# --- 3. Model Configuration (LightGBM with GPU) ---

# LightGBM Parameters - Optimized for Binary Classification and GPU usage
# These parameters are a good starting point for high AUC.
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 6000,           # High number, but Early Stopping controls actual training
    'learning_rate': 0.01,          # Slower learning rate usually gives better generalization
    'num_leaves': 31,               # Controls tree complexity
    'max_depth': -1,                # No limit on tree depth
    'colsample_bytree': 0.7,        # Feature subsampling
    'subsample': 0.7,               # Data subsampling (bagging)
    'random_state': 42,
    
    # --- GPU Configuration ---
    # This enables GPU acceleration. Ensure your Kaggle notebook accelerator is ON (P100 or T4).
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    # -------------------------
}

# --- 4. Stratified K-Fold Cross-Validation ---
FOLDS = 5 # A good balance between stability and training time
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Arrays to store out-of-fold predictions and final test predictions
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
cv_scores = []

print("--- Starting LightGBM Cross-Validation Training ---")

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n⚡️ Fold {fold+1}/{FOLDS}")
    
    # Split data for this fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Initialize the model
    model = lgb.LGBMClassifier(**lgb_params)

    # Train the model with early stopping
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=100)]
    )

    # Predict on the validation set for OOF prediction
    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_index] = val_preds
    
    # Calculate and store the AUC for this fold
    fold_auc = roc_auc_score(y_val, val_preds)
    cv_scores.append(fold_auc)
    print(f"Fold {fold+1} AUC: {fold_auc:.5f}")

    # Predict on the test set and accumulate (Averaging predictions over all folds)
    test_preds += model.predict_proba(X_test)[:, 1] / FOLDS

--- Starting LightGBM Cross-Validation Training ---

⚡️ Fold 1/5
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 24
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 19 dense feature groups (10.68 MB) transferred to GPU in 0.012027 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5738]	valid_0's auc: 0.728041
Fold 1 AUC: 0.7

In [13]:
# Calculate the final CV score
mean_auc = np.mean(cv_scores)
print("\n" + "="*40)
print(f"OVERALL OOF AUC: {mean_auc:.5f} (Target Metric)")
print("="*40)


OVERALL OOF AUC: 0.72749 (Target Metric)


In [14]:
# Create the submission file
submission_df[TARGET] = test_preds

# Save the submission file
submission_file_path = "submission.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"\n✅ Submission file '{submission_file_path}' created successfully!")
print(f"   Shape: {submission_df.shape}")


✅ Submission file 'submission.csv' created successfully!
   Shape: (300000, 2)


In [20]:
# ... (Previous code for loading data and combining into df_all)
from sklearn.preprocessing import LabelEncoder
# ...

# --- CRITICAL FEATURE ENGINEERING: Zero-Imputation ---
# The columns in YOUR dataset where 0 is suspicious or requires careful handling
zero_cols = ['bmi', 'systolic_bp', 'diastolic_bp', 
             'alcohol_consumption_per_week', 'physical_activity_minutes_per_week']

for col in zero_cols:
    # 1. Create a Missing Indicator feature for original zero values
    # This helps the model distinguish between an imputed value and a genuine measurement
    df_all[f'{col}_is_zero'] = (df_all[col] == 0).astype(int)
    
    # 2. Replace the suspicious 0s with NaN for proper median imputation
    # Note: We replace 0s with NaN only for 'bmi', 'systolic_bp', 'diastolic_bp'
    if col in ['bmi', 'systolic_bp', 'diastolic_bp']:
        df_all[col] = df_all[col].replace(0, np.nan)

# 3. Impute NaN values with the median (best for robustness)
for col in ['bmi', 'systolic_bp', 'diastolic_bp']:
    median_val = df_all[col].median()
    df_all[col] = df_all[col].fillna(median_val)
    print(f"Imputed {col} NaNs with median: {median_val:.2f}")


# --- SECONDARY FEATURE ENGINEERING: Interactions & Ratios ---
# 4. Create Interaction Features with YOUR column names
df_all['BMI_WHR_Ratio'] = df_all['bmi'] / df_all['waist_to_hip_ratio']
df_all['Mean_BP'] = (df_all['systolic_bp'] + df_all['diastolic_bp']) / 2
df_all['Cholesterol_Ratio'] = df_all['ldl_cholesterol'] / df_all['hdl_cholesterol']


# --- 2. Simple Preprocessing: Handle Categorical Features (Adjusted) ---
# Identify categorical columns (object dtype)
cat_cols = df_all.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    # Fill any remaining NaN in categorical features before encoding
    df_all[col] = df_all[col].astype(str).fillna('missing')
    le.fit(df_all[col])
    df_all[col] = le.transform(df_all[col])
    df_all[col] = df_all[col].astype('category')

# Split back into training and test sets
X = df_all.iloc[:len(train_df)].drop('id', axis=1)
X_test = df_all.iloc[len(train_df):].drop('id', axis=1)

Imputed bmi NaNs with median: 25.90
Imputed systolic_bp NaNs with median: 116.00
Imputed diastolic_bp NaNs with median: 75.00


In [21]:
# --- 4. Optimized Model Configuration (LightGBM with GPU) ---
lgb_params_optimized = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 10000,          # Increased max capacity
    'learning_rate': 0.008,         # Slower, more accurate learning
    'num_leaves': 95,               # Increased complexity
    'max_depth': -1,
    'min_child_samples': 10,        # Allows for finer splits
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'scale_pos_weight': 0.604,      # Handles class imbalance (38% Neg / 62% Pos)
    'random_state': 42,
    
    # --- GPU Configuration ---
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    # -------------------------
}

# --- 5. Stratified K-Fold Cross-Validation & Training ---
FOLDS = 5 
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
cv_scores = []

print("\n--- Starting Optimized LightGBM Training (Expected AUC >= 0.75) ---")

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n⚡️ Fold {fold+1}/{FOLDS}")
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = lgb.LGBMClassifier(**lgb_params_optimized)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=100)] # Increased stopping rounds for stability
    )

    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_index] = val_preds
    
    fold_auc = roc_auc_score(y_val, val_preds)
    cv_scores.append(fold_auc)
    print(f"Fold {fold+1} AUC: {fold_auc:.5f}")

    test_preds += model.predict_proba(X_test)[:, 1] / FOLDS


--- Starting Optimized LightGBM Training (Expected AUC >= 0.75) ---

⚡️ Fold 1/5
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2259
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 27
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (12.82 MB) transferred to GPU in 0.015845 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[5145]	valid_0's auc: 0.72787

In [23]:
# --- 6. Evaluate and Submit ---
mean_auc = np.mean(cv_scores)
print("\n" + "="*40)
print(f"FINAL OVERALL OOF AUC: {mean_auc:.5f}")
print("="*40)

# Create the submission file
submission_df = pd.DataFrame({'id': test_df['id'], TARGET: test_preds})
submission_file_path = "submission_optimized_final.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"\n✅ Submission file '{submission_file_path}' created successfully!")


FINAL OVERALL OOF AUC: 0.72727

✅ Submission file 'submission_optimized_final.csv' created successfully!
