### The goal of this notebook is to find the best parameter for the XGBoost model.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import warnings  # To suppress noisy warnings while keeping the code beginner-friendly
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt  # Plotting
import seaborn as sns  # Nice statistical plots

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split  # Robust CV on imbalanced/shifted targets
from sklearn.metrics import roc_auc_score, log_loss  # Key metrics for probabilistic classifiers
from sklearn.preprocessing import StandardScaler  # Scaling numeric features
from sklearn.impute import SimpleImputer  # Safe imputation for missing values
from sklearn.compose import ColumnTransformer  # Apply different preprocessing to num/cat
from sklearn.pipeline import Pipeline  # Chain preprocessing + model safely
from sklearn.calibration import CalibratedClassifierCV  # Probability calibration for better log-loss
from sklearn.metrics import RocCurveDisplay  # Simple ROC plot helper

# Weâ€™ll use LightGBM for strong tabular performance and robustness.
# If not installed: pip install lightgbm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
import optuna

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


# Load the data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')

In [3]:
train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

In [4]:
target = 'diagnosed_diabetes'
cat_cols = test.select_dtypes(exclude='number').columns.to_list()
base = [col for col in train.columns if col not in [target]]
num_cols = [col for col in base if col not in cat_cols]

### We are going to find the best parameters for XGBoost using Optuna so we are going to take a sample size of 100,000 for faster calculation.

In [5]:
train = train.sample(100_000)
print(train.shape)

(100000, 25)


In [6]:
for col in cat_cols:
    train[col] = train[col].map(train[col].value_counts())
    test[col] = test[col].map(train[col].value_counts()) 

In [7]:
X = train[base]
y = train[target]

In [8]:
n_folds = 5
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
def objective(trial):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'alpha': trial.suggest_float('alpha', 1e-3, 10, log=True),
        'lambda': trial.suggest_float('lambda', 1e-3, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10, log=True),
        'device' : 'cuda',   
    }
    
    xgb = XGBClassifier(**params, random_state=42, n_jobs=-1, verbosity=0)
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    score = cross_val_score(xgb, X_train, y_train, cv=skf, scoring='roc_auc')
    return score.mean()

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)  
study = optuna.create_study(direction='maximize', pruner=pruner, study_name='XGB-ROC-Optimization')
study.optimize(objective, n_trials=15, show_progress_bar=True)

[I 2025-12-27 06:33:21,277] A new study created in memory with name: XGB-ROC-Optimization


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2025-12-27 06:33:44,754] Trial 0 finished with value: 0.7043766120774215 and parameters: {'n_estimators': 1702, 'learning_rate': 0.001347233913442654, 'max_depth': 6, 'alpha': 0.04391984897749723, 'lambda': 0.0023096748420112093, 'subsample': 0.6662362987324204, 'colsample_bytree': 0.6114834913709533, 'min_child_weight': 1}. Best is trial 0 with value: 0.7043766120774215.
[I 2025-12-27 06:34:40,434] Trial 1 finished with value: 0.7090830102150545 and parameters: {'n_estimators': 1904, 'learning_rate': 0.0017460324653779038, 'max_depth': 9, 'alpha': 0.013492722287463602, 'lambda': 0.644988274468218, 'subsample': 0.594099032209215, 'colsample_bytree': 0.9386244876024805, 'min_child_weight': 4}. Best is trial 1 with value: 0.7090830102150545.
[I 2025-12-27 06:35:18,316] Trial 2 finished with value: 0.7128008252833713 and parameters: {'n_estimators': 3138, 'learning_rate': 0.004889836148624854, 'max_depth': 6, 'alpha': 0.06251847192190545, 'lambda': 0.1333732074029522, 'subsample': 0.88

In [12]:
print(f'Best cross-validation ROC AUC: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')

Best cross-validation ROC AUC: 0.71507
Best parameters: {'n_estimators': 4123, 'learning_rate': 0.012583005624154356, 'max_depth': 3, 'alpha': 0.006122955361272888, 'lambda': 4.979756782201132, 'subsample': 0.8246846548997797, 'colsample_bytree': 0.5073072612487087, 'min_child_weight': 2}
