### Setup & Imports

In [1]:
# Data handling
import pandas as pd
import numpy as np
import yaml
import joblib
import time

# Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score
import lightgbm as lgb
import xgboost as xgb

# MLflow
import mlflow
import mlflow.sklearn

# Warnings
import warnings
warnings.filterwarnings("ignore")

### Load Config

In [2]:
# Load YAML config
with open("../../configs/training.yml", "r") as f:
    config = yaml.safe_load(f)

print(config)

{'models': [{'name': 'lightgbm', 'type': 'lightgbm', 'max_depth': 7, 'learning_rate': 0.05, 'n_estimators': 500}, {'name': 'xgboost', 'type': 'xgboost', 'max_depth': 7, 'learning_rate': 0.05, 'n_estimators': 500}], 'metrics': {'primary': 'roc_auc', 'secondary': 'f1_score'}, 'thresholds': {'risk_flag': 0.7}, 'hyperparameter_search': {'method': 'gridsearch', 'lightgbm': {'num_leaves': [31, 50, 70], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [100, 300, 500]}, 'xgboost': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [100, 300, 500]}}, 'logging': {'experiment': 'dropout_prediction', 'tool': 'mlflow'}}


### Load Preprocessed Data

In [3]:
X_train = pd.read_csv("../../data/processed/X_train.csv")
y_train = pd.read_csv("../../data/processed/y_train.csv").values.ravel()
X_val = pd.read_csv("../../data/processed/X_val.csv")
y_val = pd.read_csv("../../data/processed/y_val.csv").values.ravel()

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)

Train shape: (3539, 36) (3539,)
Validation shape: (442, 36) (442,)


### Define Models & Hyperparameters

In [4]:
# LightGBM
lgb_config = next(m for m in config['models'] if m['type'] == 'lightgbm')
lgb_model = lgb.LGBMClassifier(
    max_depth=lgb_config['max_depth'],
    learning_rate=lgb_config['learning_rate'],
    n_estimators=lgb_config['n_estimators']
)
lgb_params = config['hyperparameter_search']['lightgbm']

In [5]:
# XGBoost
xgb_config = next(m for m in config['models'] if m['type'] == 'xgboost')
xgb_model = xgb.XGBClassifier(
    max_depth=xgb_config['max_depth'],
    learning_rate=xgb_config['learning_rate'],
    n_estimators=xgb_config['n_estimators'],
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_params = config['hyperparameter_search']['xgboost']

### Initialize MLflow

In [6]:
mlflow.set_experiment(config['logging']['experiment'])

2025/10/06 11:48:42 INFO mlflow.tracking.fluent: Experiment with name 'dropout_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/Student_ropout_risk/src/models/mlruns/818261885524770619', creation_time=1759731522755, experiment_id='818261885524770619', last_update_time=1759731522755, lifecycle_stage='active', name='dropout_prediction', tags={}>

### Train and Log LightGBM

In [7]:
with mlflow.start_run(run_name="LightGBM"):
    start = time.time()
    
    # GridSearchCV for hyperparameter tuning
    lgb_grid = GridSearchCV(lgb_model, lgb_params, cv=3, scoring='roc_auc', n_jobs=-1) # type: ignore
    lgb_grid.fit(X_train, y_train)
    
    # Predict on validation
    y_val_pred = lgb_grid.predict(X_val)
    y_val_proba = lgb_grid.predict_proba(X_val)[:,1]
    
    # Metrics
    roc_auc = roc_auc_score(y_val, lgb_grid.predict_proba(X_val), multi_class='ovr')
    f1 = f1_score(y_val, y_val_pred, average='macro')
    end = time.time()
    
    # Log
    mlflow.log_params(lgb_grid.best_params_)
    mlflow.log_metric("roc_auc", roc_auc) # type: ignore
    mlflow.log_metric("f1_score", f1) # type: ignore
    mlflow.log_metric("train_time_sec", end-start)
    mlflow.sklearn.log_model(lgb_grid.best_estimator_, "model") # type: ignore
    
    print(f"LightGBM - ROC AUC: {roc_auc:.4f}, F1: {f1:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1049
[LightGBM] [Info] Number of data points in the train set: 3539, number of used features: 36
[LightGBM] [Info] Start training from score -1.135451
[LightGBM] [Info] Start training from score -1.717974
[LightGBM] [Info] Start training from score -0.694561




LightGBM - ROC AUC: 0.8839, F1: 0.6304


### Train and Log XGBoost

In [8]:
with mlflow.start_run(run_name="XGBoost"):
    start = time.time()
    
    # GridSearchCV for hyperparameter tuning
    xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=3, scoring='roc_auc', n_jobs=-1)
    xgb_grid.fit(X_train, y_train)
    
    # Predict on validation
    y_val_pred = xgb_grid.predict(X_val)
    y_val_proba = xgb_grid.predict_proba(X_val)[:,1]
    
    # Metrics
    roc_auc = roc_auc_score(y_val, xgb_grid.predict_proba(X_val), multi_class='ovr')
    f1 = f1_score(y_val, y_val_pred, average='macro')
    end = time.time()
    
    # Log
    mlflow.log_params(xgb_grid.best_params_)
    mlflow.log_metric("roc_auc", roc_auc) # type: ignore
    mlflow.log_metric("f1_score", f1) # type: ignore
    mlflow.log_metric("train_time_sec", end-start)
    mlflow.sklearn.log_model(xgb_grid.best_estimator_, "model") # type: ignore
    
    print(f"XGBoost - ROC AUC: {roc_auc:.4f}, F1: {f1:.4f}")




XGBoost - ROC AUC: 0.8636, F1: 0.6695


### Compare Models & Select Best|

In [9]:
# Example: Compare ROC-AUC on validation
results = {
    "LightGBM": roc_auc_score(y_val, lgb_grid.predict_proba(X_val), multi_class='ovr'),
    "XGBoost": roc_auc_score(y_val, xgb_grid.predict_proba(X_val), multi_class='ovr')
}

best_model_name = max(results, key=results.get) # type: ignore
best_model = lgb_grid.best_estimator_ if best_model_name=="LightGBM" else xgb_grid.best_estimator_

print(f"Best Model: {best_model_name} with ROC-AUC = {results[best_model_name]:.4f}")

# Save best model
joblib.dump(best_model, "../../models/best_model.pkl")

Best Model: LightGBM with ROC-AUC = 0.8839


['../../models/best_model.pkl']