In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('../data/processed/train_processed.csv')

In [3]:
X = df.drop('TARGET', axis=1)
y = df['TARGET']

In [4]:
# SPLIT (Stratified to keep the imbalance ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
ratio = float(np.sum(y == 0)) / np.sum(y == 1)
print(f"Imbalance Ratio: {ratio:.2f}")

Imbalance Ratio: 11.39


DEFINE THE PARAMETER GRID

This is the "Menu" the computer can choose from

In [6]:
param_grid = {
    # How deep can each tree go?
    # Deeper = Smarter but might overthink (overfit). Shallower = Simpler.
    'max_depth': [3, 4, 5, 6, 8, 10],

    # How fast should it learn?
    # Lower number = Slow and careful learning (usually better but slower).
    'learning_rate': [0.01, 0.05, 0.1, 0.2],

    # How many trees?
    # More trees = More "opinions", but takes longer.
    'n_estimators': [100, 200, 300, 500],

    # Subsample: What % of data does each tree see?
    # If 0.8, each tree only sees 80% of data. This prevents memorization.
    'subsample': [0.7, 0.8, 0.9, 1.0],

    # Colsample: What % of columns (features) does each tree see?
    # If 0.8, each tree only sees 80% of features. Good for robustness.
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],

    # Scale Pos Weight: We test our calculated ratio, and maybe a bit higher/lower
    'scale_pos_weight': [ratio, ratio * 1.2, ratio * 0.8] 
}

print("Parameter Grid Set.")

Parameter Grid Set.


Run the Search

In [None]:
# xgb_clf = xgb.XGBClassifier(random_state=42, n_jobs=-1)

# random_search = RandomizedSearchCV(
#     estimator=xgb_clf, 
#     param_distributions=param_grid, 
#     n_iter=20,           # Try 20 random combinations (Change to 50 if you have time)
#     scoring='roc_auc',   # Optimize for AUC, not Accuracy
#     n_jobs=-1,           # Use all CPU cores
#     cv=3,                # 3-Fold Cross Validation
#     verbose=3,           # Show us progress logs
#     random_state=42
# )

In [9]:
# # START TUNING
# print("Starting Hyperparameter Tuning... (This may take a while)")
# random_search.fit(X_train, y_train)

# print("\nDONE!")
# print(f"Best Score (AUC): {random_search.best_score_:.4f}")
# print("Best Parameters:", random_search.best_params_)

SETUP THE SEARCH (SAFE MODE)

In [12]:
# n_jobs=1 here prevents the "Fork Bomb". 
# It runs models one-by-one, so your RAM stays safe.
xgb_clf = xgb.XGBClassifier(random_state=42, n_jobs=-1) # Model uses all cores

random_search = RandomizedSearchCV(
    estimator=xgb_clf, 
    param_distributions=param_grid, 
    n_iter=50,           # Reduced from 20 to 15 to be safe/faster
    scoring='roc_auc', 
    n_jobs=1,            # <--- THE CRITICAL FIX (Process 1 model at a time)
    cv=3, 
    verbose=1,           # Less noise in the logs
    random_state=42
)

In [None]:
# # START TUNING
# print("Starting Hyperparameter Tuning (Safe Mode)...")
# random_search.fit(X_train, y_train)

# print("\nDONE!")
# print(f"Best Score (AUC): {random_search.best_score_:.4f}")
# print("Best Parameters:", random_search.best_params_)

Starting Hyperparameter Tuning (Safe Mode)...
Fitting 3 folds for each of 15 candidates, totalling 45 fits

DONE!
Best Score (AUC): 0.7617
Best Parameters: {'subsample': 1.0, 'scale_pos_weight': np.float64(11.387150050352467), 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


15 iter

Starting Hyperparameter Tuning (Safe Mode)...

Fitting 3 folds for each of 15 candidates, totalling 45 fits

DONE!

Best Score (AUC): 0.7617

Best Parameters: {'subsample': 1.0, 'scale_pos_weight': np.float64(11.387150050352467), 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

In [13]:
# START TUNING
print("Starting Hyperparameter Tuning (Safe Mode)...")
random_search.fit(X_train, y_train)

print("\nDONE!")
print(f"Best Score (AUC): {random_search.best_score_:.4f}")
print("Best Parameters:", random_search.best_params_)

Starting Hyperparameter Tuning (Safe Mode)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

DONE!
Best Score (AUC): 0.7617
Best Parameters: {'subsample': 1.0, 'scale_pos_weight': np.float64(11.387150050352467), 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


# LGBM

In [14]:
import re

# 1. CLEAN COLUMN NAMES
# We use Regex to replace any character that is NOT a letter or number with '_'
clean_columns = [re.sub(r'[^\w]', '_', col) for col in X_train.columns]

# Apply to both Train and Test
X_train.columns = clean_columns
X_test.columns = clean_columns

print("Column names cleaned!")

Column names cleaned!


In [15]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [16]:
# SETUP THE CLASSIFIER
# We use the sklearn API (LGBMClassifier) because it plays nice with RandomizedSearchCV
lgb_clf = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    is_unbalance=True,  # Handles the imbalance automatically
    n_jobs=-1,
    verbosity=-1
)

In [17]:
# DEFINE THE GRID
# These are the most important knobs for LightGBM
param_dist = {
    # How complex can the tree be? (31 is standard, 50 is complex, 20 is simple)
    'num_leaves': [20, 31, 50, 70], 
    
    # How fast to learn?
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    
    # How many trees?
    'n_estimators': [200, 500, 1000],
    
    # Minimum data in one leaf (prevents overfitting)
    'min_child_samples': [20, 50, 100],
    
    # Randomly select features/rows to make it robust
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

In [18]:
# SETUP SEARCH
random_search_lgbm = RandomizedSearchCV(
    estimator=lgb_clf,
    param_distributions=param_dist,
    n_iter=20,           # Test 20 combinations
    scoring='roc_auc',
    cv=3,                # 3-Fold Cross Validation
    verbose=1,
    random_state=42,
    n_jobs=1             # Safe mode to prevent crashing
)

In [20]:
# RUN TUNING
print("Tuning LightGBM Champion...")
# Make sure X_train still has the "Clean Names" from the previous step!
random_search_lgbm.fit(X_train, y_train)

print("\nDONE!")
print(f"Best LightGBM Score: {random_search_lgbm.best_score_:.4f}")
print("Best Parameters:", random_search_lgbm.best_params_)

Tuning LightGBM Champion...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

DONE!
Best LightGBM Score: 0.7631
Best Parameters: {'subsample': 0.9, 'num_leaves': 31, 'n_estimators': 1000, 'min_child_samples': 50, 'learning_rate': 0.01, 'colsample_bytree': 0.9}
