# 06 - Model Training

**Objective**: Train and tune churn prediction models

**Models**:
- Logistic Regression (interpretable)
- XGBoost (performance)
- LightGBM (performance)

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

print('Libraries loaded!')

In [None]:
# Directory layout: the training table is read from MODEL_INPUT_PATH and
# every artifact produced by this notebook is written under MODEL_PATH.
MODEL_INPUT_PATH = Path('../data/05_model_input')
MODEL_PATH = Path('../data/06_models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)  # create the output folder if missing

# Name of the label column.
TARGET = 'Churn'

# Training table.
df = pd.read_csv(MODEL_INPUT_PATH / 'selected_train.csv')

# Feature names are stored as a JSON document.
# NOTE(review): feature_list.json is read from MODEL_PATH (the output folder),
# not MODEL_INPUT_PATH - presumably an earlier notebook writes it there; confirm.
feature_list_file = MODEL_PATH / 'feature_list.json'
with feature_list_file.open('r') as fh:
    FEATURES = json.load(fh)

print(f"Features: {len(FEATURES)}")
print(f"Samples: {len(df):,}")

## 1. Train/Validation/Test Split (70/15/15)

In [None]:
# Separate the model inputs from the label column.
X, y = df[FEATURES], df[TARGET]

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
# Stratifying on y keeps the label proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Report the size of each split relative to the full table.
total_rows = len(X)
print(" DATA SPLIT:")
print(f"Train: {len(X_train):,} ({len(X_train)/total_rows*100:.1f}%)")
print(f"Val:   {len(X_val):,} ({len(X_val)/total_rows*100:.1f}%)")
print(f"Test:  {len(X_test):,} ({len(X_test)/total_rows*100:.1f}%)")

# Mean of the label per split, shown as a percentage.
train_rate, val_rate, test_rate = (
    labels.mean() * 100 for labels in (y_train, y_val, y_test)
)
print("\n CHURN RATES:")
print(f"Train: {train_rate:.2f}%")
print(f"Val:   {val_rate:.2f}%")
print(f"Test:  {test_rate:.2f}%")

## 2. SMOTE on Training Set

In [None]:
# Oversample with SMOTE. Only the training split is resampled here;
# X_val / X_test are left as they came out of the split.
smote = SMOTE(
    sampling_strategy=0.6,
    k_neighbors=5,
    random_state=42,
)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

# Show how the training set changed.
rows_before, rows_after = len(X_train), len(X_train_smote)
print(f"Before SMOTE: {rows_before:,} samples")
print(f"After SMOTE:  {rows_after:,} samples")
print(f"New churn rate: {y_train_smote.mean()*100:.2f}%")

## 3. Feature Scaling

In [None]:
# Standardise the features. The scaler is fitted on the SMOTE-resampled
# training data only and then applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train_smote)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train_smote, X_val, X_test)
)

# Persist the fitted scaler alongside the models.
scaler_file = MODEL_PATH / 'scaler.pkl'
joblib.dump(scaler, scaler_file)
print(" Saved: scaler.pkl")

## 4. Model 1: Logistic Regression (Interpretable)

In [None]:
print(" Training Logistic Regression...")

# 5-fold stratified splitter; the XGBoost and LightGBM searches reuse `cv`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space: 4 x 2 x 2 x 2 = 32 candidate settings.
lr_params = dict(
    C=[0.01, 0.1, 1.0, 10.0],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    class_weight=['balanced', None],
)

lr = LogisticRegression(max_iter=1000, random_state=42)

# Exhaustive search scored by average precision, fitted on the scaled,
# SMOTE-resampled training data.
lr_search = GridSearchCV(
    estimator=lr,
    param_grid=lr_params,
    scoring='average_precision',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
lr_search.fit(X_train_scaled, y_train_smote)

# Keep the refitted winner for validation and saving.
lr_best = lr_search.best_estimator_
print(f"\n Best params: {lr_search.best_params_}")
print(f" Best CV AUC-PR: {lr_search.best_score_:.4f}")

In [None]:
# Score the tuned logistic regression on the scaled validation split.
# Column 1 of predict_proba is the probability of the second class.
lr_proba = lr_best.predict_proba(X_val_scaled)[:, 1]
lr_val_auc, lr_val_pr = (
    roc_auc_score(y_val, lr_proba),
    average_precision_score(y_val, lr_proba),
)

print(f"Validation ROC-AUC: {lr_val_auc:.4f}")
print(f"Validation PR-AUC:  {lr_val_pr:.4f}")

# Persist the fitted estimator.
lr_model_file = MODEL_PATH / 'logistic_regression.pkl'
joblib.dump(lr_best, lr_model_file)
print("\n Saved: logistic_regression.pkl")

## 5. Model 2: XGBoost (Performance)

In [None]:
print(" Training XGBoost...")

# Class-imbalance weight = negatives / positives of the data the model is
# actually fitted on. The search below trains on the SMOTE-resampled labels,
# so the ratio is taken from y_train_smote; taking it from the pre-SMOTE
# y_train would compensate for the imbalance a second time.
scale_weight = float((y_train_smote == 0).sum() / (y_train_smote == 1).sum())

# Search space: 3 x 3 x 2 x 2 = 36 candidate settings.
# scale_pos_weight compares "no re-weighting" (1) against the class ratio.
xgb_params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'scale_pos_weight': [1, scale_weight]
}

# NOTE(review): use_label_encoder is kept for older XGBoost releases; newer
# releases appear to ignore it - confirm against the pinned version.
xgb = XGBClassifier(random_state=42, eval_metric='aucpr', use_label_encoder=False)

# Same splitter (`cv`) and scoring as the logistic-regression search; tree
# models are fitted on the unscaled SMOTE-resampled features.
xgb_search = GridSearchCV(
    xgb, xgb_params, cv=cv, scoring='average_precision', n_jobs=-1, verbose=1
)
xgb_search.fit(X_train_smote, y_train_smote)

# Keep the refitted winner for validation and saving.
xgb_best = xgb_search.best_estimator_
print(f"\n Best params: {xgb_search.best_params_}")
print(f" Best CV AUC-PR: {xgb_search.best_score_:.4f}")

In [None]:
# Score the tuned XGBoost model on the validation split. The unscaled
# features are used, matching what the model was fitted on.
xgb_proba = xgb_best.predict_proba(X_val)[:, 1]
xgb_val_auc, xgb_val_pr = (
    roc_auc_score(y_val, xgb_proba),
    average_precision_score(y_val, xgb_proba),
)

print(f"Validation ROC-AUC: {xgb_val_auc:.4f}")
print(f"Validation PR-AUC:  {xgb_val_pr:.4f}")

# Persist the fitted estimator.
xgb_model_file = MODEL_PATH / 'xgboost.pkl'
joblib.dump(xgb_best, xgb_model_file)
print("\n Saved: xgboost.pkl")

## 6. Model 3: LightGBM (Performance)

In [None]:
print(" Training LightGBM...")

# Search space: 3 x 3 x 3 x 2 x 2 = 108 candidate settings.
lgb_params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[31, 50, 70],
    max_depth=[5, 10, -1],
    n_estimators=[100, 200],
    class_weight=['balanced', None],
)

lgb = LGBMClassifier(random_state=42, verbosity=-1)

# Reuses the `cv` splitter from the logistic-regression cell; fitted on the
# unscaled SMOTE-resampled training data.
lgb_search = GridSearchCV(
    estimator=lgb,
    param_grid=lgb_params,
    scoring='average_precision',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
lgb_search.fit(X_train_smote, y_train_smote)

# Keep the refitted winner for validation and saving.
lgb_best = lgb_search.best_estimator_
print(f"\n Best params: {lgb_search.best_params_}")
print(f" Best CV AUC-PR: {lgb_search.best_score_:.4f}")

In [None]:
# Score the tuned LightGBM model on the validation split. The unscaled
# features are used, matching what the model was fitted on.
lgb_proba = lgb_best.predict_proba(X_val)[:, 1]
lgb_val_auc, lgb_val_pr = (
    roc_auc_score(y_val, lgb_proba),
    average_precision_score(y_val, lgb_proba),
)

print(f"Validation ROC-AUC: {lgb_val_auc:.4f}")
print(f"Validation PR-AUC:  {lgb_val_pr:.4f}")

# Persist the fitted estimator.
lgb_model_file = MODEL_PATH / 'lightgbm.pkl'
joblib.dump(lgb_best, lgb_model_file)
print("\n Saved: lightgbm.pkl")

## 7. Model Comparison

In [None]:
# One row per model: cross-validated score from the grid search plus the
# two validation metrics computed in the cells above.
results = pd.DataFrame(
    [
        ('Logistic Regression', lr_search.best_score_, lr_val_auc, lr_val_pr),
        ('XGBoost', xgb_search.best_score_, xgb_val_auc, xgb_val_pr),
        ('LightGBM', lgb_search.best_score_, lgb_val_auc, lgb_val_pr),
    ],
    columns=['Model', 'CV_AUC_PR', 'Val_ROC_AUC', 'Val_PR_AUC'],
)

print(" MODEL COMPARISON:")
display(results)  # `display` is provided by the notebook environment

In [None]:
# Fitted estimators keyed by the names used in the results table.
models = {
    'Logistic Regression': lr_best,
    'XGBoost': xgb_best,
    'LightGBM': lgb_best
}

# The champion is the row with the highest validation PR-AUC.
champion_idx = results['Val_PR_AUC'].idxmax()
champion_name = results.at[champion_idx, 'Model']
champion_model = models[champion_name]

champion_pr_auc = results.at[champion_idx, 'Val_PR_AUC']
print(f"\n CHAMPION MODEL: {champion_name}")
print(f"   PR-AUC: {champion_pr_auc:.4f}")

## 8. Save Training Artifacts

In [None]:
# 1) Champion estimator.
joblib.dump(champion_model, MODEL_PATH / 'champion_model.pkl')

# 2) Held-out test split: feature columns followed by the label column.
test_data = pd.DataFrame(X_test, columns=FEATURES).assign(**{TARGET: y_test.values})
test_data.to_csv(MODEL_PATH / 'test_set.csv', index=False)

# 3) Model comparison table.
results.to_csv(MODEL_PATH / 'training_results.csv', index=False)

# 4) Run metadata: which model won, data sizes before/after SMOTE, the
#    number of features and a local timestamp.
metadata = {
    'champion_model': champion_name,
    'train_samples': len(X_train),
    'train_samples_smote': len(X_train_smote),
    'features': len(FEATURES),
    'trained_at': datetime.now().isoformat()
}
metadata_file = MODEL_PATH / 'training_metadata.json'
metadata_file.write_text(json.dumps(metadata, indent=2))

# Summary of what was written.
print(" Saved:")
for saved_line in (
    f"   - champion_model.pkl ({champion_name})",
    "   - test_set.csv",
    "   - training_results.csv",
    "   - training_metadata.json",
):
    print(saved_line)

In [None]:
# Closing banner: names the champion and points to the follow-up notebook.
rule = "=" * 60
print("\n" + rule)
print(" MODEL TRAINING COMPLETE")
print(rule)
print(f"\n Champion: {champion_name}")
print("\n NEXT: Proceed to 07_Evaluation.ipynb")
print(rule)