# Stablecoin Depeg Prediction - Model Training & Selection

This notebook covers the complete ML pipeline:
1. Data preparation and feature engineering
2. Train/test split (time series aware)
3. Baseline models
4. Hyperparameter tuning
5. Model evaluation and comparison
6. Feature importance analysis
7. Final model selection and export

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, average_precision_score, roc_curve,
    f1_score, precision_score, recall_score
)

# Project imports
# PROJECT_ROOT is the parent of the current working directory; it is put at the
# front of sys.path so the `config` and `src` packages below can be imported.
# This assumes the kernel is started one directory below the repository root.
import sys
PROJECT_ROOT = Path('.').resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

from config.settings import PROCESSED_DATA_DIR
# NOTE(review): create_features / create_target are imported but not referenced
# in the cells visible here — confirm whether they are still needed.
from src.features.engineering import create_features, create_target

# Style
plt.style.use('seaborn-v0_8-whitegrid')
# Plot colours keyed by coin name.
colors = {'usdt': '#26A17B', 'usdc': '#2775CA'}

# Random seed
# Seeds NumPy's global RNG; SEED is also passed as `random_state` to the
# estimators and searches created further down.
SEED = 42
np.random.seed(SEED)

---
## 1. Configuration

In [None]:
# Experiment settings read by the data-preparation, CV and loading cells.
CONFIG = dict(
    threshold=0.005,       # absolute deviation from 1.0 that counts as a depeg (0.5%)
    horizon_days=7,        # how many days ahead the target looks
    cv_splits=5,           # number of TimeSeriesSplit folds
    use_multi_coin=True,   # load the combined USDT + USDC file instead of USDT only
)

print("Model Configuration:")
print("\n".join(f"  {k}: {v}" for k, v in CONFIG.items()))

---
## 2. Load and Prepare Data

In [None]:
# Read the daily input table selected by CONFIG['use_multi_coin'].
# The single-coin branch sets `coin` explicitly so both branches expose that column.
if CONFIG['use_multi_coin']:
    df = pd.read_csv(PROCESSED_DATA_DIR / 'combined_stablecoins_daily.csv')
    print(f"Loaded combined data: {len(df):,} rows")
else:
    df = pd.read_csv(PROCESSED_DATA_DIR / 'usdt_merged_daily.csv').assign(coin='usdt')
    print(f"Loaded USDT data: {len(df):,} rows")

# Parse dates once so later sorting and min/max work on real timestamps.
df['date'] = pd.to_datetime(df['date'])
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"Coins: {df['coin'].unique().tolist()}")

In [None]:
def prepare_features(df, threshold=0.005, horizon_days=7):
    """Create per-coin model features and the forward-looking depeg target.

    Each coin is processed independently (sorted by ``date``) so rolling
    windows and percentage changes never mix rows from different coins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``coin``, ``date``, ``close``,
        ``quote_volume``, ``spread_proxy``, ``total_circulating`` and
        ``implied_price``.
    threshold : float, default 0.005
        Absolute deviation of ``implied_price`` from 1.0 that counts as a depeg.
    horizon_days : int, default 7
        Number of future rows (t+1 .. t+horizon_days) inspected for the target.

    Returns
    -------
    pandas.DataFrame
        The per-coin frames concatenated (coin by coin, fresh index) with the
        engineered columns, ``future_max_deviation`` and ``target``.
        ``target`` is 1.0 / 0.0, and NaN where no future observation exists
        (the last ``horizon_days`` rows of each coin), so callers can drop
        those rows instead of training on a made-up "no depeg" label.
    """
    all_dfs = []

    for coin in df['coin'].unique():
        coin_df = df[df['coin'] == coin].copy()
        coin_df = coin_df.sort_values('date').reset_index(drop=True)

        # === Price Features ===
        # NOTE(review): the `btc_*` names suggest `close` is the BTC price — confirm.
        coin_df['btc_return_1d'] = coin_df['close'].pct_change()
        coin_df['btc_return_7d'] = coin_df['close'].pct_change(periods=7)
        coin_df['btc_return_30d'] = coin_df['close'].pct_change(periods=30)
        coin_df['btc_volatility_7d'] = coin_df['btc_return_1d'].rolling(7).std()
        coin_df['btc_volatility_30d'] = coin_df['btc_return_1d'].rolling(30).std()

        # Drawdown relative to the trailing 30-row maximum (<= 0 by construction)
        coin_df['btc_rolling_max_30d'] = coin_df['close'].rolling(30).max()
        coin_df['btc_drawdown_30d'] = (coin_df['close'] - coin_df['btc_rolling_max_30d']) / coin_df['btc_rolling_max_30d']

        # === Volume Features ===
        coin_df['volume_ma_7d'] = coin_df['quote_volume'].rolling(7).mean()
        coin_df['volume_ma_30d'] = coin_df['quote_volume'].rolling(30).mean()
        coin_df['volume_ratio_7d'] = coin_df['quote_volume'] / coin_df['volume_ma_7d']
        coin_df['volume_ratio_30d'] = coin_df['quote_volume'] / coin_df['volume_ma_30d']

        # === Spread Features ===
        coin_df['spread_ma_7d'] = coin_df['spread_proxy'].rolling(7).mean()
        coin_df['spread_ma_30d'] = coin_df['spread_proxy'].rolling(30).mean()
        # A zero rolling std yields +/-inf or NaN here; callers are expected to clean those.
        coin_df['spread_zscore'] = (
            (coin_df['spread_proxy'] - coin_df['spread_ma_30d']) /
            coin_df['spread_proxy'].rolling(30).std()
        )

        # === Supply Features ===
        coin_df['supply_change_1d'] = coin_df['total_circulating'].pct_change()
        coin_df['supply_change_7d'] = coin_df['total_circulating'].pct_change(periods=7)
        coin_df['supply_volatility_7d'] = coin_df['supply_change_1d'].rolling(7).std()

        # === Price Deviation Features ===
        coin_df['price_deviation'] = coin_df['implied_price'] - 1.0
        coin_df['abs_deviation'] = coin_df['price_deviation'].abs()
        coin_df['deviation_ma_7d'] = coin_df['price_deviation'].rolling(7).mean()

        # === Interaction Features ===
        coin_df['stress_indicator'] = coin_df['spread_zscore'] * coin_df['volume_ratio_7d']
        coin_df['flight_to_safety'] = (-coin_df['btc_return_1d']) * coin_df['supply_change_1d'].clip(lower=0)

        # === Target: Will deviation exceed threshold in next N days? ===
        # The trailing max ending at row t+N covers rows t+1..t+N; shifting it
        # back by N aligns that window with row t.
        future_max = (
            coin_df['abs_deviation']
            .rolling(horizon_days, min_periods=1)
            .max()
            .shift(-horizon_days)
        )
        coin_df['future_max_deviation'] = future_max
        # Keep NaN where the future window is unknown; comparing NaN to the
        # threshold would otherwise silently label those rows as 0.
        coin_df['target'] = (future_max >= threshold).astype(float).where(future_max.notna())

        all_dfs.append(coin_df)

    return pd.concat(all_dfs, ignore_index=True)

# Build the modelling table using the threshold and horizon from CONFIG.
df_features = prepare_features(
    df,
    threshold=CONFIG['threshold'],
    horizon_days=CONFIG['horizon_days'],
)
print(f"\nFeatures created: {df_features.shape}")

In [None]:
# Model inputs, listed group by group; the order here is the column order of X.
FEATURE_COLS = (
    # BTC price features
    ['btc_return_1d', 'btc_return_7d', 'btc_return_30d',
     'btc_volatility_7d', 'btc_volatility_30d', 'btc_drawdown_30d']
    # Volume features
    + ['volume_ratio_7d', 'volume_ratio_30d']
    # Spread features
    + ['spread_proxy', 'spread_ma_7d', 'spread_zscore']
    # Buy pressure
    + ['buy_ratio']
    # Supply features
    + ['supply_change_1d', 'supply_change_7d', 'supply_volatility_7d']
    # Current deviation
    + ['price_deviation', 'abs_deviation']
    # Interaction features
    + ['stress_indicator', 'flight_to_safety']
)

# Optional sentiment input: only used when the loaded data carries it.
if 'fear_greed_value' in df_features.columns:
    FEATURE_COLS += ['fear_greed_value']

# With several coins in one table, give the model a 0/1 flag for USDC rows.
if CONFIG['use_multi_coin']:
    df_features['is_usdc'] = df_features['coin'].eq('usdc').astype(int)
    FEATURE_COLS += ['is_usdc']

print(f"Feature columns ({len(FEATURE_COLS)}):")
print(FEATURE_COLS)

In [None]:
# Clean data: turn +/-inf into NaN, drop rows with a missing feature or target,
# then order the rows chronologically.
# df_features is stacked coin by coin, so without the sort the positional 80/20
# split and TimeSeriesSplit further down would not be time-ordered when more
# than one coin is present.
df_clean = (
    df_features
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=FEATURE_COLS + ['target'])
    .sort_values(['date', 'coin'])
    .reset_index(drop=True)
)

X = df_clean[FEATURE_COLS]
y = df_clean['target']

print(f"\nFinal dataset:")
print(f"  Samples: {len(X):,}")
print(f"  Features: {X.shape[1]}")
print(f"  Target distribution:")
print(f"    Class 0 (no depeg): {(y==0).sum():,} ({(y==0).mean()*100:.1f}%)")
print(f"    Class 1 (depeg):    {(y==1).sum():,} ({(y==1).mean()*100:.1f}%)")

---
## 3. Train/Test Split

In [None]:
# Hold out the last 20% of rows (by position) as the test set.
# NOTE(review): positional slicing is only a time-based split if df_clean's rows
# are ordered by date — confirm the row order before trusting the date ranges.
# NOTE(review): the target looks horizon_days ahead, so labels of the final
# training rows overlap the start of the test period — consider a gap.
split_idx = int(len(X) * 0.8)
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# Matching date slices, used only for reporting.
train_dates = df_clean['date'].iloc[train_rows]
test_dates = df_clean['date'].iloc[test_rows]

print("Train/Test Split (Time-based):")
print(f"  Train: {len(X_train):,} samples ({train_dates.min().date()} to {train_dates.max().date()})")
print(f"  Test:  {len(X_test):,} samples ({test_dates.min().date()} to {test_dates.max().date()})")
print(f"\nTrain target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution:  {y_test.value_counts().to_dict()}")

In [None]:
# Standardise features; the scaler is fitted on the training rows only so no
# test-set statistics reach the models.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled with StandardScaler")

In [None]:
# Time-series splitter reused by the hyperparameter searches below.
tscv = TimeSeriesSplit(n_splits=CONFIG['cv_splits'])

# Draw one row of dots per fold: blue = rows used for fitting, red = rows used
# for validation. Only the first fold contributes legend entries.
fig, ax = plt.subplots(figsize=(12, 4))
for i, (train_idx, val_idx) in enumerate(tscv.split(X_train_scaled)):
    train_label = 'Train' if i==0 else ''
    val_label = 'Validation' if i==0 else ''
    ax.scatter(train_idx, np.full(len(train_idx), i), c='blue', s=1, label=train_label)
    ax.scatter(val_idx, np.full(len(val_idx), i), c='red', s=1, label=val_label)

ax.set(
    xlabel='Sample Index',
    ylabel='CV Fold',
    title='Time Series Cross-Validation Splits',
)
ax.legend()
plt.tight_layout()
plt.show()

---
## 4. Baseline Models

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a 4-tuple ``(metrics, model, y_pred, y_prob)``:

    * ``metrics`` - dict with keys ``model``, ``accuracy``, ``precision``,
      ``recall``, ``f1``, ``roc_auc`` and ``avg_precision``. The last two are
      NaN when the model has no ``predict_proba`` or ``y_test`` holds a single
      class.
    * ``model`` - the same estimator object, now fitted.
    * ``y_pred`` - hard predictions for ``X_test``.
    * ``y_prob`` - second column of ``predict_proba`` for ``X_test``, or
      ``None`` when unavailable.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = None
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_test)[:, 1]

    # Ranking metrics need scores and both classes present in y_test.
    can_rank = y_prob is not None and len(np.unique(y_test)) > 1

    metrics = {
        'model': model_name,
        'accuracy': (y_pred == y_test).mean(),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_prob) if can_rank else np.nan,
        'avg_precision': average_precision_score(y_test, y_prob) if can_rank else np.nan,
    }

    return metrics, model, y_pred, y_prob

# Accumulators filled by the baseline cells: one metrics dict per model, and
# the fitted estimators keyed by a short name.
results, models = [], {}

In [None]:
# Weight the positive class by the negative/positive ratio in the training set.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
class_weight = {0: 1, 1: n_negative / n_positive}
print(f"Class weight: {class_weight}")

In [None]:
# Baseline 1: logistic regression with balanced class weights.
print("Training Logistic Regression...")
lr = LogisticRegression(random_state=SEED, max_iter=1000, class_weight='balanced')
metrics, model, y_pred, y_prob = evaluate_model(
    model=lr,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name='Logistic Regression',
)
results.append(metrics)
models['lr'] = model
print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Baseline 2: random forest with balanced class weights and capped depth.
print("Training Random Forest...")
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 10,
    'class_weight': 'balanced',
    'random_state': SEED,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_settings)
metrics, model, y_pred, y_prob = evaluate_model(
    model=rf,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name='Random Forest',
)
results.append(metrics)
models['rf'] = model
print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Baseline 3: gradient boosting. The estimator is built without a class_weight
# argument here, so the imbalance is handled with per-row sample weights and
# the model is fitted and scored inline rather than via evaluate_model.
print("Training Gradient Boosting...")
gb = GradientBoostingClassifier(random_state=SEED, learning_rate=0.1, max_depth=5, n_estimators=100)

# Positive rows get class_weight[1]; negative rows get weight 1.
sample_weight = np.where(y_train == 1, class_weight[1], 1)
gb.fit(X_train_scaled, y_train, sample_weight=sample_weight)

y_pred = gb.predict(X_test_scaled)
y_prob = gb.predict_proba(X_test_scaled)[:, 1]

metrics = {'model': 'Gradient Boosting'}
metrics['accuracy'] = (y_pred == y_test).mean()
metrics['precision'] = precision_score(y_test, y_pred, zero_division=0)
metrics['recall'] = recall_score(y_test, y_pred, zero_division=0)
metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
metrics['roc_auc'] = roc_auc_score(y_test, y_prob)
metrics['avg_precision'] = average_precision_score(y_test, y_prob)

results.append(metrics)
models['gb'] = gb
print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Baseline 4: RBF-kernel SVM; probability=True enables predict_proba so the
# ranking metrics can be computed.
print("Training SVM...")
svm = SVC(random_state=SEED, probability=True, class_weight='balanced', kernel='rbf')
metrics, model, y_pred, y_prob = evaluate_model(
    model=svm,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name='SVM (RBF)',
)
results.append(metrics)
models['svm'] = model
print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Baseline 5: two-hidden-layer MLP with early stopping.
# NOTE(review): unlike the other baselines, no class weighting is applied here.
print("Training Neural Network...")
mlp = MLPClassifier(
    random_state=SEED,
    early_stopping=True,
    max_iter=500,
    activation='relu',
    hidden_layer_sizes=(64, 32),
)
metrics, model, y_pred, y_prob = evaluate_model(
    model=mlp,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name='Neural Network',
)
results.append(metrics)
models['mlp'] = model
print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Collect the per-model metric dicts into one table and show it.
results_df = pd.DataFrame(results)
print("\n" + "="*70)
print("BASELINE MODEL COMPARISON")
print("="*70)
baseline_table = results_df.round(4)
display(baseline_table)

---
## 5. Hyperparameter Tuning

In [None]:
# Report the baseline with the highest test ROC-AUC. The tuning cells below
# always tune Random Forest and Gradient Boosting, whichever baseline wins.
best_baseline_row = results_df['roc_auc'].idxmax()
best_baseline = results_df.loc[best_baseline_row, 'model']
print(f"Best baseline model: {best_baseline}")
print(f"\nTuning Random Forest and Gradient Boosting...")

In [None]:
# Randomised search over Random Forest settings, scored by ROC-AUC on the
# `tscv` folds of the training set.
print("\nTuning Random Forest...")

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10],
}

rf_base = RandomForestClassifier(class_weight='balanced', random_state=SEED, n_jobs=-1)
rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_params,
    n_iter=20,
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
    random_state=SEED,
)
rf_search.fit(X_train_scaled, y_train)

print(f"Best params: {rf_search.best_params_}")
print(f"Best CV ROC-AUC: {rf_search.best_score_:.4f}")

# Keep the estimator exposed by the search as `best_estimator_`.
models['rf_tuned'] = rf_search.best_estimator_

In [None]:
# Randomised search over Gradient Boosting settings, scored by ROC-AUC on the
# `tscv` folds. Reuses `sample_weight` from the Gradient Boosting baseline cell,
# so that cell must have been run first.
print("\nTuning Gradient Boosting...")

gb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'min_samples_split': [5, 10, 20],
}

gb_base = GradientBoostingClassifier(random_state=SEED)
gb_search = RandomizedSearchCV(
    estimator=gb_base,
    param_distributions=gb_params,
    n_iter=20,
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
    random_state=SEED,
)
gb_search.fit(X_train_scaled, y_train, sample_weight=sample_weight)

print(f"Best params: {gb_search.best_params_}")
print(f"Best CV ROC-AUC: {gb_search.best_score_:.4f}")

models['gb_tuned'] = gb_search.best_estimator_

In [None]:
# Score both tuned estimators on the held-out test rows.
tuned_results = []

for name, tuned_key in (('RF Tuned', 'rf_tuned'), ('GB Tuned', 'gb_tuned')):
    model = models[tuned_key]
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    metrics = {'model': name}
    metrics['accuracy'] = (y_pred == y_test).mean()
    metrics['precision'] = precision_score(y_test, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_test, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    metrics['roc_auc'] = roc_auc_score(y_test, y_prob)
    metrics['avg_precision'] = average_precision_score(y_test, y_prob)
    tuned_results.append(metrics)

# Baselines + tuned models in one table, best test ROC-AUC first.
all_results = (
    pd.concat([results_df, pd.DataFrame(tuned_results)], ignore_index=True)
    .sort_values('roc_auc', ascending=False)
)

print("\n" + "="*70)
print("ALL MODELS COMPARISON (sorted by ROC-AUC)")
print("="*70)
display(all_results.round(4))

---
## 6. Model Evaluation

In [None]:
# The first row of all_results is the model with the highest test ROC-AUC.
# NOTE(review): the winner is chosen on the same test set it is later reported
# on, so the reported test metrics are optimistic — consider selecting on CV scores.
best_model_name = all_results['model'].iloc[0]
print(f"Best model: {best_model_name}")

# Display name -> key in the `models` dict
model_map = {
    'Logistic Regression': 'lr',
    'Random Forest': 'rf',
    'Gradient Boosting': 'gb',
    'SVM (RBF)': 'svm',
    'Neural Network': 'mlp',
    'RF Tuned': 'rf_tuned',
    'GB Tuned': 'gb_tuned'
}

best_model_key = model_map[best_model_name]
best_model = models[best_model_key]

# Test-set predictions of the winner, reused by the evaluation cells below.
y_pred_best = best_model.predict(X_test_scaled)
y_prob_best = best_model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Per-class precision / recall / F1 for the selected model on the test set.
report_text = classification_report(y_test, y_pred_best, target_names=['No Depeg', 'Depeg'])

print("\n" + "="*60)
print(f"DETAILED EVALUATION: {best_model_name}")
print("="*60)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion Matrix
# Left panel: raw counts. Right panel: each row divided by its total, i.e. the
# share of each actual class that was predicted as each label.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
class_names = ['No Depeg', 'Depeg']

# Confusion matrix heatmap
ax1 = axes[0]
# Pin the label order so the matrix is always 2x2 and lines up with the tick
# labels, even if one class never occurs in y_test / y_pred_best.
cm = confusion_matrix(y_test, y_pred_best, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=class_names,
            yticklabels=class_names)
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')
ax1.set_title(f'Confusion Matrix - {best_model_name}')

# Normalized confusion matrix
ax2 = axes[1]
# A class with no test rows has a zero row total; leave that row at 0 instead
# of dividing by zero and plotting NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)
sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Blues', ax=ax2,
            xticklabels=class_names,
            yticklabels=class_names)
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')
ax2.set_title('Normalized Confusion Matrix')

plt.tight_layout()
plt.show()

In [None]:
# ROC curves (left) and precision-recall curves (right) on the test set for the
# two tuned models and logistic regression.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

curve_models = [('RF Tuned', 'rf_tuned'), ('GB Tuned', 'gb_tuned'), ('Logistic Regression', 'lr')]

for name, model_key in curve_models:
    if model_key not in models:
        continue
    y_prob = models[model_key].predict_proba(X_test_scaled)[:, 1]

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    ax1.plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})')

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    ap = average_precision_score(y_test, y_prob)
    ax2.plot(recall, precision, label=f'{name} (AP={ap:.3f})')

# Diagonal = a classifier that ranks at random.
ax1.plot([0, 1], [0, 1], 'k--', label='Random')
ax1.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curves')
ax1.legend(loc='lower right')

# Horizontal line = positive rate of the test set.
ax2.axhline(y=y_test.mean(), color='k', linestyle='--', label=f'Baseline ({y_test.mean():.3f})')
ax2.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curves')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# Sweep the probability cut-off of the selected model and record precision,
# recall and F1 on the test set at each value.
# NOTE(review): the "optimal" cut-off is picked on the test set, so its scores
# are optimistic — consider choosing it on validation data.
thresholds = np.arange(0.1, 0.9, 0.05)
threshold_metrics = []

for thresh in thresholds:
    y_pred_thresh = (y_prob_best >= thresh).astype(int)
    row = {'threshold': thresh}
    row['precision'] = precision_score(y_test, y_pred_thresh, zero_division=0)
    row['recall'] = recall_score(y_test, y_pred_thresh, zero_division=0)
    row['f1'] = f1_score(y_test, y_pred_thresh, zero_division=0)
    threshold_metrics.append(row)

thresh_df = pd.DataFrame(threshold_metrics)

fig, ax = plt.subplots(figsize=(10, 5))
for column, line_style, line_label in (
    ('precision', 'b-', 'Precision'),
    ('recall', 'r-', 'Recall'),
    ('f1', 'g-', 'F1'),
):
    ax.plot(thresh_df['threshold'], thresh_df[column], line_style, label=line_label)
# Default cut-off used by predict(), for reference.
ax.axvline(x=0.5, color='gray', linestyle='--', alpha=0.5)

# Mark optimal F1 threshold
best_thresh_idx = thresh_df['f1'].idxmax()
best_thresh = thresh_df.loc[best_thresh_idx, 'threshold']
ax.axvline(x=best_thresh, color='green', linestyle=':', label=f'Best F1 @ {best_thresh:.2f}')

ax.set(
    xlabel='Classification Threshold',
    ylabel='Score',
    title='Precision, Recall, F1 vs Classification Threshold',
)
ax.legend()
plt.tight_layout()
plt.show()

print(f"\nOptimal threshold for F1: {best_thresh:.2f}")
print(f"  Precision: {thresh_df.loc[best_thresh_idx, 'precision']:.4f}")
print(f"  Recall: {thresh_df.loc[best_thresh_idx, 'recall']:.4f}")
print(f"  F1: {thresh_df.loc[best_thresh_idx, 'f1']:.4f}")

---
## 7. Feature Importance

In [None]:
# Feature importance of the selected model.
# Tree ensembles expose `feature_importances_`; linear models expose `coef_`.
# Models with neither attribute get an explicit message instead of silent output.
if hasattr(best_model, 'feature_importances_'):
    importance = pd.DataFrame({
        'feature': FEATURE_COLS,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(importance['feature'], importance['importance'], color='steelblue')
    ax.set_xlabel('Importance')
    ax.set_title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.show()

    # Sorted ascending, so the tail holds the largest importances.
    print("\nTop 10 Features:")
    print(importance.tail(10).to_string(index=False))

elif hasattr(best_model, 'coef_'):
    # For linear models: rank by absolute coefficient, colour by sign.
    importance = pd.DataFrame({
        'feature': FEATURE_COLS,
        'coefficient': best_model.coef_[0]
    }).sort_values('coefficient', key=abs, ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Named `bar_colors` so the notebook-level `colors` palette dict is not overwritten.
    bar_colors = ['red' if x < 0 else 'green' for x in importance['coefficient']]
    ax.barh(importance['feature'], importance['coefficient'], color=bar_colors)
    ax.set_xlabel('Coefficient')
    ax.set_title(f'Feature Coefficients - {best_model_name}')
    ax.axvline(x=0, color='black', linewidth=0.5)
    plt.tight_layout()
    plt.show()

else:
    print(f"{best_model_name} exposes neither feature_importances_ nor coef_; skipping importance plot.")

In [None]:
# Side-by-side top-10 importances of the two tuned tree ensembles.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [('Random Forest', 'rf_tuned'), ('Gradient Boosting', 'gb_tuned')]

for ax, (name, model_key) in zip(axes, panels):
    # Leave the panel empty if the model is missing or has no importances.
    if model_key not in models or not hasattr(models[model_key], 'feature_importances_'):
        continue

    imp = pd.DataFrame({
        'feature': FEATURE_COLS,
        'importance': models[model_key].feature_importances_
    })
    imp = imp.sort_values('importance', ascending=True).tail(10)

    ax.barh(imp['feature'], imp['importance'], color='steelblue')
    ax.set_xlabel('Importance')
    ax.set_title(f'Top 10 Features - {name}')

plt.tight_layout()
plt.show()

---
## 8. Save Best Model

In [None]:
# Persist what is needed to reuse the selected model: the estimator, the fitted
# scaler, the feature order, and a short text record of the run.
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(exist_ok=True)

model_path = MODELS_DIR / 'best_model.joblib'
scaler_path = MODELS_DIR / 'scaler.joblib'
features_path = MODELS_DIR / 'features.txt'
config_path = MODELS_DIR / 'config.txt'

# Save best model
joblib.dump(best_model, model_path)
print(f"Saved best model to: {model_path}")

# Save scaler
joblib.dump(scaler, scaler_path)
print(f"Saved scaler to: {scaler_path}")

# Save feature list (one name per line, in the column order used for training)
feature_text = '\n'.join(FEATURE_COLS)
with open(features_path, 'w') as f:
    f.write(feature_text)
print(f"Saved feature list to: {features_path}")

# Save model config
config_lines = [
    f"Model: {best_model_name}\n",
    f"Threshold: {CONFIG['threshold']}\n",
    f"Horizon: {CONFIG['horizon_days']} days\n",
    f"Train date range: {train_dates.min().date()} to {train_dates.max().date()}\n",
    f"Test date range: {test_dates.min().date()} to {test_dates.max().date()}\n",
    f"Test ROC-AUC: {all_results.iloc[0]['roc_auc']:.4f}\n",
]
with open(config_path, 'w') as f:
    f.writelines(config_lines)
print(f"Saved config to: {config_path}")

In [None]:
# Write the full model comparison table next to the saved model.
results_path = MODELS_DIR / 'model_comparison.csv'
all_results.to_csv(path_or_buf=results_path, index=False)
print(f"Saved results to: {results_path}")

---
## 9. Summary

In [None]:
# Final recap: dataset size, configuration, metrics of the selected model and
# the files written by the save cells.
best_metrics = all_results.iloc[0]

print("="*70)
print("MODEL TRAINING SUMMARY")
print("="*70)

print(f"\nDataset:")
print(f"  Total samples: {len(X):,}")
print(f"  Features: {len(FEATURE_COLS)}")
print(f"  Positive class rate: {y.mean()*100:.2f}%")

print(f"\nConfiguration:")
print(f"  Depeg threshold: {CONFIG['threshold']*100:.1f}%")
print(f"  Prediction horizon: {CONFIG['horizon_days']} days")

print(f"\nBest Model: {best_model_name}")
print(f"  ROC-AUC: {best_metrics['roc_auc']:.4f}")
print(f"  Precision: {best_metrics['precision']:.4f}")
print(f"  Recall: {best_metrics['recall']:.4f}")
print(f"  F1 Score: {best_metrics['f1']:.4f}")

print(f"\nFiles saved:")
for saved_path in (model_path, scaler_path, features_path, config_path, results_path):
    print(f"  {saved_path}")

print("\n" + "="*70)
print("Training complete!")
print("="*70)

---
## Appendix: Load and Use Saved Model

In [None]:
# Example: Load and use saved model
# Uncomment to run

# # Load model and scaler
# loaded_model = joblib.load(MODELS_DIR / 'best_model.joblib')
# loaded_scaler = joblib.load(MODELS_DIR / 'scaler.joblib')

# # Load feature list
# with open(MODELS_DIR / 'features.txt', 'r') as f:
#     features = f.read().strip().split('\n')

# # Prepare new data (same feature engineering as training)
# # new_data = ...
# # X_new = new_data[features]
# # X_new_scaled = loaded_scaler.transform(X_new)

# # Predict
# # y_pred = loaded_model.predict(X_new_scaled)
# # y_prob = loaded_model.predict_proba(X_new_scaled)[:, 1]