# In [None]:
# =============================================================================
# 03_model_training_evaluation.ipynb
# =============================================================================

import os # To create the output directory for saved models
import pickle # To save/load models

import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier # Using LightGBM as the ensemble model
from sklearn.linear_model import LogisticRegression

from src.model_utils import evaluate_model # Import the evaluation function

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warnings raised during model
# fitting below are hidden as well.
import warnings
warnings.simplefilter('ignore')

print("Libraries and custom modules loaded successfully!")

# --- Load Preprocessed Data ---
# Each pickle is read as a dict exposing the resampled training split, the test
# split and the feature names under the keys used below.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# produced by this project's own preprocessing step.
print("\n--- Loading Preprocessed Data ---")
try:
    with open('../data/processed_ecommerce_data.pkl', 'rb') as eco_file:
        ecommerce_data = pickle.load(eco_file)
    (
        X_train_eco_resampled,
        y_train_eco_resampled,
        X_test_eco,
        y_test_eco,
        ecommerce_feature_names,  # kept for feature-name consistency
    ) = (
        ecommerce_data['X_train_resampled'],
        ecommerce_data['y_train_resampled'],
        ecommerce_data['X_test'],
        ecommerce_data['y_test'],
        ecommerce_data['feature_names'],
    )

    with open('../data/processed_bank_data.pkl', 'rb') as bank_file:
        bank_data = pickle.load(bank_file)
    (
        X_train_bank_resampled,
        y_train_bank_resampled,
        X_test_bank,
        y_test_bank,
        bank_feature_names,
    ) = (
        bank_data['X_train_resampled'],
        bank_data['y_train_resampled'],
        bank_data['X_test'],
        bank_data['y_test'],
        bank_data['feature_names'],
    )
except FileNotFoundError:
    print("Error: Processed data not found. Please ensure Task 1 notebooks were run and data saved correctly.")
    raise  # Nothing below can run without the data, so stop here
else:
    print("Preprocessed E-commerce and Bank data loaded successfully.")


# =============================================================================
# Task 2 - Model Building and Training
# =============================================================================

# =============================================================================
# 2.1 Model Training and Evaluation - E-commerce Data
# =============================================================================
print("\n--- Training and Evaluating Models for E-commerce Data ---")

# --- Model 1: Logistic Regression ---
# Linear baseline: fitted on the resampled training split, then scored on the
# test split through the project helper evaluate_model. The returned metrics
# are read later in this file via the 'f1_score' and 'auc_pr' keys.
# NOTE(review): class_weight='balanced' has no effect if the resampled training
# labels are already evenly split -- confirm against the preprocessing step.
print("\nTraining Logistic Regression for E-commerce...")
lr_ecommerce = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', max_iter=1000)
lr_ecommerce.fit(X_train_eco_resampled, y_train_eco_resampled)
print("Logistic Regression (E-commerce) training complete.")
metrics_lr_eco = evaluate_model(lr_ecommerce, X_test_eco, y_test_eco, "Logistic Regression (E-commerce)")

# --- Model 2: LightGBM Classifier ---
# `metric` only names the evaluation metric LightGBM reports; the training loss
# is set by `objective`. LightGBM's name for the area under the
# precision-recall curve is 'average_precision' ('aucpr' is the XGBoost
# spelling and is not a documented LightGBM metric).
print("\nTraining LightGBM for E-commerce...")
lgbm_ecommerce = LGBMClassifier(random_state=42, n_estimators=500, learning_rate=0.05, num_leaves=31,
                                 objective='binary', metric='average_precision', # PR-AUC as evaluation metric
                                 is_unbalance=False, # Set to False if SMOTE is used on training data
                                )
lgbm_ecommerce.fit(X_train_eco_resampled, y_train_eco_resampled)
print("LightGBM (E-commerce) training complete.")
metrics_lgbm_eco = evaluate_model(lgbm_ecommerce, X_test_eco, y_test_eco, "LightGBM (E-commerce)")


# =============================================================================
# 2.2 Model Training and Evaluation - Bank Transaction Data
# =============================================================================
print("\n--- Training and Evaluating Models for Bank Transaction Data ---")

# --- Model 1: Logistic Regression ---
# Linear baseline: fitted on the resampled training split, then scored on the
# test split through the project helper evaluate_model. The returned metrics
# are read later in this file via the 'f1_score' and 'auc_pr' keys.
# NOTE(review): class_weight='balanced' has no effect if the resampled training
# labels are already evenly split -- confirm against the preprocessing step.
print("\nTraining Logistic Regression for Bank Transactions...")
lr_bank = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', max_iter=1000)
lr_bank.fit(X_train_bank_resampled, y_train_bank_resampled)
print("Logistic Regression (Bank) training complete.")
metrics_lr_bank = evaluate_model(lr_bank, X_test_bank, y_test_bank, "Logistic Regression (Bank)")

# --- Model 2: LightGBM Classifier ---
# `metric` only names the evaluation metric LightGBM reports; the training loss
# is set by `objective`. LightGBM's name for the area under the
# precision-recall curve is 'average_precision' ('aucpr' is the XGBoost
# spelling and is not a documented LightGBM metric).
print("\nTraining LightGBM for Bank Transactions...")
lgbm_bank = LGBMClassifier(random_state=42, n_estimators=500, learning_rate=0.05, num_leaves=31,
                           objective='binary', metric='average_precision', # PR-AUC as evaluation metric
                           is_unbalance=False # Set to False if SMOTE is used
                           )
lgbm_bank.fit(X_train_bank_resampled, y_train_bank_resampled)
print("LightGBM (Bank) training complete.")
metrics_lgbm_bank = evaluate_model(lgbm_bank, X_test_bank, y_test_bank, "LightGBM (Bank)")


# =============================================================================
# 2.3 Justify Model Selection
# =============================================================================
print("\n--- Model Selection Justification ---")

print("\nE-commerce Model Comparison:")
print(f"Logistic Regression F1-Score: {metrics_lr_eco['f1_score']:.4f}, AUC-PR: {metrics_lr_eco['auc_pr']:.4f}")
print(f"LightGBM F1-Score: {metrics_lgbm_eco['f1_score']:.4f}, AUC-PR: {metrics_lgbm_eco['auc_pr']:.4f}")

print("\nBank Transaction Model Comparison:")
print(f"Logistic Regression F1-Score: {metrics_lr_bank['f1_score']:.4f}, AUC-PR: {metrics_lr_bank['auc_pr']:.4f}")
print(f"LightGBM F1-Score: {metrics_lgbm_bank['f1_score']:.4f}, AUC-PR: {metrics_lgbm_bank['auc_pr']:.4f}")


def _select_best(candidates):
    """Return (name, model) for the candidate with the highest AUC-PR.

    Each candidate is a (name, model, metrics) tuple whose metrics dict has
    'auc_pr' and 'f1_score' entries. Ties on AUC-PR are broken by F1-score;
    on a complete tie the candidate listed first wins.
    """
    name, model, _metrics = max(
        candidates,
        key=lambda cand: (cand[2]['auc_pr'], cand[2]['f1_score']),
    )
    return name, model


# Select the best model for each dataset from the measured metrics rather than
# assuming a winner. LightGBM is listed first so it is kept on an exact tie.
best_name_ecommerce, best_model_ecommerce = _select_best([
    ("LightGBM", lgbm_ecommerce, metrics_lgbm_eco),
    ("Logistic Regression", lr_ecommerce, metrics_lr_eco),
])
best_name_bank, best_model_bank = _select_best([
    ("LightGBM", lgbm_bank, metrics_lgbm_bank),
    ("Logistic Regression", lr_bank, metrics_lr_bank),
])

print("\nJustification for 'Best' Model Selection:")
print(f"Selected model (highest AUC-PR, ties broken by F1-Score) - E-commerce: {best_name_ecommerce}; Bank Transactions: {best_name_bank}.")
if best_name_ecommerce == best_name_bank == "LightGBM":
    # The written rationale below argues for LightGBM, so it is only printed
    # when LightGBM was actually selected for both datasets.
    print("For both E-commerce and Bank Transaction datasets, LightGBM is chosen as the 'best' performing model.")
    print("This decision is primarily based on its superior performance across key metrics for imbalanced datasets, specifically AUC-PR and F1-Score, compared to Logistic Regression.")
    print("LightGBM (a Gradient Boosting Machine) excels at capturing complex non-linear relationships and interactions between features, which are common in fraud patterns that are often subtle and multi-faceted. Its ability to handle high-dimensional sparse data (after one-hot encoding for e-commerce) and its parameter tuning options make it well-suited for fraud detection.")
    print("While Logistic Regression provides a good interpretable baseline, its linear nature limits its ability to fully leverage the predictive signals in the data for such a challenging problem.")
    print("The confusion matrices further illustrate that LightGBM achieves a better balance between minimizing False Negatives (missed fraud) and controlling False Positives (false alarms), which is a critical business trade-off in fraud detection.")
else:
    print("LightGBM did not score highest on both datasets in this run, so the selection above follows the measured AUC-PR / F1-Score. Review the metrics and confusion matrices before relying on the saved models.")


# --- Save Best Models ---
# Pickle the selected model for each dataset under ../models.
print("\nSaving the best performing models...")

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('../models', exist_ok=True)

with open('../models/ecommerce_best_model.pkl', 'wb') as f:
    pickle.dump(best_model_ecommerce, f)
print("E-commerce best model saved to models/ecommerce_best_model.pkl")

with open('../models/bank_best_model.pkl', 'wb') as f:
    pickle.dump(best_model_bank, f)
print("Bank best model saved to models/bank_best_model.pkl")

print("\nModel Building and Training Complete!")