# 03. Training and Test

Train/validate and test different models for each resampling strategy.

## 01. Imports and Settings

In [None]:
# Imports
from catboost import CatBoostClassifier
from IPython.display import display
from libs.hyper_optimization import (
    HyperParamRandomForestClassifier,
    HyperParamXGBoostClassifier,
    HyperParamLightGBMClassifier,
    HyperParamCatBoostClassifier
)  # hyper_optimization.py
from libs.model_evaluation import (
    plot_roc_curve,
    plot_pr_curve,
    train_validate_model,
    test_model,
    plot_feature_importances
)  # model_evaluation.py
from libs.utils import split_X_y, save_object  # utils.py
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import pandas as pd
import warnings


# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")

# Pandas display settings
pd.set_option("display.float_format", "{:.2f}".format)  # Floats rendered with two decimals
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 150  # Increase column width

# Default paths (used as string prefixes, hence the trailing slash)
DATA_PATH = "data/"
BIN_PATH = "bin/"

## 02. Load Data

In [None]:
# Registry of results: strategy -> {"data": training frame, <classifier>: model + metrics}
resampled = {}

# One entry per resampling technique
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):
    # Empty DataFrame as placeholder until the training data is loaded
    entry = {"data": pd.DataFrame()}

    # One slot per classifier type, filled in by the training cells
    for clf in ("rf", "xgb", "lgb", "catboost", "xgb_boruta"):
        entry[clf] = dict(obj=object(), train_metrics={}, test_metrics={})

    resampled[strategy] = entry

In [None]:
# Load the training data produced by each resampling technique
# (files are named train_data_<TECHNIQUE>.csv under the resampling folder)
for technique in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):
    resampled[technique]["data"] = pd.read_csv(
        f"{DATA_PATH}resampling/train_data_{technique}.csv"
    )

# Test data, shared by every strategy
test_df = pd.read_csv(f"{DATA_PATH}test_data.csv")
# Split test data into X and y ("is_target" is the label column)
X_test, y_test = split_X_y(test_df, "is_target", [])

In [None]:
# Categorical columns
categorical_cols = [
    "join_s",
    "sch_s",
    "sch_r",
]
# Numerical columns
numerical_cols = [
    "adv_r",
    "adv_s",
    "data_r",
    "data_s",
    "dist_ch_to_bs",
    "dist_to_ch",
    "expaned_energy",
    "rank",
    "send_code",
    "who_ch",
]
# Full feature list: categorical columns first, then numerical ones
features = [*categorical_cols, *numerical_cols]
# Features selected by Boruta
boruta_features = [
    "adv_s",
    "expaned_energy",
    "dist_ch_to_bs",
    "adv_r",
    "who_ch",
    "data_r",
]

## 03. Train Models

### 03.1. Random Forest

In [None]:
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):  # One pass per resampling strategy
    print(f"\n>> {strategy}\n")
    slot = resampled[strategy]["rf"]  # Where this model and its metrics are stored

    # Features / target of the current strategy's training data
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (100 trials); the third returned value is not used
    searcher = HyperParamRandomForestClassifier(X_train, y_train, n_trials=100)
    cv_model, best_params, _ = searcher.run()

    # Cross-validation of the model returned by the search
    # (k = 5 per the notebook's note; set inside train_validate_model — TODO confirm)
    slot["train_metrics"] = train_validate_model(cv_model, X_train, y_train)

    # Fit a fresh estimator with the best hyperparameters on the training data
    model = RandomForestClassifier(**best_params)
    model.fit(X_train, y_train)

    # Evaluate on the test data
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")
    # ROC curve, then PR curve
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}rf_{strategy}_obj")
    print("\n")

    # Feature importance for all features
    # NOTE(review): labels come from `features`; assumes X_train columns follow that order — confirm
    plot_feature_importances(model, features, top_n=len(features))
    print("\n\n\n")

### 03.2. XGBoost

In [None]:
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):  # One pass per resampling strategy
    print(f"\n>> {strategy}\n")
    slot = resampled[strategy]["xgb"]  # Where this model and its metrics are stored

    # Features / target of the current strategy's training data
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (100 trials); the third returned value is not used
    searcher = HyperParamXGBoostClassifier(X_train, y_train, n_trials=100)
    cv_model, best_params, _ = searcher.run()

    # Cross-validation of the model returned by the search
    # (k = 5 per the notebook's note; set inside train_validate_model — TODO confirm)
    slot["train_metrics"] = train_validate_model(cv_model, X_train, y_train)

    # Fit a fresh estimator with the best hyperparameters on the training data
    model = XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    # Evaluate on the test data
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")
    # ROC curve, then PR curve
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}xgb_{strategy}_obj")
    print("\n")

    # Feature importance for all features
    # NOTE(review): labels come from `features`; assumes X_train columns follow that order — confirm
    plot_feature_importances(model, features, top_n=len(features))
    print("\n\n\n")

### 03.3. LightGBM

In [None]:
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):  # One pass per resampling strategy
    print(f"\n>> {strategy}\n")
    slot = resampled[strategy]["lgb"]  # Where this model and its metrics are stored

    # Features / target of the current strategy's training data
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (100 trials); the third returned value is not used
    searcher = HyperParamLightGBMClassifier(X_train, y_train, n_trials=100)
    cv_model, best_params, _ = searcher.run()

    # Cross-validation of the model returned by the search
    # (k = 5 per the notebook's note; set inside train_validate_model — TODO confirm)
    slot["train_metrics"] = train_validate_model(cv_model, X_train, y_train)

    # Fit a fresh estimator with the best hyperparameters; verbosity=-1 keeps LightGBM quiet
    model = LGBMClassifier(**best_params, verbosity=-1)
    model.fit(X_train, y_train)

    # Evaluate on the test data
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")
    # ROC curve, then PR curve
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}lgb_{strategy}_obj")
    print("\n")

    # Feature importance for all features
    # NOTE(review): labels come from `features`; assumes X_train columns follow that order — confirm
    plot_feature_importances(model, features, top_n=len(features))
    print("\n\n\n")

### 03.4. CatBoost

In [None]:
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):  # One pass per resampling strategy
    print(f"\n>> {strategy}\n")
    slot = resampled[strategy]["catboost"]  # Where this model and its metrics are stored

    # Features / target of the current strategy's training data
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (100 trials); the third returned value is not used
    searcher = HyperParamCatBoostClassifier(X_train, y_train, n_trials=100)
    cv_model, best_params, _ = searcher.run()

    # Cross-validation of the model returned by the search
    # (k = 5 per the notebook's note; set inside train_validate_model — TODO confirm)
    slot["train_metrics"] = train_validate_model(cv_model, X_train, y_train)

    # Fit a fresh estimator with the best hyperparameters; the categorical columns are
    # declared explicitly and training output is silenced
    model = CatBoostClassifier(
        **best_params,
        cat_features=categorical_cols,
        logging_level="Silent",
    )
    model.fit(X_train, y_train)

    # Evaluate on the test data
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")
    # ROC curve, then PR curve
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}catboost_{strategy}_obj")
    print("\n")

    # Feature importance for all features
    # NOTE(review): labels come from `features`; assumes X_train columns follow that order — confirm
    plot_feature_importances(model, features, top_n=len(features))
    print("\n\n\n")

### 03.5. XGBoost-Boruta

In [None]:
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")
    slot = resampled[strategy]["xgb_boruta"]  # Where this model and its metrics are stored

    # Split into X and y, keeping only the Boruta-selected features (plus the target)
    X_train, y_train = split_X_y(
        resampled[strategy]["data"][boruta_features + ["is_target"]], "is_target", [])
    # Same column subset for the test data, selected once and reused below
    X_test_boruta = X_test[boruta_features]

    # Optimizing hyperparameters (100 trials); the third returned value is not used
    model, best_hyperparams, _ = HyperParamXGBoostClassifier(
        X_train, y_train, n_trials=100).run()

    # Cross-validation of the model returned by the search
    # (k = 5 per the notebook's note; set inside train_validate_model — TODO confirm)
    slot["train_metrics"] = train_validate_model(model, X_train, y_train)
    # Training a fresh model with the best hyperparameters (XGBoost output silenced)
    model = XGBClassifier(**best_hyperparams, verbosity=0)
    model.fit(X_train, y_train)
    # Model testing, only Boruta features
    slot["test_metrics"] = test_model(model, X_test_boruta, y_test)
    print("\n")
    # ROC and PR Curves
    plot_roc_curve(model, X_test_boruta, y_test)
    print("\n")
    plot_pr_curve(model, X_test_boruta, y_test)
    # Save trained model
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}xgb_boruta_{strategy}_obj")
    print("\n")

    # Feature Importance
    # The model was fitted on `boruta_features` only, so the labels and top_n must come
    # from that list; the full `features` list has more names than the model has inputs
    plot_feature_importances(model, boruta_features, top_n=len(boruta_features))
    print("\n\n\n")

## 04. Synthesized Results

In [None]:
# One row per (strategy, classifier); the "data" entry is the training frame, not a model
results = [
    {
        "strategy": strategy,
        "model": model_name.upper(),
        # Metric dictionaries are stored as-is, one per column
        "train_metrics": model_metrics.get("train_metrics", {}),
        "test_metrics": model_metrics.get("test_metrics", {}),
    }
    for strategy, models in resampled.items()
    for model_name, model_metrics in models.items()
    if model_name != "data"
]
# Tabulate and show the collected results
results_df = pd.DataFrame(results)
display(results_df)

In [None]:
# Write the results table to CSV in the working directory, without the row index
results_df.to_csv(path_or_buf="training_results.csv", index=False)