In [1]:
# 📌 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
import joblib

# Set visualization style
sns.set(style="whitegrid")

# 📌 Step 2: Load Preprocessed Data
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").values.ravel()
y_test = pd.read_csv("../data/processed/y_test.csv").values.ravel()

# 📌 Step 3: Apply SMOTE for Balancing Data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"New Training Set Shape After SMOTE: {X_train_resampled.shape}")

# 📌 Step 4: Define Hyperparameter Grid for Random Forest
rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

rf_model = RandomForestClassifier(random_state=42, class_weight="balanced")
grid_rf = GridSearchCV(rf_model, rf_params, cv=3, scoring="f1", n_jobs=-1)
grid_rf.fit(X_train_resampled, y_train_resampled)

# 📌 Step 5: Train the Best Random Forest Model
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("\n🎯 Tuned Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))

# 📌 Step 6: Define Hyperparameter Grid for XGBoost
xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}

xgb_model = XGBClassifier(random_state=42, scale_pos_weight=2)
grid_xgb = GridSearchCV(xgb_model, xgb_params, cv=3, scoring="f1", n_jobs=-1)
grid_xgb.fit(X_train_resampled, y_train_resampled)

# 📌 Step 7: Train the Best XGBoost Model
best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print("\n🔥 Tuned XGBoost Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))

# 📌 Step 8: Save the Best Performing Model
joblib.dump(best_rf, "../models/best_random_forest.pkl")
joblib.dump(best_xgb, "../models/best_xgboost.pkl")

print("\n✅ Hyperparameter Tuning Completed & Best Models Saved!")


New Training Set Shape After SMOTE: (8278, 20)

🎯 Tuned Random Forest Performance:
Accuracy: 0.7736
              precision    recall  f1-score   support

         0.0       0.86      0.83      0.84      1035
         1.0       0.57      0.62      0.59       374

    accuracy                           0.77      1409
   macro avg       0.71      0.72      0.72      1409
weighted avg       0.78      0.77      0.78      1409


🔥 Tuned XGBoost Performance:
Accuracy: 0.7395
              precision    recall  f1-score   support

         0.0       0.90      0.72      0.80      1035
         1.0       0.51      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409


✅ Hyperparameter Tuning Completed & Best Models Saved!
