In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

import json
import os

# Read the processed, cleaned returns dataset and cast the label column to
# integers in the same step.
target_col = "Return_Status"
df = pd.read_csv("../data/processed/processed_returns.csv").astype({target_col: int})

# Separate the feature matrix from the label vector.
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for testing. Stratifying on y preserves the class
# proportions in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise the features. The scaler statistics are learned from the
# training partition only and then applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Ready for Hyperparameter Tuning")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Data Ready for Hyperparameter Tuning


In [2]:
# Candidate values for the random forest search: 3 * 3 * 2 * 2 = 36 combinations.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator: class weights are balanced and the seed is fixed so that
# repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42, class_weight="balanced")

# Exhaustive 3-fold cross-validated search scored by ROC-AUC, using all
# available CPU cores (n_jobs=-1).
rf_grid = GridSearchCV(rf, rf_params, scoring="roc_auc", cv=3, n_jobs=-1, verbose=1)
rf_grid.fit(X_train_scaled, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best Random Forest Params:", rf_grid.best_params_)
print("Best RF ROC-AUC:", rf_grid.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Random Forest Params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best RF ROC-AUC: 0.9992761787642711


In [3]:
# Candidate values for the gradient boosting search: 2 * 2 * 3 = 12 combinations.
gb_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [2, 3, 4]
}

# Seed the estimator so the search is reproducible under Restart & Run All.
# Without random_state, the internal randomness of the trees is drawn from
# the global NumPy state and the selected parameters/score can vary between
# runs. 42 matches the seed used for the train/test split and the random
# forest in this notebook.
gb = GradientBoostingClassifier(random_state=42)

# Exhaustive 3-fold cross-validated search scored by ROC-AUC, using all
# available CPU cores (n_jobs=-1).
gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=gb_params,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1
)

gb_grid.fit(X_train_scaled, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best Gradient Boosting Params:", gb_grid.best_params_)
print("Best GB ROC-AUC:", gb_grid.best_score_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Gradient Boosting Params: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200}
Best GB ROC-AUC: 0.9993377036359036


In [4]:
# Gather the best parameters and the best mean cross-validated score of each
# fitted search into one nested dict, keyed by model label.
tuning_results = {
    label: {
        "best_params": search.best_params_,
        "best_score": search.best_score_,
    }
    for label, search in (
        ("Random_Forest", rf_grid),
        ("Gradient_Boosting", gb_grid),
    )
}

print(tuning_results)


{'Random_Forest': {'best_params': {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, 'best_score': 0.9992761787642711}, 'Gradient_Boosting': {'best_params': {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200}, 'best_score': 0.9993377036359036}}


In [5]:
# Destination for the tuning summary, relative to the notebook's directory.
save_path = "../results/hyperparameter_tuning_results.json"

# Create the output directory if it is missing. Otherwise open() raises
# FileNotFoundError on a fresh checkout, and only after the expensive grid
# searches above have already finished.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Persist the collected results as indented, human-readable JSON.
with open(save_path, "w") as f:
    json.dump(tuning_results, f, indent=4)

print("Hyperparameter tuning results saved to:", save_path)


Hyperparameter tuning results saved to: ../results/hyperparameter_tuning_results.json
