In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

In [3]:
X_train = pd.read_csv('../data/X_train_transformed.csv')
X_test = pd.read_csv('../data/X_test_transformed.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

In [4]:
# Normalize the target labels for XGBoost
y_train_normalized = y_train - 1
y_test_normalized = y_test - 1

### RANDOM FOREST CLASSIFIER ###

In [5]:
# parameter tuning
rf_params = {
    'n_estimators': [150, 350, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5]
}

rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='accuracy', verbose=1)
rf_grid.fit(X_train, y_train_normalized)

rf_best_model = rf_grid.best_estimator_
rf_best_params = rf_grid.best_params_
rf_accuracy = cross_val_score(rf_best_model, X_test, y_test_normalized, cv=5, scoring='accuracy')

rf_results = {
    'Best Parameters': rf_best_params,
    'Mean Accuracy': np.mean(rf_accuracy),
    'Std Accuracy': np.std(rf_accuracy),
    'Individual Accuracy Scores': rf_accuracy.tolist()
}

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [6]:
# XGBoost parameter tuning
xgb_params = {
    'n_estimators': [150, 350, 500],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2]
}

xgb_model = XGBClassifier(
    tree_method='gpu_hist',  
    random_state=42,
    eval_metric='logloss'
)

xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy', verbose=1)
xgb_grid.fit(X_train, y_train_normalized)

xgb_best_model = xgb_grid.best_estimator_
xgb_best_params = xgb_grid.best_params_
xgb_accuracy = cross_val_score(xgb_best_model, X_test, y_test_normalized, cv=5, scoring='accuracy')

xgb_results = {
    'Best Parameters': xgb_best_params,
    'Mean Accuracy': np.mean(xgb_accuracy),
    'Std Accuracy': np.std(xgb_accuracy),
    'Individual Accuracy Scores': xgb_accuracy.tolist(),
}

Fitting 5 folds for each of 27 candidates, totalling 135 fits



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tr

In [15]:
rf_results, xgb_results

({'Best Parameters': {'max_depth': 30,
   'min_samples_split': 5,
   'n_estimators': 350},
  'Mean Accuracy': 0.825979493976843,
  'Std Accuracy': 0.022860667295885016,
  'Individual Accuracy Scores': [0.788546255506608,
   0.8546255506607929,
   0.8141592920353983,
   0.8407079646017699,
   0.831858407079646]},
 {'Best Parameters': {'learning_rate': 0.2,
   'max_depth': 4,
   'n_estimators': 500},
  'Mean Accuracy': 0.8560017153327356,
  'Std Accuracy': 0.03768034965476752,
  'Individual Accuracy Scores': [0.801762114537445,
   0.9162995594713657,
   0.8672566371681416,
   0.8362831858407079,
   0.8584070796460177]})

In [16]:
print("Random Forest Results:", rf_results)
print("\nXGBoost Results:", xgb_results)

Random Forest Results: {'Best Parameters': {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 350}, 'Mean Accuracy': 0.825979493976843, 'Std Accuracy': 0.022860667295885016, 'Individual Accuracy Scores': [0.788546255506608, 0.8546255506607929, 0.8141592920353983, 0.8407079646017699, 0.831858407079646]}

XGBoost Results: {'Best Parameters': {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 500}, 'Mean Accuracy': 0.8560017153327356, 'Std Accuracy': 0.03768034965476752, 'Individual Accuracy Scores': [0.801762114537445, 0.9162995594713657, 0.8672566371681416, 0.8362831858407079, 0.8584070796460177]}


In [10]:
rf_results_table = pd.DataFrame([{
    "Model": "Random Forest",
    "Best Parameters": rf_results['Best Parameters'],
    "Mean Accuracy": rf_results['Mean Accuracy'],
    "Std Accuracy": rf_results['Std Accuracy'],
    "Individual Accuracy Scores": rf_results['Individual Accuracy Scores']
}])

xgb_results_table = pd.DataFrame([{
    "Model": "XGBoost",
    "Best Parameters": xgb_results['Best Parameters'],
    "Mean Accuracy": xgb_results['Mean Accuracy'],
    "Std Accuracy": xgb_results['Std Accuracy'],
    "Individual Accuracy Scores": xgb_results['Individual Accuracy Scores']
}])

combined_results_table = pd.concat([rf_results_table, xgb_results_table], ignore_index=True)
combined_results_table


Unnamed: 0,Model,Best Parameters,Mean Accuracy,Std Accuracy,Individual Accuracy Scores
0,Random Forest,"{'max_depth': 30, 'min_samples_split': 5, 'n_e...",0.825979,0.022861,"[0.788546255506608, 0.8546255506607929, 0.8141..."
1,XGBoost,"{'learning_rate': 0.2, 'max_depth': 4, 'n_esti...",0.856002,0.03768,"[0.801762114537445, 0.9162995594713657, 0.8672..."


In [14]:
combined_results_table.to_csv("../data/model_tuning_results.csv", index=False)