In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
X_train = pd.read_csv('../data/X_train_transformed.csv')
X_test = pd.read_csv('../data/X_test_transformed.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

In [3]:
# Normalize the target labels for XGBoost
y_train_normalized = y_train - 1
y_test_normalized = y_test - 1

### RANDOM FOREST CLASSIFIER ###

In [None]:
rf_params = {
    'n_estimators': [150, 350, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5]
}

rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train_normalized)

rf_best_model = rf_grid.best_estimator_
rf_best_params = rf_grid.best_params_
rf_accuracy = cross_val_score(rf_best_model, X_test, y_test_normalized, cv=5, scoring='accuracy')

rf_all_results = rf_grid.cv_results_

rf_all_results_summary = {
    'params': rf_all_results['params'],
    'mean_test_score': rf_all_results['mean_test_score'],
    'std_test_score': rf_all_results['std_test_score']
}

rf_results = {
    'Best Parameters': rf_best_params,
    'Mean Accuracy': np.mean(rf_accuracy),
    'Std Accuracy': np.std(rf_accuracy),
    'Individual Accuracy Scores': rf_accuracy.tolist(),
    'All Results Summary': rf_all_results_summary
}

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [None]:
xgb_params = {
    'n_estimators': [150, 350, 500],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2]
}

xgb_model = XGBClassifier(
    tree_method='hist',  
    random_state=42,
    eval_metric='logloss'
)

xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
xgb_grid.fit(X_train, y_train_normalized)

xgb_best_model = xgb_grid.best_estimator_
xgb_best_params = xgb_grid.best_params_
xgb_accuracy = cross_val_score(xgb_best_model, X_test, y_test_normalized, cv=5, scoring='accuracy')

xgb_all_results = xgb_grid.cv_results_

xgb_all_results_summary = {
    'params': xgb_all_results['params'],
    'mean_test_score': xgb_all_results['mean_test_score'],
    'std_test_score': xgb_all_results['std_test_score']
}

xgb_results = {
    'Best Parameters': xgb_best_params,
    'Mean Accuracy': np.mean(xgb_accuracy),
    'Std Accuracy': np.std(xgb_accuracy),
    'Individual Accuracy Scores': xgb_accuracy.tolist(),
    'All Results Summary': xgb_all_results_summary
}

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [8]:
rf_all_results_summary, xgb_all_results_summary

({'params': [{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150},
   {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 350},
   {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 500},
   {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150},
   {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 350},
   {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500},
   {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150},
   {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 350},
   {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500},
   {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150},
   {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 350},
   {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 500},
   {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 150},
   {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 350},
   {'max_depth': 30, 'min_samples_split': 2, 'n_esti

In [9]:
print("Random Forest Results:", rf_all_results_summary)
print("\nXGBoost Results:", xgb_all_results_summary)

Random Forest Results: {'params': [{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}, {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 350}, {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 500}, {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150}, {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 350}, {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}, {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}, {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 350}, {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500}, {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150}, {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 350}, {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 500}, {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 150}, {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 350}, {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}, {'max

In [10]:
df_rf_results_summary = pd.DataFrame(rf_all_results_summary)
df_xgb_results_summary = pd.DataFrame(xgb_all_results_summary)

In [None]:
# rf_results_table = pd.DataFrame([{
#     "Model": "Random Forest",
#     "Best Parameters": rf_results['Best Parameters'],
#     "Mean Accuracy": rf_results['Mean Accuracy'],
#     "Std Accuracy": rf_results['Std Accuracy'],
#     "Individual Accuracy Scores": rf_results['Individual Accuracy Scores']
# }])

# xgb_results_table = pd.DataFrame([{
#     "Model": "XGBoost",
#     "Best Parameters": xgb_results['Best Parameters'],
#     "Mean Accuracy": xgb_results['Mean Accuracy'],
#     "Std Accuracy": xgb_results['Std Accuracy'],
#     "Individual Accuracy Scores": xgb_results['Individual Accuracy Scores']
# }])

# combined_results_table = pd.concat([rf_results_table, xgb_results_table], ignore_index=True)
# combined_results_table


Unnamed: 0,Model,Best Parameters,Mean Accuracy,Std Accuracy,Individual Accuracy Scores
0,Random Forest,"{'max_depth': 30, 'min_samples_split': 5, 'n_e...",0.825979,0.022861,"[0.788546255506608, 0.8546255506607929, 0.8141..."
1,XGBoost,"{'learning_rate': 0.2, 'max_depth': 4, 'n_esti...",0.856002,0.03768,"[0.801762114537445, 0.9162995594713657, 0.8672..."


In [11]:
combined_results_summary = pd.concat([df_rf_results_summary, df_xgb_results_summary], ignore_index=True)
combined_results_summary.to_csv("../data/combined_results_summary.csv", index=False)

In [12]:
df_rf_results_summary

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.870243,0.014212
1,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.872232,0.013467
2,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.872012,0.013586
3,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.867149,0.014355
4,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.871791,0.015043
5,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.872012,0.013794
6,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",0.885275,0.01516
7,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",0.887485,0.015043
8,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",0.887928,0.014896
9,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",0.884832,0.014739
