In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    accuracy_score,
)
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier



In [2]:
def f2_func(y_true, y_pred):
    f2_score = fbeta_score(y_true, y_pred, beta=2, average = 'weighted')
    return f2_score

def my_f2_scorer():
    return make_scorer(f2_func)

Load Data from CSV file


In [3]:
# Import cleaned train and test data
X_train = pd.read_csv('train_X_In-Car-Rec.csv')
y_train = pd.read_csv('train_y_In-Car-Rec.csv')
X_test = pd.read_csv('test_X_In-Car-Rec.csv')
y_test = pd.read_csv('test_y_In-Car-Rec.csv')

In [4]:
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train.values.ravel())

In [5]:
print("Max Tree depths: ", max(list([t.get_depth() for t in classifier.estimators_])))
print("Max Tree number of leaves: ", max(list([t.get_n_leaves() for t in classifier.estimators_])))



Max Tree depths:  38
Max Tree number of leaves:  3138


## Hyperparamaterizing and Fitting


In [6]:
# Defining hyperparameters for tuning
param_grid = {
    "criterion": ['entropy'],
    "n_estimators": [50, 100, 150, 200, 250, 300],
    "max_depth": [None, 5, 10, 15, 20, 25, 30, 35],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 8, 10],
}
randomized_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=param_grid,
    n_iter=500,
    cv=100,
    scoring=my_f2_scorer(),
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [7]:
# Fitting the Model to the training data using the finalized parameters:
randomized_search.fit(X_train, y_train.values.ravel())


# Output the best f2_weighted score
best_f2_weighted_score = randomized_search.best_score_
print(f"The best f2_weighted score from RandomizedSearchCV is: {best_f2_weighted_score:.4f}")

# Output the best parameters
best_parameters = randomized_search.best_params_
print("The best parameters from RandomSearchCV are:")
for param, value in best_parameters.items():
    print(f"{param}: {value}")


Fitting 100 folds for each of 500 candidates, totalling 50000 fits


KeyboardInterrupt: 

In [None]:
plus_minus = 10 # change this to 10-15 when doing a final run. this current value is for testing
increment = 2

param_grid = { 
    'n_estimators': [x for x in range(best_parameters['n_estimators']-plus_minus, best_parameters['n_estimators']+plus_minus,2) if x >= 2],       
    'min_samples_leaf': [x for x in range(best_parameters['min_samples_leaf']-plus_minus , best_parameters['min_samples_leaf']+plus_minus,2) if x > 0],
    'min_impurity_decrease': [x for x in np.arange(best_parameters['min_impurity_decrease']-0.001, best_parameters['min_impurity_decrease']+0.001,.0001).round(5) if x >= 0.000],
    'max_depth': [x for x in range(best_parameters['max_depth']-plus_minus , best_parameters['max_depth']+plus_minus, 2) if x > 1],
    'criterion': [best_parameters['criterion']]
              }

best_grid_search_model = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                                    scoring=my_f2_scorer(), param_grid=param_grid, cv=100, verbose=0,  n_jobs = -1)
_ = best_grid_search_model.fit(X_train, y_train.values.ravel())
# Output the best parameters
best_grid_parameters = _.best_params_
print("The best parameters from GridSearchCV are:")
for param, value in best_grid_parameters.items():
    print(f"{param}: {value}")


### Finalizing Model


In [None]:
# Creating the final pipeline with preprocessing and the classifier
finalmodel = RandomForestClassifier() 
finalmodel.set_params(**best_grid_parameters)

In [None]:
# Train the final pipeline
finalmodel.fit(X_train, y_train)

In [None]:
# Predict on the test set
y_pred = finalmodel.predict(X_test)

In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Precision
precision = precision_score(y_test, y_pred, average="weighted")
print(f"\nPrecision (weighted): {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average="weighted")
print(f"Recall (weighted): {recall:.4f}")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

f2score = my_f2_scorer(y_test, y_pred)
print(f"F2 Score:  {f2score:.4f}")

Confusion Matrix:
[[164  25  28  26]
 [ 28 176  26  10]
 [ 35  26 159  24]
 [ 17  13  13 230]]

Precision (weighted): 0.7275
Recall (weighted): 0.7290
Accuracy: 0.7290
