In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
# `data` is the labelled table that is split into train/test below;
# `test_data` is the table scored at the end to fill the submission file.
data = pd.read_csv(filepath_or_buffer="../data/synthetic_real_data.csv")
test_data = pd.read_csv(filepath_or_buffer="../data/test_modified.csv")

In [8]:
# Split predictors from the target by column label, not by position, so the
# feature matrix can never contain the target even if the CSV's column order
# changes (a positional slice only works while the target is the last column).
TARGET_COLUMN = "Fault_class"
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

In [16]:
from sklearn.metrics import roc_auc_score

def calculate_auc_score(ground_truth_targets, predicted_probabilities):
    """
    Calculate the average Area Under the ROC Curve (AUC) score for multiple defect categories.

    One binary AUC is computed per column (category) with ``roc_auc_score`` and
    the unweighted mean of those per-column scores is returned.

    Parameters:
        ground_truth_targets (array-like): Ground truth targets for each defect category.
                                           Shape: (num_samples, num_categories).
        predicted_probabilities (array-like): Predicted probabilities for each defect category.
                                              Shape: (num_samples, num_categories).

    Returns:
        float: Average AUC score.

    Raises:
        ValueError: If either input is not 2-D, or if the two inputs do not
                    have the same shape.
    """
    # Coerce up front so DataFrames and nested lists support the positional
    # column slicing used below, not only ndarrays.
    targets = np.asarray(ground_truth_targets)
    probabilities = np.asarray(predicted_probabilities)

    if targets.ndim != 2 or probabilities.ndim != 2:
        raise ValueError(
            "Both inputs must be 2-D (num_samples, num_categories); got shapes "
            f"{targets.shape} and {probabilities.shape}."
        )
    # A mismatch would otherwise surface as an IndexError or, worse, silently
    # score only a subset of the categories.
    if targets.shape != probabilities.shape:
        raise ValueError(
            "ground_truth_targets and predicted_probabilities must have the same "
            f"shape; got {targets.shape} and {probabilities.shape}."
        )

    num_categories = probabilities.shape[1]
    auc_scores = [
        roc_auc_score(targets[:, i], probabilities[:, i])
        for i in range(num_categories)
    ]

    return np.mean(auc_scores)


In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=37,
)

In [40]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Hyperparameter grid searched exhaustively below
# (3 * 4 * 3 * 3 * 2 * 2 = 432 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],     # Minimum number of samples required at each leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False]         # Whether bootstrap samples are used when building trees
}

# Base estimator; the fixed seed makes every candidate fit reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search ranked by accuracy, using all available cores.
# `verbose` is documented as an integer, so a fractional level is not a valid
# setting. 3 is the smallest integer that still prints each fold's score and
# fit time; lower it to 1 to avoid flooding the notebook with log lines.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3,
    n_jobs=-1,
)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Best hyperparameter combination and its mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = grid_search.best_score_

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.604 total time=  13.3s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.590 total time=   6.7s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.599 total time=   6.6s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.606 total time=   6.3s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.601 total time=  12.7s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.604 total time=   6.0s
[CV 4/5] 

[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.600 total time=  13.2s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.596 total time=   6.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.605 total time=   6.2s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.597 total time=  12.4s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.591 total time=   6.3s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.593 total time=   6.3s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, m

[CV 2/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=50;, score=0.587 total time=   3.3s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=50;, score=0.592 total time=   3.5s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100;, score=0.581 total time=   6.8s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.598 total time=   6.7s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.592 total time=   6.8s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.602 total time=  27.2s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, m

[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.598 total time=  23.2s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.603 total time=  23.0s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100;, score=0.602 total time=  11.5s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.587 total time=   5.7s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.603 total time=   5.8s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.593 total time=  11.3s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=log2, min_sam

[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.595 total time=   6.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.607 total time=   6.4s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.597 total time=  12.3s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.588 total time=   6.4s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.585 total time=   6.3s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.602 total time=  12.6s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_

[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.588 total time=   6.0s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.599 total time=  14.1s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.590 total time=   6.6s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.605 total time=  12.6s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.604 total time=  25.1s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.602 total time=  24.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_sam

[CV 4/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=0.597 total time=   4.9s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=0.600 total time=  10.0s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.611 total time=  20.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.617 total time=  19.2s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.600 total time=  19.0s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=0.605 total time=  18.8s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=

[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.594 total time=   6.0s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.612 total time=  14.0s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.591 total time=   6.5s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.595 total time=  12.5s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.606 total time=  25.2s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.607 total time=  24.2s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_sam

[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.597 total time=  13.2s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.613 total time=  26.5s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.616 total time=  25.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.605 total time=  24.2s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.605 total time=  25.0s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.611 total time=  12.5s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_s

[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.604 total time=   6.5s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.595 total time=  12.7s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.594 total time=   6.5s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.586 total time=   6.4s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.603 total time=  12.9s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.603 total time=  25.7s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, m

[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.587 total time=   6.7s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.598 total time=  13.6s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.608 total time=  26.9s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.603 total time=  25.6s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.601 total time=  12.7s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.591 total time=   6.6s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, m

[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.604 total time=   6.6s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.598 total time=  12.8s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.594 total time=   6.4s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.593 total time=   6.4s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.603 total time=  12.9s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.612 total time=  25.6s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min

[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.595 total time=   6.3s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.605 total time=  12.9s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.600 total time=  25.6s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.603 total time=  25.8s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.602 total time=  12.5s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=0.601 total time=   6.2s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2

[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100;, score=0.601 total time=  12.0s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.599 total time=  23.3s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=0.604 total time=  23.1s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.602 total time=  11.8s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.600 total time=   5.6s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.605 total time=   5.6s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, m

[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.598 total time=  13.3s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.612 total time=  27.0s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.614 total time=  25.7s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.601 total time=  25.8s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.609 total time=  25.5s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=0.603 total time=  12.2s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4

[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.608 total time=  25.6s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.609 total time=  25.8s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.607 total time=  12.7s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=0.592 total time=   6.3s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=0.602 total time=  12.4s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.603 total time=  24.4s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=

[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.604 total time=  12.8s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.612 total time=  12.9s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.596 total time=   6.4s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.602 total time=  12.9s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.617 total time=  25.7s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.599 total time=  25.6s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2,

[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.618 total time=  34.9s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.609 total time=  33.2s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.609 total time=  33.1s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100;, score=0.614 total time=  16.5s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.588 total time=   7.9s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.603 total time=  15.7s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_

[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.616 total time=  36.5s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.605 total time=  36.2s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.608 total time=  35.2s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.601 total time=  16.7s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=0.607 total time=   8.4s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=0.611 total time=   8.3s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_le

[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.621 total time=  18.1s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=50;, score=0.601 total time=   8.8s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100;, score=0.607 total time=  17.6s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.613 total time=  35.1s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.611 total time=  16.7s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=0.592 total time=   8.4s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_

[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=50;, score=0.611 total time=   8.3s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.602 total time=  16.8s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=0.600 total time=   8.3s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=0.600 total time=   8.4s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100;, score=0.607 total time=  16.6s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.618 total time=  33.2s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf

[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.600 total time=  16.7s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.600 total time=  33.4s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.603 total time=  33.1s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100;, score=0.603 total time=  16.3s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.599 total time=   7.9s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.612 total time=   7.9s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_le

[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.600 total time=  33.1s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200;, score=0.604 total time=  33.0s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.602 total time=  15.9s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.608 total time=   8.0s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.606 total time=   7.7s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.603 total time=  15.9s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=log2, min_samples_le

[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.616 total time=  18.4s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.601 total time=   9.3s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.605 total time=  17.9s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=0.620 total time=  36.2s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.603 total time=  35.0s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.603 total time=  33.5s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_l

In [45]:
# Show the winning hyperparameters next to their mean cross-validated accuracy.
(best_params, best_score)

({'bootstrap': False,
  'max_depth': 20,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 200},
 0.6112245632991177)

In [47]:
# Refit the winning configuration on the training split. The seed matches the
# estimator that was tuned by the grid search, so the final model is
# reproducible and is the same configuration the search actually scored.
rf_clf = RandomForestClassifier(**best_params, random_state=42)
rf_clf.fit(X_train, y_train)

In [48]:
# Predict once and reuse the labels for both metrics instead of running the
# whole forest over X_test twice.
y_test_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)
print(f"The accuracy of the test data is: {accuracy*100}%")
print("Classifciation report")
print(report)

The accuracy of the test data is: 60.09075194468453%
Classifciation report
              precision    recall  f1-score   support

           0       0.51      0.58      0.55      1437
           1       0.67      0.71      0.69      1149
           2       0.90      0.90      0.90       973
           3       0.79      0.91      0.85      1043
           4       0.55      0.58      0.57      1195
           5       0.49      0.46      0.47      1422
           6       0.44      0.36      0.40      2037

    accuracy                           0.60      9256
   macro avg       0.62      0.64      0.63      9256
weighted avg       0.59      0.60      0.59      9256



In [49]:
# One-hot encode the held-out labels with one column per class, ordered like
# rf_clf.classes_ (the column order of rf_clf.predict_proba), so each target
# column lines up with its probability column. A single vectorised comparison
# replaces the per-row loop, which rebuilt the full label list on every
# iteration.
true_labels = y_test.to_numpy()
ground_truth = (true_labels[:, None] == rf_clf.classes_[None, :]).astype(float)

In [50]:
# Mean per-class AUC on the held-out split: the one-hot labels are scored
# against the class probabilities predicted by the fitted forest.
print(f"AUC score is: {calculate_auc_score(ground_truth, rf_clf.predict_proba(X_test))}")

AUC score is: 0.8978490279880892


In [51]:
# Reuse an existing submission file as the template, overwrite every column
# after the first with the forest's class probabilities for the test rows,
# and write the result to a new CSV.
submission = pd.read_csv(filepath_or_buffer="../data/submission_basic_model.csv")
# The first column of test_data is skipped — presumably a row id; TODO confirm.
test_features = test_data.iloc[:, 1:]
submission.iloc[:, 1:] = rf_clf.predict_proba(test_features)
submission.to_csv(
    path_or_buf="../data/submission_rf_synthetic_data_grid_search.csv",
    index=False,
)