In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from random import seed,sample
import datetime as dt
from datetime import datetime
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler # Undersampling for Unbalanced Data
from imblearn.over_sampling import SMOTE # Oversampling for Unbalanced Data
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler # Data Encoders
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


pd.set_option('display.max_columns', 500)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [4]:
X_train = pd.read_csv('../X_train.csv')
X_test = pd.read_csv('../X_test.csv')
y_train = pd.read_csv('../y_train.csv')
y_test = pd.read_csv('../y_test.csv')

Scale the features below only, we don't scale the hour_of_day since the values are already within a similar range and have a clear numerical interpretation (hours of the day)

In [5]:
from sklearn.preprocessing import StandardScaler

features_to_scale = ["amount(usd)", "lat", "long", "merch_lat", "merch_long", "age"]


# Initialize the StandardScaler and fit/transform on numerical features
scaler = StandardScaler()

# Fit and transform the scaler on the selected features in your training data
X_train_scaled = X_train.copy()
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])

# Transform the same selected features in your test data using the same scaler
X_test_scaled = X_test.copy()
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

X_test_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   category     555719 non-null  object 
 1   amount(usd)  555719 non-null  float64
 2   gender       555719 non-null  object 
 3   state        555719 non-null  object 
 4   lat          555719 non-null  float64
 5   long         555719 non-null  float64
 6   job          555719 non-null  object 
 7   merch_lat    555719 non-null  float64
 8   merch_long   555719 non-null  float64
 9   hour_of_day  555719 non-null  int64  
 10  day_of_week  555719 non-null  object 
 11  age          555719 non-null  float64
dtypes: float64(6), int64(1), object(5)
memory usage: 50.9+ MB


In [6]:
X_test_scaled.head()

Unnamed: 0,category,amount(usd),gender,state,lat,long,job,merch_lat,merch_long,hour_of_day,day_of_week,age
0,health_fitness,0.272269,M,AL,-0.899089,0.267408,Aid worker,-0.849623,0.212847,14,Tuesday,-0.184877
1,misc_pos,-0.414741,F,CA,-0.870119,-2.009787,Civil Service fast streamer,-0.984834,-1.957631,8,Monday,-0.587264
2,shopping_pos,-0.397939,F,SC,-0.888289,0.583966,Research scientist (physical sciences),-0.842206,0.640575,18,Saturday,-0.644748
3,shopping_net,-0.399281,F,MN,1.773167,-0.418025,Applications developer,1.686301,-0.489843,7,Monday,0.447446
4,grocery_pos,0.182959,M,OH,0.445917,0.644181,Building control surveyor,0.264207,0.612831,3,Friday,0.619897


In [7]:
unqiue_state = X_test_scaled['state'].nunique()
unique_category = X_test_scaled['category'].nunique()
unique_job = X_test_scaled['job'].nunique()

print("no of unqiue sates: ", unqiue_state)
print("no of unqiue categories: ", unique_category)
print("no of unqiue jobs: ", unique_job)

no of unqiue sates:  51
no of unqiue categories:  14
no of unqiue jobs:  495


We will then be one hot encoding some of our categorical variables to run a logistic regression model on X and y.

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder with sparse=False and drop='first'
encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the encoder on categorical columns in the training data
X_train_encoded = encoder.fit_transform(X_train_scaled[['category', 'gender', 'day_of_week']])

# Transform the same columns in the test data
X_test_encoded = encoder.transform(X_test_scaled[['category', 'gender', 'day_of_week']])

# Create DataFrames from the encoded arrays with appropriate column names
encoded_columns = encoder.get_feature_names_out(['category', 'gender', 'day_of_week'])
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns)

In [9]:
X_train_final = pd.concat([X_train_encoded_df, X_train_scaled[['age', 'lat', 'long', 'amount(usd)']]], axis=1)
X_test_final = pd.concat([X_test_encoded_df, X_test_scaled[['age', 'lat', 'long', 'amount(usd)']]], axis=1)


In [10]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   category     555719 non-null  object 
 1   amount(usd)  555719 non-null  float64
 2   gender       555719 non-null  object 
 3   state        555719 non-null  object 
 4   lat          555719 non-null  float64
 5   long         555719 non-null  float64
 6   job          555719 non-null  object 
 7   merch_lat    555719 non-null  float64
 8   merch_long   555719 non-null  float64
 9   hour_of_day  555719 non-null  int64  
 10  day_of_week  555719 non-null  object 
 11  age          555719 non-null  int64  
dtypes: float64(5), int64(2), object(5)
memory usage: 50.9+ MB


In [11]:
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   category_food_dining     555719 non-null  float64
 1   category_gas_transport   555719 non-null  float64
 2   category_grocery_net     555719 non-null  float64
 3   category_grocery_pos     555719 non-null  float64
 4   category_health_fitness  555719 non-null  float64
 5   category_home            555719 non-null  float64
 6   category_kids_pets       555719 non-null  float64
 7   category_misc_net        555719 non-null  float64
 8   category_misc_pos        555719 non-null  float64
 9   category_personal_care   555719 non-null  float64
 10  category_shopping_net    555719 non-null  float64
 11  category_shopping_pos    555719 non-null  float64
 12  category_travel          555719 non-null  float64
 13  gender_M                 555719 non-null  float64
 14  day_

Random Forest (Baseline Model)

In [12]:
#We are going to ensure that we have the same splits of the data every time. 
#We can ensure this by creating a KFold object, kf, and passing cv=kf instead of the more common cv=5.

kf = StratifiedKFold(n_splits=5, shuffle=False)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)



In [13]:
score = cross_val_score(rf_classifier, X_train_final, y_train, cv=kf, scoring='recall')
print("Cross Validation Recall scores are: {}".format(score))
print("Average Cross Validation Recall score: {}".format(score.mean()))

Cross Validation Recall scores are: [0.65136936 0.6565507  0.62250185 0.66913397 0.63091716]
Average Cross Validation Recall score: 0.6460946088586583


In [14]:
baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train_final, y_train)

In [15]:
y_pred = baseline_rf.predict(X_test_final)


In [16]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
cm = confusion_matrix(y_test, y_pred)

base_accuracy = accuracy_score(y_test, y_pred)
base_precision = precision_score(y_test, y_pred)
base_recall = recall_score(y_test, y_pred)
base_f1 = f1_score(y_test, y_pred)
base_roc_auc = roc_auc_score(y_test, y_pred)

print(cm)

[[552474    350]
 [   987   1908]]


In [17]:
ndf = [(base_recall, base_precision, base_f1, base_accuracy, base_roc_auc)]

rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
rf_score.insert(0, 'Random Forest with', 'No Under/Oversampling')
rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,No Under/Oversampling,0.659067,0.844996,0.740539,0.997594,0.829217


The base model (no sampling) for Random Forest seems to be a lot better than the Log Regression model. There is a higher F1 score, precision and recall score.

Random Forest with Random Over Sampler

In [18]:
from imblearn.over_sampling import RandomOverSampler
# define oversampling strategy
ros = RandomOverSampler(random_state=42)

In [19]:
# fit and apply the transform
X_over, y_over = ros.fit_resample(X_train_final, y_train)

Training Model

In [20]:
from imblearn.pipeline import Pipeline, make_pipeline

random_overs_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              RandomForestClassifier(n_estimators=100, random_state=13))
#cross_val_score(random_overs_pipeline, X_train, y_train, scoring='recall', cv=kf)

In [21]:
score2 = cross_val_score(random_overs_pipeline, X_train_final, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score2))
print("Average Cross Validation Recall score: {}".format(score2.mean()))

Cross Validation Recall Scores are: [0.68911917 0.70392302 0.67283494 0.70836417 0.67011834]
Average Cross Validation Recall score: 0.6888719291867956


In [22]:
print('Genuine:', y_over.value_counts()[0], '/', round(y_over.value_counts()[0]/len(y_over) * 100,2), '% of the dataset')
print('Frauds:', y_over.value_counts()[1], '/',round(y_over.value_counts()[1]/len(y_over) * 100,2), '% of the dataset')

Genuine: 1289919 / 50.0 % of the dataset
Frauds: 1289919 / 50.0 % of the dataset


In [23]:
random_overs_pipeline.fit(X_over, y_over)

In [24]:
y_pred_oversampled = random_overs_pipeline.predict(X_test_final)

In [25]:
cm = confusion_matrix(y_test, y_pred_oversampled)

over_rf_Recall = recall_score(y_test, y_pred_oversampled)
over_rf_Precision = precision_score(y_test, y_pred_oversampled)
over_rf_f1 = f1_score(y_test, y_pred_oversampled)
over_rf_accuracy = accuracy_score(y_test, y_pred_oversampled)
over_roc = roc_auc_score(y_test, y_pred_oversampled)

print(cm)

[[552272    552]
 [   844   2051]]


In [26]:
ndf = [(over_rf_Recall, over_rf_Precision, over_rf_f1, over_rf_accuracy, over_roc)]

over_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
over_rf_score.insert(0, 'Random Forest with', 'Random Oversampling')
over_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Random Oversampling,0.708463,0.787937,0.746089,0.997488,0.853732


Compared to the base model, we can see that random oversampling + RF model has improved compared to the baseline model, especially in terms of recall. Recall is important for fraud as it measures the model's ability to correctly identify true positive cases (fraud cases).

GridSearch for Oversampling

In [27]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10, 12],
    'random_state': [13]
}

In [28]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_over_rf = GridSearchCV(random_overs_pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
grid_over_rf.fit(X_train_final, y_train)

In [29]:
print('Best parameters:', grid_over_rf.best_params_)
print('Best score:', grid_over_rf.best_score_)

Best parameters: {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__random_state': 13}
Best score: 0.8802568993382067


In [30]:
y_pred_over2 = grid_over_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test_final)

In [31]:
cm = confusion_matrix(y_test, y_pred_over2)

over2_rf_Recall = recall_score(y_test, y_pred_over2)
over2_rf_Precision = precision_score(y_test, y_pred_over2)
over2_rf_f1 = f1_score(y_test, y_pred_over2)
over2_rf_accuracy = accuracy_score(y_test, y_pred_over2)
over2_rf_roc = roc_auc_score(y_test, y_pred_over2)

print(cm)

[[546182   6642]
 [   321   2574]]


In [32]:
ndf = [(over2_rf_Recall, over2_rf_Precision, over2_rf_f1, over2_rf_accuracy, over2_rf_roc)]

over2_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
over2_rf_score.insert(0, 'Random Forest with', 'Random Oversampling using GridSearch')
over2_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Random Oversampling using GridSearch,0.889119,0.279297,0.425068,0.98747,0.938552


Random Undersampling

In [33]:
from imblearn.under_sampling import RandomUnderSampler
# define oversampling strategy
rus = RandomUnderSampler(random_state=42)

In [34]:
# fit and apply the transform
X_under, y_under = rus.fit_resample(X_train_final, y_train)

In [35]:
print('Genuine:', y_under.value_counts()[0], '/', round(y_under.value_counts()[0]/len(y_under) * 100,2), '% of the dataset')
print('Frauds:', y_under.value_counts()[1], '/',round(y_under.value_counts()[1]/len(y_under) * 100,2), '% of the dataset')

Genuine: 6756 / 50.0 % of the dataset
Frauds: 6756 / 50.0 % of the dataset


Training Model

In [36]:
from imblearn.pipeline import Pipeline, make_pipeline

random_unders_pipeline = make_pipeline(RandomUnderSampler(random_state=42), 
                              RandomForestClassifier(n_estimators=100, random_state=13))


In [37]:
score3 = cross_val_score(random_unders_pipeline, X_train_final, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score3))
print("Average Cross Validation Recall score: {}".format(score3.mean()))

Cross Validation Recall Scores are: [0.94300518 0.9511473  0.96299038 0.96225019 0.94230769]
Average Cross Validation Recall score: 0.9523401468997325


In [38]:
random_unders_pipeline.fit(X_under, y_under)

In [39]:
y_pred_undersampled = random_unders_pipeline.predict(X_test_final)

In [40]:
cm = confusion_matrix(y_test, y_pred_undersampled)

under_rf_Recall = recall_score(y_test, y_pred_undersampled)
under_rf_Precision = precision_score(y_test, y_pred_undersampled)
under_rf_f1 = f1_score(y_test, y_pred_undersampled)
under_rf_accuracy = accuracy_score(y_test, y_pred_undersampled)
under_roc = roc_auc_score(y_test, y_pred_undersampled)

print(cm)

[[534931  17893]
 [   116   2779]]


In [41]:
ndf = [(under_rf_Recall, under_rf_Precision, under_rf_f1, under_rf_accuracy, under_roc)]

under_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
under_rf_score.insert(0, 'Random Forest with', 'Random Undersampling')
under_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Random Undersampling,0.959931,0.134433,0.235838,0.967593,0.963782


Random undersampling has significantly improved recall; however, it has led to a drop in precision. It's essential to balance between precision and recall, as a model with high recall and low precision may flag too many false positives. 

GridSearch for RandomUndersampling

In [69]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_under2_rf = GridSearchCV(random_unders_pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
grid_under2_rf.fit(X_train_final, y_train)

In [70]:
print('Best parameters:', grid_under2_rf.best_params_)
print('Best score:', grid_under2_rf.best_score_)

Best parameters: {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__random_state': 13}
Best score: 0.9117847178728008


In [71]:
y_pred_under2 = grid_under2_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test_final)

In [72]:
cm = confusion_matrix(y_test, y_pred_under2)

under2_rf_Recall = recall_score(y_test, y_pred_under2)
under2_rf_Precision = precision_score(y_test, y_pred_under2)
under2_rf_f1 = f1_score(y_test, y_pred_under2)
under2_rf_accuracy = accuracy_score(y_test, y_pred_under2)
under2_rf_roc = roc_auc_score(y_test, y_pred_under2)

print(cm)

[[542188  10636]
 [   267   2628]]


In [73]:
ndf = [(under2_rf_Recall, under2_rf_Precision, under2_rf_f1, under2_rf_accuracy, under2_rf_roc)]

under2_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
under2_rf_score.insert(0, 'Random Forest with', 'Random undersampling using GridSearch')
under2_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Random undersampling using GridSearch,0.907772,0.19813,0.325268,0.98038,0.944266


SMOTE

In [42]:
from imblearn.over_sampling import SMOTE

smote_pipeline = make_pipeline(SMOTE(random_state=42), 
                              RandomForestClassifier(n_estimators=100, random_state=13))


In [43]:
score4 = cross_val_score(smote_pipeline, X_train_final, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score4))
print("Average Cross Validation Recall score: {}".format(score4.mean()))


Cross Validation Recall Scores are: [0.77646188 0.76239822 0.73871207 0.78756477 0.75887574]
Average Cross Validation Recall score: 0.7648025350496455


In [44]:
# Fit the model to the training data
smote_pipeline.fit(X_train_final, y_train)

In [45]:
y_pred_smote = smote_pipeline.predict(X_test_final)

In [46]:
cm = confusion_matrix(y_test, y_pred_smote)

smote_rf_Recall = recall_score(y_test, y_pred_smote)
smote_rf_Precision = precision_score(y_test, y_pred_smote)
smote_rf_f1 = f1_score(y_test, y_pred_smote)
smote_rf_accuracy = accuracy_score(y_test, y_pred_smote)
smote_rf_rocauc = roc_auc_score(y_test, y_pred_smote)

print(cm)

[[551525   1299]
 [   638   2257]]


In [47]:
ndf = [(smote_rf_Recall, smote_rf_Precision, smote_rf_f1, smote_rf_accuracy, smote_rf_rocauc)]

smote_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
smote_rf_score.insert(0, 'Random Forest with', 'SMOTE Oversampling')
smote_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,SMOTE Oversampling,0.77962,0.634702,0.699736,0.996514,0.888635


GridSearch for SMOTE

In [48]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
smote2_rf = GridSearchCV(smote_pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
smote2_rf.fit(X_train_final, y_train)

In [49]:
print('Best parameters:', smote2_rf.best_params_)
print('Best score:', smote2_rf.best_score_)

Best parameters: {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__random_state': 13}
Best score: 0.8752243571494269


In [50]:
y_pred_smote2 = smote2_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test_final)

In [51]:
cm = confusion_matrix(y_test, y_pred_smote2)

smote2_rf_Recall = recall_score(y_test, y_pred_smote2)
smote2_rf_Precision = precision_score(y_test, y_pred_smote2)
smote2_rf_f1 = f1_score(y_test, y_pred_smote2)
smote2_rf_accuracy = accuracy_score(y_test, y_pred_smote2)
smote2_rf_rocauc = roc_auc_score(y_test, y_pred_smote2)

print(cm)

[[546571   6253]
 [   335   2560]]


In [52]:
ndf = [(smote2_rf_Recall, smote2_rf_Precision, smote2_rf_f1, smote2_rf_accuracy, smote2_rf_rocauc)]

smote2_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
smote2_rf_score.insert(0, 'Random Forest with', 'SMOTE Oversampling using GridSearch')
smote2_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,SMOTE Oversampling using GridSearch,0.884283,0.29048,0.437308,0.988145,0.936486


Class Weights

In [53]:
rfb = RandomForestClassifier(n_estimators=100, random_state=13, class_weight="balanced")

In [54]:
score5 = cross_val_score(rfb, X_train_final, y_train, cv=kf, scoring='recall')
print("Cross Validation Recall scores are: {}".format(score5))
print("Average Cross Validation Recall score: {}".format(score5.mean()))

Cross Validation Recall scores are: [0.62990377 0.64248705 0.60547742 0.65136936 0.62130178]
Average Cross Validation Recall score: 0.6301078753848781


In [55]:
rfb.fit(X_train_final, y_train)

In [56]:
y_pred_balanced = rfb.predict(X_test_final)


In [57]:
cm = confusion_matrix(y_test, y_pred_balanced)

balaned_rf_Recall = recall_score(y_test, y_pred_balanced)
balaned_rf_Precision = precision_score(y_test, y_pred_balanced)
balaned_rf_f1 = f1_score(y_test, y_pred_balanced)
balaned_rf_accuracy = accuracy_score(y_test, y_pred_balanced)
balaned_rf_rocauc = roc_auc_score(y_test, y_pred_balanced)

print(cm)

[[552505    319]
 [  1008   1887]]


In [58]:
ndf = [(balaned_rf_Recall, balaned_rf_Precision, balaned_rf_f1, balaned_rf_accuracy, balaned_rf_rocauc)]

balanced_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
balanced_rf_score.insert(0, 'Random Forest with', 'Balanced Class Weights')
balanced_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Balanced Class Weights,0.651813,0.855394,0.739855,0.997612,0.825618


GridSearch for Random Class Weights

In [59]:
balanced_rf_pipeline = Pipeline([
    ('randomforestclassifier', RandomForestClassifier(random_state=13, class_weight="balanced"))
])

In [60]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=balanced_rf_pipeline, param_grid=new_params, cv=3, scoring='recall')

# Fit the model
grid_search.fit(X_train_final, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_balanced_rf_model = grid_search.best_estimator_

# You can use the best model for predictions on your test data
y_pred = best_balanced_rf_model.predict(X_test_final)

Best Parameters: {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__random_state': 13}


In [61]:
cm = confusion_matrix(y_test, y_pred)

balanced2_Recall = recall_score(y_test, y_pred)
balanced2_Precision = precision_score(y_test, y_pred)
balanced2_f1 = f1_score(y_test, y_pred)
balanced2_accuracy = accuracy_score(y_test, y_pred)
balanced2_roc = roc_auc_score(y_test, y_pred)

print(cm)

[[547250   5574]
 [   369   2526]]


In [67]:
ndf = [(balanced2_Recall, balanced2_Precision, balanced2_f1, balanced2_accuracy, balanced2_roc)]

balanced2_rf_score = pd.DataFrame(data = ndf, columns=['Recall','Precision','F1 Score', 'Accuracy', 'ROC_AUC'])
balanced2_rf_score.insert(0, 'Random Forest with', 'Balanced Class Weights using GridSearch')
balanced2_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
0,Balanced Class Weights using GridSearch,0.872539,0.311852,0.459482,0.989306,0.931228


Performance Comparison

In [75]:
predictions = pd.concat([rf_score, under2_rf_score, smote2_rf_score, over2_rf_score, balanced2_rf_score], ignore_index=True, sort=False)
predictions.sort_values(by=['Recall'], ascending=False)

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy,ROC_AUC
1,Random undersampling using GridSearch,0.907772,0.19813,0.325268,0.98038,0.944266
3,Random Oversampling using GridSearch,0.889119,0.279297,0.425068,0.98747,0.938552
2,SMOTE Oversampling using GridSearch,0.884283,0.29048,0.437308,0.988145,0.936486
4,Balanced Class Weights using GridSearch,0.872539,0.311852,0.459482,0.989306,0.931228
0,No Under/Oversampling,0.659067,0.844996,0.740539,0.997594,0.829217
