In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [9]:
# Seed reused by the train/test split and by both random forests so that
# results are reproducible across runs.
RSEED = 50

# Read the training table (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [None]:
# Labels are the 'target' column; predictors are every remaining column
# except 'target' and "id".
target = df.loc[:, 'target']
features = df.drop(['target', "id"], axis=1)

# Hold out 30% of the rows for testing. The split is stratified on the
# label and seeded with RSEED.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=RSEED,
    stratify=target,
)


In [34]:
# First attempt: a 100-tree forest with no depth limit. Each split considers
# sqrt(n_features) candidate features, class weights are set to 'balanced',
# training runs on all available cores and logs its progress.
model = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    class_weight='balanced',
    random_state=RSEED,
    n_jobs=-1,
    verbose=1,
)

# Train on the training split
model.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   47.2s finished


In [35]:
# Sanity check on the data the model was fitted on.
# NOTE: this is the *training* accuracy, not a test score. A value close to
# 1.0 here only shows that the forest (whose trees have no depth limit) can
# reproduce the training labels; it says nothing about generalisation, which
# has to be measured on x_test / y_test.
accuracy = model.score(x_train, y_train)
print(f"Train Accuracy: {accuracy:.3f}")

# classification_report expects (y_true, y_pred), i.e. true labels and
# predicted labels, not (features, labels). The correct call is
# classification_report(y_test, model.predict(x_test)).


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.5s


Test Accuracy: 1.000


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.0s finished


In [None]:
# Evaluate the first forest on the held-out test split
y_pred = model.predict(x_test)

print("")
print("--- Classification Report ---")
# zero_division=0: when a class is never predicted its precision is 0/0.
# Report it as 0 explicitly instead of triggering UndefinedMetricWarning
# (same setting as the baseline report later in the notebook).
print(classification_report(y_test, y_pred, zero_division=0))
print("""
      
--- Confusion Matrix ---
      """)

# Rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
print(cm)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



--- Classification Report ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    172056
           1       0.00      0.00      0.00      6508

    accuracy                           0.96    178564
   macro avg       0.48      0.50      0.49    178564
weighted avg       0.93      0.96      0.95    178564



  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Second attempt: a larger but regularised forest.
# NOTE: this rebinds `model`, so the first forest is no longer reachable.
model = RandomForestClassifier(
    random_state=RSEED,
    n_jobs=-1,
    class_weight='balanced',  # compensate for the rare positive class
    max_features='sqrt',      # unchanged from the first model
    n_estimators=300,         # more trees for more stable predictions
    max_depth=10,             # cap tree depth to curb overfitting
    min_samples_split=5,      # a node needs at least 5 samples to be split
    min_samples_leaf=2,       # every leaf keeps at least 2 samples
)

model.fit(x_train, y_train)

In [47]:
# Evaluate the tuned forest on the held-out test split
y_pred = model.predict(x_test)
print("")
print("     --- Random Forest Classification Report ---")
# zero_division=0 keeps both reports in this cell consistent and avoids
# UndefinedMetricWarning should a class ever receive no predictions.
print(classification_report(y_test, y_pred, zero_division=0))

# Reference point: a dummy model that always predicts the most frequent class.
# Any useful classifier has to beat this on the minority class.
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(x_train, y_train)

# Score the baseline on the same test split
y_base_pred = baseline.predict(x_test)
print("""
      --- Baseline Classification Report ---""")
print(classification_report(y_test, y_base_pred, zero_division=0))

print("""
      
--- Random Forest Confusion Matrix ---
      """)

# Rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
print(cm)



     --- Random Forest Classification Report ---
              precision    recall  f1-score   support

           0       0.97      0.75      0.85    172056
           1       0.06      0.40      0.10      6508

    accuracy                           0.74    178564
   macro avg       0.51      0.58      0.47    178564
weighted avg       0.94      0.74      0.82    178564


      --- Baseline Classification Report ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    172056
           1       0.00      0.00      0.00      6508

    accuracy                           0.96    178564
   macro avg       0.48      0.50      0.49    178564
weighted avg       0.93      0.96      0.95    178564



--- Random Forest Confusion Matrix ---
      
[[129376  42680]
 [  3886   2622]]


In [49]:
# Metrics that are more informative than accuracy on imbalanced labels.
# ROC-AUC is threshold-independent and is computed from the probability in
# column 1 of predict_proba (the second entry of model.classes_).
# F1 uses the hard predictions in `y_pred`, which must have been produced by
# the same `model` in the preceding evaluation cell.
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(x_test)[:,1]))
print("F1 (minority):", f1_score(y_test, y_pred, pos_label=1))

ROC-AUC: 0.6222804572653966
F1 (minority): 0.10121598147075855
