In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the pre-split feature tables from ../data/processed
X_train, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)

# Label files are read as frames and flattened into 1-D arrays
y_train = pd.read_csv('../data/processed/y_train.csv').to_numpy().ravel()
y_test = pd.read_csv('../data/processed/y_test.csv').to_numpy().ravel()

# Cell output: shapes of the train and test feature tables
(X_train.shape, X_test.shape)


((40000, 22), (10000, 22))

In [2]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest, constructed and fitted on the original
# (non-resampled) training split in a single expression; fit() returns
# the estimator itself.
rf_model = RandomForestClassifier(
    class_weight='balanced_subsample',  # reweight classes to help with imbalance
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Hard class labels for the held-out split
y_pred = rf_model.predict(X_test)

# Second column of predict_proba (class 1 when labels are 0/1); used for ROC AUC
y_prob = rf_model.predict_proba(X_test)[:, 1]


In [3]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Evaluation of the baseline model on the held-out split.
#
# zero_division=0: when a class receives no predictions at all its precision
# (and therefore F1) is undefined. The default ("warn") also reports 0 for
# those entries but emits an UndefinedMetricWarning for each affected metric;
# passing 0 explicitly keeps the same numbers and silences the warnings.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

# Counts of true label (rows) versus predicted label (columns)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      9700
           1       0.00      0.00      0.00       300

    accuracy                           0.97     10000
   macro avg       0.48      0.50      0.49     10000
weighted avg       0.94      0.97      0.96     10000

Confusion Matrix:
 [[9700    0]
 [ 300    0]]
ROC AUC Score: 0.5331262886597938


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample with SMOTE using a fixed seed; only the training split is
# resampled, the test split is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Report how many samples each class has after resampling
class_counts = Counter(y_train_resampled)
print("Class distribution after SMOTE:", class_counts)


Class distribution after SMOTE: Counter({np.int64(0): 38800, np.int64(1): 38800})


In [7]:
# Random forest trained on the SMOTE-resampled training data; no class
# weighting is passed here. fit() returns the estimator itself.
rf_model_smote = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train_resampled, y_train_resampled)

# Score the original (non-resampled) test split:
# hard labels, then the second predict_proba column for ROC AUC
y_pred_smote = rf_model_smote.predict(X_test)
y_prob_smote = rf_model_smote.predict_proba(X_test)[:, 1]


In [8]:
# Evaluation of the SMOTE-trained model on the held-out split.
# Each metric is computed into a named value and printed immediately.
report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report (SMOTE):\n", report_smote)

# Counts of true label (rows) versus predicted label (columns)
matrix_smote = confusion_matrix(y_test, y_pred_smote)
print("Confusion Matrix (SMOTE):\n", matrix_smote)

# ROC AUC is computed from the class-1 probabilities, not the hard labels
auc_smote = roc_auc_score(y_test, y_prob_smote)
print("ROC AUC Score (SMOTE):", auc_smote)


Classification Report (SMOTE):
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      9700
           1       0.03      0.03      0.03       300

    accuracy                           0.94     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.94      0.94      0.94     10000

Confusion Matrix (SMOTE):
 [[9409  291]
 [ 290   10]]
ROC AUC Score (SMOTE): 0.5172367697594502
