# Modeling
Experiment with different algorithms and pick the best model.

In [61]:
# Imports
import os
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [46]:
# Load the pre-split train / validation / test arrays.
# The directory defaults to the location this notebook was originally run from,
# but can be overridden with the FRAUD_DATA_DIR environment variable so the
# notebook runs on other machines without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "FRAUD_DATA_DIR",
        "/Users/rohith/Desktop/fraud-detection-ml/Data/prep",
    )
)

X_train = np.load(DATA_DIR / "creditcard_X_train.npy")
y_train = np.load(DATA_DIR / "creditcard_y_train.npy")
X_val = np.load(DATA_DIR / "creditcard_X_val.npy")
y_val = np.load(DATA_DIR / "creditcard_y_val.npy")
X_test = np.load(DATA_DIR / "creditcard_X_test.npy")
y_test = np.load(DATA_DIR / "creditcard_y_test.npy")

# Backward-compatible alias: this cell originally bound the test features to
# the lowercase name `x_test`, so keep that name available as well.
x_test = X_test

print(f"Train shape: {X_train.shape},  Val shape: {X_val.shape}")

Train shape: (192964, 11),  Val shape: (41349, 11)


### Baseline runs

In [52]:
# Dummy
# Floor baseline: a classifier configured with strategy="most_frequent",
# scored on the validation split using column 1 of predict_proba.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
dummy_val_scores = dummy.predict_proba(X_val)[:, 1]
print("Dummy AUC:", roc_auc_score(y_val, dummy_val_scores))

Dummy AUC: 0.5


In [53]:
# Shallow Tree
# Depth-3 decision tree as a simple, non-trivial baseline.
# random_state is pinned (42, matching the other seeded models in this notebook)
# because scikit-learn permutes the features at every split, so ties between
# equally good splits could otherwise resolve differently from run to run.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X_train, y_train)
print("Shallow Tree AUC:",
      roc_auc_score(y_val, tree.predict_proba(X_val)[:,1]))


Shallow Tree AUC: 0.6963779748210523


In [58]:
# RandomForest classifier
# 100 trees, fixed seed, single worker (n_jobs=1).
rForrest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=1,
)
rForrest.fit(X_train, y_train)

# Score the validation split with column 1 of predict_proba and report ROC AUC.
prob = rForrest.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, prob)
print("RandomForest (100 trees) ROC AUC:", round(auc, 4))


RandomForest (100 trees) ROC AUC: 0.9447


In [62]:
# LightGBM
# Hyperparameters collected in one place; is_unbalance=True is passed because
# the training labels are heavily skewed (see the positive/negative counts in
# the training log below).
lgbm_params = {
    "is_unbalance": True,
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": -1,
}
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

# Score the validation split with column 1 of predict_proba and report ROC AUC.
proba = lgbm.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, proba)
print("LightGBM (is_unbalance=True) ROC AUC:", round(auc, 4))

[LightGBM] [Info] Number of positive: 365, number of negative: 192599
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 192964, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001892 -> initscore=-6.268468
[LightGBM] [Info] Start training from score -6.268468
LightGBM (is_unbalance=True) ROC AUC: 0.907




1. Dummy AUC: 0.500  
2. Shallow Tree AUC: 0.696  
3. RandomForest ROC AUC: 0.945  
4. LightGBM ROC AUC: 0.907

### Tuning Random Forest Classifier

#### Randomized Search CV

#### Optuna