In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# insert other models you want to use here
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Input dataset: point the path below at your own copy of the CSV.
# The 'Date' column is parsed into datetimes so it can be compared against
# date strings when the data is split by time window.
data = pd.read_csv(
    'after_feature_selection_forward_10.csv',
    parse_dates=['Date'],
)
data.head()

FileNotFoundError: ignored

In [None]:
# In-time modeling window: 2010-01-15 through 2010-10-31, both ends inclusive.
train_test = data.loc[data['Date'].between('2010-01-15', '2010-10-31')].drop(columns=['Date'])
# Out-of-time (OOT) holdout: every record dated after 2010-10-31.
OOT = data.loc[data['Date'] > '2010-10-31'].drop(columns=['Date'])

In [None]:
# Normal test (currently disabled): use every column except 'Fraud' and 'Amount' as features
#X = train_test.drop(columns=['Fraud', 'Amount'])
#y = train_test['Fraud']
#X_OOT = OOT.drop(columns=['Fraud', 'Amount'])
#y_OOT = OOT['Fraud']

In [None]:
# Random forest test (with fewer variables): features are the first 8 columns
# by position, taken identically from the in-time and out-of-time frames.
# NOTE(review): positional slicing -- confirm the first 8 columns do not
# include 'Fraud' or 'Amount'.
X, X_OOT = (frame.iloc[:, :8] for frame in (train_test, OOT))
# Labels come from the 'Fraud' column of each frame.
y, y_OOT = train_test['Fraud'], OOT['Fraud']

In [None]:
def get_fdr(model, X, y, top_frac=0.03):
    """Fraud detection rate (FDR) within the highest-scored records.

    Scores every row of ``X`` with ``model.predict_proba`` (second column),
    keeps the ``top_frac`` share of rows with the highest scores, and returns
    the fraction of all positives in ``y`` that land inside that slice.
    The ``(model, X, y)`` argument order allows the function to be passed
    directly as a ``scoring`` callable.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X : feature matrix handed to ``model.predict_proba``.
    y : array-like of 0/1 labels, in the same row order as ``X``.
        Matching is positional, so any pandas index on ``y`` is ignored.
    top_frac : float, default 0.03
        Share of rows (by descending score) to inspect; must be in (0, 1].

    Returns
    -------
    float
        Detected positives divided by total positives, or ``nan`` when ``y``
        contains no positives.

    Raises
    ------
    ValueError
        If ``top_frac`` is outside (0, 1].
    """
    if not 0 < top_frac <= 1:
        raise ValueError("top_frac must be in (0, 1]")

    scores = np.asarray(model.predict_proba(X))[:, 1]
    labels = np.asarray(y).ravel()

    total_pos = labels.sum()
    if total_pos == 0:
        # Rate is undefined without positives; return nan explicitly rather
        # than relying on a 0/0 division warning.
        return float('nan')

    n_top = int(labels.shape[0] * top_frac)
    # Stable sort on the negated scores: descending order with ties resolved
    # by original row position, so the result is deterministic.
    top_rows = np.argsort(-scores, kind='stable')[:n_top]
    return float(labels[top_rows].sum() / total_pos)

### Randomized Search

In [None]:
# random forest randomized search
# A fixed random_state on the forest makes the cross-validated scores
# repeatable; the searcher's own random_state only fixes which parameter
# combinations are sampled.
rf = RandomForestClassifier(n_jobs=1, random_state=0)

# max_features is the number of columns considered at each split, so candidates
# above the number of columns in X are not valid distinct settings. The upper
# bound therefore follows X (still never above the original limit of 9).
parameters = {'n_estimators': range(50, 110, 10),
              'max_depth': range(500, 2000, 100),
              'max_features': range(1, min(9, X.shape[1]) + 1),
              'min_samples_leaf':  range(1, 10, 1),
              'min_samples_split': range(2, 10, 1),
              'criterion': ["gini", "entropy"]}

# Keep X's index on the scaled frame so it stays label-aligned with y.
# NOTE(review): the scaler is fitted on all of X before cross-validation, so
# each validation fold influences the scaling statistics.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# get_fdr is used as the scoring callable; 10-fold cross-validation.
searcher_rf = RandomizedSearchCV(rf, parameters, scoring=get_fdr, n_jobs=2, random_state=0, cv=10)
searcher_rf.fit(X_scaled, y)

print(searcher_rf.best_params_)
print(searcher_rf.best_score_)

{'n_estimators': 60, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 2, 'max_depth': 1500, 'criterion': 'entropy'}
0.7422614274258219


### Modeling

In [None]:
# Whether you want to resample or not
smote_flag = True

# One row per repetition. Float dtype keeps the columns numeric so that
# aggregations such as fdr_scores.mean() work without object-dtype surprises.
fdr_scores = pd.DataFrame(index=range(10), columns=['train fdr', 'test fdr', 'oot fdr'], dtype=float)
for i in range(10):
    # Use your own model and hyperparameters here
    # Seeding the forest, the split and SMOTE with the repetition number keeps
    # the 10 runs different from each other but reproducible across re-runs.
    model = RandomForestClassifier(n_estimators=80, criterion="gini", max_depth=1000,
                                   min_samples_split=10, min_samples_leaf=30,
                                   random_state=i)

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=i)

    # Fit the scaler on the training split only, then reuse it for test and OOT.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    X_OOT_scaled = pd.DataFrame(scaler.transform(X_OOT), columns=X_OOT.columns)

    # Outlier filter: drop (not cap) every training row in which any scaled
    # feature is >= 6 or <= -6. X_train_scaled carries a fresh 0..n-1 index,
    # so the mask is applied to y_train by position to keep rows and labels
    # paired.
    is_outlier = (X_train_scaled.abs() >= 6).any(axis=1).to_numpy()
    X_train_scaled = X_train_scaled.loc[~is_outlier]
    y_train = y_train.iloc[~is_outlier]

    if smote_flag:
        # Keep every negative and oversample positives to one tenth of the
        # negative count.
        y_neg_cnt = (y_train == 0).sum()
        smote = SMOTE(sampling_strategy={0: y_neg_cnt, 1: int(y_neg_cnt/10)}, random_state=i)
        # fit_resample replaces the fit_sample alias removed from imbalanced-learn.
        X_train_smo, y_train_smo = smote.fit_resample(X_train_scaled, y_train)
        model.fit(X_train_smo, y_train_smo)
    else:
        model.fit(X_train_scaled, y_train)

    # Train FDR is measured on the filtered, pre-SMOTE training rows.
    fdr_scores.loc[i, 'train fdr'] = get_fdr(model, X_train_scaled, y_train)
    fdr_scores.loc[i, 'test fdr'] = get_fdr(model, X_test_scaled, y_test)
    fdr_scores.loc[i, 'oot fdr'] = get_fdr(model, X_OOT_scaled, y_OOT)

fdr_scores

train fdr    0.914810
test fdr     0.851152
oot fdr      0.566480
dtype: float64