In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# insert other models you want to use here
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# replace filepath here
DATA_PATH = 'after_feature_selection_forward_10.csv'

# 'Date' is parsed as a datetime column while reading the file.
data = pd.read_csv(DATA_PATH, parse_dates=['Date'])
data.head()

Unnamed: 0,total_Amount_card_description_7,total_Amount_card_description_3,max_Amount_card_state_14,total_Amount_Cardnum_3,total_Amount_card_description_1,total_Amount_Cardnum_0,total_Amount_card_description_0,total_Amount_Cardnum_14,max_Amount_card_state_30,max_Amount_Cardnum_3,Date,Fraud,Amount
0,293.22,293.22,3.67,438.57,293.22,438.57,293.22,438.57,3.67,3.67,2010-01-01,0,3.62
1,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,2010-01-01,0,31.42
2,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,2010-01-01,0,178.49
3,93.2,93.2,3.8,182.84,93.2,182.84,93.2,182.84,3.8,3.8,2010-01-01,0,3.62
4,293.22,293.22,3.67,438.57,293.22,438.57,293.22,438.57,3.67,3.67,2010-01-01,0,3.62


In [3]:
# Display (rows, columns) of the loaded frame as a quick sanity check.
data.shape

(96397, 13)

In [4]:
date_col = data['Date']

# Modeling window: 2010-01-15 through 2010-10-31, both bounds inclusive.
window_mask = (date_col >= '2010-01-15') & (date_col <= '2010-10-31')
# Out-of-time (OOT) holdout: every row dated strictly after the modeling window.
holdout_mask = date_col > '2010-10-31'

# 'Date' is only used to build the two masks, so it is removed from both frames.
train_test = data.loc[window_mask].drop(columns=['Date'])
OOT = data.loc[holdout_mask].drop(columns=['Date'])

In [5]:
# Columns that are not model inputs: the label itself and the 'Amount' column.
non_feature_cols = ['Fraud', 'Amount']

# Features / labels for the modeling window.
X, y = train_test.drop(columns=non_feature_cols), train_test['Fraud']
# Features / labels for the out-of-time holdout.
X_OOT, y_OOT = OOT.drop(columns=non_feature_cols), OOT['Fraud']

In [6]:
def get_fdr(model, X, y, top_frac=0.03):
    """Fraud detection rate (FDR) inside the highest-scored slice of the rows.

    Rows are ranked by the model's predicted probability for class 1, the top
    ``top_frac`` share of rows is kept, and the result is the number of
    positives inside that slice divided by the number of positives in ``y``.

    The ``(model, X, y)`` call order matches how this function is passed as the
    ``scoring`` callable of the randomized search in this notebook; ``top_frac``
    has a default so that usage keeps working unchanged.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba(X)`` with the positive class
        in column 1.
    X : array-like
        Feature rows handed to ``model.predict_proba``.
    y : array-like
        0/1 labels, one per row of ``X`` and in the same row order. Pandas
        Series, NumPy arrays and plain lists are accepted.
    top_frac : float, default 0.03
        Share of rows (``0 < top_frac <= 1``) treated as "flagged". The slice
        size is ``int(n_rows * top_frac)``, i.e. rounded down.

    Returns
    -------
    float
        Detected positives / total positives, or NaN when ``y`` contains no
        positives (the ratio is undefined in that case).

    Raises
    ------
    ValueError
        If ``top_frac`` is not in the interval (0, 1].
    """
    if not 0 < top_frac <= 1:
        raise ValueError(f"top_frac must be in (0, 1], got {top_frac!r}")

    y_pred = model.predict_proba(X)[:, 1]

    # np.asarray discards any pandas index, so labels and scores are paired
    # strictly by position regardless of how y is indexed.
    ranked = pd.DataFrame({'y': np.asarray(y), 'y_pred': y_pred})

    actual_pos = ranked['y'].sum()
    if actual_pos == 0:
        # No positives to detect: return NaN explicitly instead of dividing 0 by 0.
        return float('nan')

    ranked = ranked.sort_values('y_pred', ascending=False)
    target_num = int(ranked.shape[0] * top_frac)
    detect_pos = ranked['y'].iloc[:target_num].sum()
    return detect_pos / actual_pos

In [7]:
# Display (rows, feature columns) of the out-of-time feature matrix.
X_OOT.shape

(12427, 10)

### Randomized Search

In [8]:
# use your model and hyperparameters

# Fixed seed so the sampled candidates, and the randomised tree options
# ('splitter', 'max_features'), are the same on every run.
SEARCH_SEED = 42

# decision tree
dt = DecisionTreeClassifier(random_state=SEARCH_SEED)

# knn
knn = KNeighborsClassifier()

# decision tree search space
# 'sqrt' replaces the former 'auto' option: for tree classifiers 'auto' was an
# alias of 'sqrt' that was deprecated and later removed from scikit-learn.
parameters_dt = {'max_features': ['sqrt', None, 'log2'],
                 'criterion': ['gini', 'entropy'],
                 'splitter': ['best', 'random'],
                 'max_depth': range(5, 100, 5),
                 'min_samples_split': [10, 20],
                 'min_samples_leaf': [10, 15, 20, 30, 40, 50, 60, 70],
                 'max_leaf_nodes': [None, 10, 20]}

# knn search space
# Only metrics that work without extra `metric_params` and with every listed
# `algorithm` are kept. 'seuclidean' (needs V), 'mahalanobis' (needs V or VI)
# and 'wminkowski' (needs weights w) fail when drawn without those arguments,
# and are not supported by 'kd_tree'.
# 'p' only affects the 'minkowski' metric.
parameters_knn = {'n_neighbors': range(1, 11, 1),
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'leaf_size': range(20, 60, 5),
                  'p': [1, 2],
                  'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}

# NOTE(review): the scaler is fit on all of X before cross-validation, so each
# validation fold influences the scaling statistics. Tree splits do not depend
# on feature scale, but if `knn` is searched instead, scaling should happen
# inside each fold (e.g. via a Pipeline).
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
searcher = RandomizedSearchCV(dt, parameters_dt, scoring=get_fdr, n_jobs=1, cv=10,
                              random_state=SEARCH_SEED)
searcher.fit(X_scaled, y)

print(searcher.best_params_)
print(searcher.best_score_)

{'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 30, 'max_leaf_nodes': None, 'max_features': None, 'max_depth': 40, 'criterion': 'gini'}
0.6285084202085005


### Modeling

#### K-Nearest Neighbors (KNN)

In [None]:
%%time
# Whether you want to resample or not
smote_flag = True

fdr_scores = pd.DataFrame(index=range(10), columns=['train fdr', 'test fdr', 'oot fdr'])

for m in ['euclidean','manhattan','chebyshev','minkowski','wminkowski','seuclidean',
                         'mahalanobis']:
    for k in range(9,21):
        for i in range(10):
            # Use your own model and hyperparameters here

            model = KNeighborsClassifier(n_neighbors=k,weights='uniform',metric=m)

            X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
            scaler = StandardScaler()
            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
            X_OOT_scaled = pd.DataFrame(scaler.transform(X_OOT), columns=X_OOT.columns)

            #Capped 6
            drop_index = ((X_train_scaled >= 6) | (X_train_scaled <= -6)).sum(axis=1)
            drop_index = drop_index[drop_index >= 1].index.to_list()
            X_train_scaled = X_train_scaled.drop(index=drop_index)
            y_train = pd.DataFrame(y_train).iloc[X_train_scaled.index, :]['Fraud']

            if smote_flag:
                y_neg_cnt = (y_train == 0).sum()
                smote = SMOTE(sampling_strategy={0: y_neg_cnt, 1: int(y_neg_cnt/10)})
                X_train_smo, y_train_smo = smote.fit_sample(X_train_scaled, y_train)
                model.fit(X_train_smo, y_train_smo)
            else:
                model.fit(X_train_scaled, y_train)

            fdr_scores.loc[i, 'train fdr'] = get_fdr(model, X_train_scaled, y_train)
            fdr_scores.loc[i, 'test fdr'] = get_fdr(model, X_test_scaled, y_test)
            fdr_scores.loc[i, 'oot fdr'] = get_fdr(model, X_OOT_scaled, y_OOT)

        print(fdr_scores.mean(),k,m)

#### Decision Tree (DT)

In [16]:
%%time
# Whether you want to resample or not
smote_flag = True

fdr_scores = pd.DataFrame(index=range(10), columns=['train fdr', 'test fdr', 'oot fdr'])
for i in range(10):
    # Use your own model and hyperparameters here

    model = DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10, max_leaf_nodes=10,min_samples_leaf=30,
                                   max_features='auto',min_samples_split=20
                                  )

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    X_OOT_scaled = pd.DataFrame(scaler.transform(X_OOT), columns=X_OOT.columns)

    #Capped 6
    drop_index = ((X_train_scaled >= 6) | (X_train_scaled <= -6)).sum(axis=1)
    drop_index = drop_index[drop_index >= 1].index.to_list()
    X_train_scaled = X_train_scaled.drop(index=drop_index)
    y_train = pd.DataFrame(y_train).iloc[X_train_scaled.index, :]['Fraud']

    if smote_flag:
        y_neg_cnt = (y_train == 0).sum()
        smote = SMOTE(sampling_strategy={0: y_neg_cnt, 1: int(y_neg_cnt/10)})
        X_train_smo, y_train_smo = smote.fit_sample(X_train_scaled, y_train)
        model.fit(X_train_smo, y_train_smo)
    else:
        model.fit(X_train_scaled, y_train)

    fdr_scores.loc[i, 'train fdr'] = get_fdr(model, X_train_scaled, y_train)
    fdr_scores.loc[i, 'test fdr'] = get_fdr(model, X_test_scaled, y_test)
    fdr_scores.loc[i, 'oot fdr'] = get_fdr(model, X_OOT_scaled, y_OOT)

print(fdr_scores.mean())

train fdr    0.622669
test fdr     0.703226
oot fdr      0.424581
dtype: float64
CPU times: user 1.38 s, sys: 95.2 ms, total: 1.48 s
Wall time: 1.48 s


In [17]:
# Per-repeat FDR on the train, test and OOT sets from the most recent modeling loop.
fdr_scores

Unnamed: 0,train fdr,test fdr,oot fdr
0,0.609504,0.714286,0.435754
1,0.605809,0.709677,0.441341
2,0.650104,0.718894,0.536313
3,0.619342,0.700461,0.430168
4,0.629938,0.732719,0.435754
5,0.638037,0.686636,0.536313
6,0.630802,0.728111,0.24581
7,0.586134,0.677419,0.351955
8,0.60251,0.668203,0.418994
9,0.654506,0.695853,0.413408
