In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# insert other models you want to use here
#from sklearn.linear_model import LogisticRegression
#from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
# replace filepath here
data = pd.read_csv('after_feature_selection_forward_10.csv', parse_dates=['Date'])
data.head()

Unnamed: 0,total_Amount_card_description_7,total_Amount_card_description_3,max_Amount_card_state_14,total_Amount_Cardnum_3,total_Amount_card_description_1,total_Amount_Cardnum_0,total_Amount_card_description_0,total_Amount_Cardnum_14,max_Amount_card_state_30,max_Amount_Cardnum_3,Date,Fraud,Amount
0,293.22,293.22,3.67,438.57,293.22,438.57,293.22,438.57,3.67,3.67,2010-01-01,0,3.62
1,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,2010-01-01,0,31.42
2,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,2010-01-01,0,178.49
3,93.2,93.2,3.8,182.84,93.2,182.84,93.2,182.84,3.8,3.8,2010-01-01,0,3.62
4,293.22,293.22,3.67,438.57,293.22,438.57,293.22,438.57,3.67,3.67,2010-01-01,0,3.62


In [3]:
train_test = data[(data.Date >= '2010-01-15') & (data.Date <= '2010-10-31')].drop(columns=['Date'])
OOT = data[data.Date > '2010-10-31'].drop(columns=['Date'])

In [4]:
X = train_test.drop(columns=['Fraud', 'Amount'])
y = train_test['Fraud']
X_OOT = OOT.drop(columns=['Fraud', 'Amount'])
y_OOT = OOT['Fraud']

In [5]:
def get_fdr(model, X, y):
    y_pred = model.predict_proba(X)[:, 1]
    temp = pd.DataFrame({'y': y, 'y_pred': y_pred})
    actual_pos = y.sum()
    temp = temp.sort_values('y_pred', ascending=False)
    target_num = int(temp.shape[0] * 0.03)
    temp = temp.iloc[:target_num, :]
    detect_pos = temp['y'].sum()
    fdr = detect_pos / actual_pos
    return fdr

### Randomized Search

In [7]:
# Example, use your model and hyperparameters
nn = MLPClassifier()

parameters = {'hidden_layer_sizes': [(10,1),(20,1)],
              'activation': ['relu','logistic'],
              'solver':['sgd','adam'],
              'max_iter': [200,400]}

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
searcher_nn = RandomizedSearchCV(nn, parameters, scoring=get_fdr, n_jobs=2,random_state=0, cv=10)
searcher_nn.fit(X_scaled, y)

print(searcher_nn.best_params_)
print(searcher_nn.best_score_)

{'solver': 'adam', 'max_iter': 200, 'hidden_layer_sizes': (10, 1), 'activation': 'logistic'}
0.6559743384121892


In [8]:
searcher_nn.cv_results_

{'mean_fit_time': array([5.99036419, 4.49756534, 2.65943437, 6.44868591, 8.62675674,
        4.22515974, 3.23007159, 4.84930155, 3.01313388, 8.69877777]),
 'std_fit_time': array([1.55651576, 1.49980956, 0.22835009, 0.59479035, 1.20163228,
        1.42893212, 0.99558026, 2.87526265, 1.32236304, 3.1413302 ]),
 'mean_score_time': array([0.00413671, 0.00296032, 0.00424562, 0.00375342, 0.00400558,
        0.00289359, 0.00307775, 0.00425076, 0.00386221, 0.00339775]),
 'std_score_time': array([0.00218531, 0.00047727, 0.0006383 , 0.00054113, 0.00043524,
        0.00065466, 0.00070442, 0.00086053, 0.00067595, 0.00079086]),
 'param_solver': masked_array(data=['adam', 'sgd', 'sgd', 'adam', 'adam', 'sgd', 'sgd',
                    'sgd', 'sgd', 'adam'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[200, 400, 200, 200, 200, 200, 400, 400, 400, 4

### Modeling

In [11]:
# Whether you want to resample or not
smote_flag = True

fdr_scores = pd.DataFrame(index=range(10), columns=['train fdr', 'test fdr', 'oot fdr'])
for i in range(10):
    # Use your own model and hyperparameters here
    model = MLPClassifier()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    X_OOT_scaled = pd.DataFrame(scaler.transform(X_OOT), columns=X_OOT.columns)
    
    #Capped 6
    drop_index = ((X_train_scaled >= 6) | (X_train_scaled <= -6)).sum(axis=1)
    drop_index = drop_index[drop_index >= 1].index.to_list()
    X_train_scaled = X_train_scaled.drop(index=drop_index)
    y_train = pd.DataFrame(y_train).iloc[X_train_scaled.index, :]['Fraud']
    
    if smote_flag:
        y_neg_cnt = (y_train == 0).sum()
        smote = SMOTE(sampling_strategy={0: y_neg_cnt, 1: int(y_neg_cnt/10)})
        X_train_smo, y_train_smo = smote.fit_resample(X_train_scaled, y_train)
        model.fit(X_train_smo, y_train_smo)
    else:
        model.fit(X_train_scaled, y_train)
        
    fdr_scores.loc[i, 'train fdr'] = get_fdr(model, X_train_scaled, y_train)
    fdr_scores.loc[i, 'test fdr'] = get_fdr(model, X_test_scaled, y_test)
    fdr_scores.loc[i, 'oot fdr'] = get_fdr(model, X_OOT_scaled, y_OOT)
    
fdr_scores



Unnamed: 0,train fdr,test fdr,oot fdr
0,0.826087,0.78341,0.547486
1,0.8375,0.792627,0.497207
2,0.809717,0.861751,0.547486
3,0.823158,0.83871,0.569832
4,0.824597,0.824885,0.575419
5,0.827515,0.820276,0.569832
6,0.826638,0.815668,0.575419
7,0.823529,0.861751,0.581006
8,0.83299,0.815668,0.47486
9,0.822511,0.788018,0.502793
