In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# insert other models you want to use here
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier



In [None]:
# replace filepath here
data = pd.read_csv('after_feature_selection_forward_10.csv', parse_dates=['Date'])
data.head()

Unnamed: 0,total_Amount_card_description_7,total_Amount_Cardnum_7,total_Amount_card_description_3,max_Amount_card_state_0,max_Amount_card_state_14,total_Amount_Cardnum_3,total_Amount_card_description_1,total_Amount_Cardnum_1,total_Amount_Cardnum_0,total_Amount_card_description_0,...,avg_Amount_card_state_3,avg_Amount_card_zip_1,max_Amount_Cardnum_3,avg_Amount_card_zip_7,avg_Amount_card_description_30,avg_Amount_card_description_1,avg_Amount_card_merch_30,Amount,Date,Fraud
0,293.22,438.57,293.22,3.67,3.67,438.57,293.22,438.57,438.57,293.22,...,3.624545,3.624545,3.67,3.624545,3.62,3.62,3.624545,3.62,2010-01-01,0
1,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,...,31.42,31.42,31.42,31.42,31.42,31.42,31.42,31.42,2010-01-01,0
2,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,...,178.49,178.49,178.49,178.49,178.49,178.49,178.49,178.49,2010-01-01,0
3,93.2,182.84,93.2,3.8,3.8,182.84,93.2,182.84,182.84,93.2,...,3.731429,3.731429,3.8,3.731429,3.728,3.728,3.731429,3.62,2010-01-01,0
4,293.22,438.57,293.22,3.67,3.67,438.57,293.22,438.57,438.57,293.22,...,3.624545,3.624545,3.67,3.624545,3.62,3.62,3.624545,3.62,2010-01-01,0


In [None]:
train_test = data[(data.Date >= '2010-01-15') & (data.Date <= '2010-10-31')].drop(columns=['Date'])
OOT = data[data.Date > '2010-10-31'].drop(columns=['Date'])

In [None]:
X = train_test.drop(columns=['Fraud', 'Amount'])
y = train_test['Fraud']
X_OOT = OOT.drop(columns=['Fraud', 'Amount'])
y_OOT = OOT['Fraud']

In [None]:
def get_fdr(model, X, y):
    y_pred = model.predict_proba(X)[:, 1]
    temp = pd.DataFrame({'y': y, 'y_pred': y_pred})
    actual_pos = y.sum()
    temp = temp.sort_values('y_pred', ascending=False)
    target_num = int(temp.shape[0] * 0.03)
    temp = temp.iloc[:target_num, :]
    detect_pos = temp['y'].sum()
    fdr = detect_pos / actual_pos
    return fdr

### Randomized Search

In [None]:
# Example, use your model and hyperparameters
lgb = LGBMClassifier(n_jobs=1)

parameters = {'num_leaves': range(18, 31, 2),
              'max_depth': range(5, 11),
              'n_estimators':range(500, 1501, 100),
              'subsample': [0.6, 0.8, 1],
              'colsample_bytree': [0.6, 0.8, 1],
              'min_data_in_leaf': range(10, 81, 10),
              'learning_rate': [0.01,0.05,0.1,0.5,1]}

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
searcher_lgb = RandomizedSearchCV(lgb, parameters, scoring=get_fdr, n_jobs=2, random_state=0, cv=10)
searcher_lgb.fit(X_scaled, y)

print(searcher_lgb.best_params_)
print(searcher_lgb.best_score_)

{'subsample': 0.8, 'num_leaves': 28, 'n_estimators': 1100, 'min_data_in_leaf': 70, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 1}
0.7514835605453087


### Modeling

In [None]:
# Whether you want to resample or not
smote_flag = True

fdr_scores = pd.DataFrame(index=range(10), columns=['train fdr', 'test fdr', 'oot fdr'])
for i in range(10):
    # Use your own model and hyperparameters here
    model = LGBMClassifier()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    X_OOT_scaled = pd.DataFrame(scaler.transform(X_OOT), columns=X_OOT.columns)
    
    #Capped 6
    drop_index = ((X_train_scaled >= 6) | (X_train_scaled <= -6)).sum(axis=1)
    drop_index = drop_index[drop_index >= 1].index.to_list()
    X_train_scaled = X_train_scaled.drop(index=drop_index)
    y_train = pd.DataFrame(y_train).iloc[X_train_scaled.index, :]['Fraud']
    
    if smote_flag:
        y_neg_cnt = (y_train == 0).sum()
        smote = SMOTE(sampling_strategy={0: y_neg_cnt, 1: int(y_neg_cnt/10)})
        X_train_smo, y_train_smo = smote.fit_sample(X_train_scaled, y_train)
        model.fit(X_train_smo, y_train_smo)
    else:
        model.fit(X_train_scaled, y_train)
        
    fdr_scores.loc[i, 'train fdr'] = get_fdr(model, X_train_scaled, y_train)
    fdr_scores.loc[i, 'test fdr'] = get_fdr(model, X_test_scaled, y_test)
    fdr_scores.loc[i, 'oot fdr'] = get_fdr(model, X_OOT_scaled, y_OOT)
    
fdr_scores

Unnamed: 0,train fdr,test fdr,oot fdr
0,0.993377,0.861751,0.284916
1,0.988938,0.852535,0.312849
2,0.984305,0.843318,0.296089
3,0.984581,0.861751,0.301676
4,0.995585,0.861751,0.301676
5,0.988584,0.852535,0.26257
6,0.988789,0.857143,0.273743
7,0.978166,0.889401,0.335196
8,0.991131,0.829493,0.284916
9,0.982869,0.884793,0.284916
