In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("vars_final_zscale.csv", index_col = 0)

In [3]:
# split data into out of date(after 11/1/10), train and test
oot_df = data.iloc[84300:,:]
trte_df = data.iloc[:84299,:]

In [4]:
train, test = train_test_split(trte_df, test_size=0.2)
train_lab = train["Fraud"]
train_fea = train.iloc[:,1:]
test_lab = test["Fraud"]
test_fea = test.iloc[:,1:]

In [5]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200]
}
# Create a based model
clf = RandomForestClassifier(n_estimators = 20, max_depth = 20, min_samples_split = 5, random_state = 42, n_jobs = 3)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = 3, verbose = 2)

In [None]:
%%time
# Fit the grid search to the data
grid_search.fit(train_fea, train_lab)
grid_search.best_params_


In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, train_fea, test_lab)

HONG Methond

In [5]:
def multipltrun(a=5,v=6,md=5,mf=5,ne=25):
    '''
    This function run model on different samples based on user input:
    "a" (int) how many randome samples, default as 5
    "v" (int) how many variables, default as 6 (most important ones from backward selection)
    users can modify model based on different machine learning algorithm and its parameters
    
    FDR is calculated by first sorting outcome in descending order and cut off at 3%,
    sum number of fraud records on top 3% and divided by total fraud racords for that sample
    
    Final output would be a dataframe contains FDR at 3% for training set, testing set, and oot. 
    '''
    #declare dict
    FDRdict={"train":[],"test":[],"oot":[]}
    
    for i in range(a):        
        #split training and testing
        train, test = train_test_split(trte_df, test_size=0.2,random_state=i)
        
        # split lables and features and v
        train_lab = train["Fraud"]
        train_fea = train.iloc[:,1:v+1]

        test_lab = test["Fraud"]
        test_fea = test.iloc[:,1:v+1]

        oot_lab=oot_df["Fraud"]
        oot_fea=oot_df.iloc[:,1:v+1]

        #define model
        model=RandomForestClassifier(n_estimators = ne, max_depth = md, min_samples_leaf = mf, random_state = 42)
        
        #fit model
        model.fit(train_fea,train_lab) #modify based on your model
    
        #calculate FDR
        for sets in ["train","test","oot"]:
            fea=vars()[sets+'_fea'] 
            lab=vars()[sets+'_lab']
            prob=pd.DataFrame(model.predict_proba(fea)) #modify based on your model
            result=pd.concat([pd.DataFrame(lab).reset_index(),prob],axis=1)
            topRows=int(round(len(result)*0.03))
            top3per=result.sort_values(by=1,ascending=False).head(topRows)
            FDR=sum(top3per.loc[:,'Fraud'])/sum(result.loc[:,'Fraud'])
            FDRdict[sets].append(FDR)
    
    #convert into dataframe
    FDR_df=pd.DataFrame(FDRdict)
   
    #add new row to calculate mean
    FDR_df.loc['mean']=FDR_df.mean()
    
    return FDR_df

In [13]:
%%time
multipltrun(a=20,v=15,md=5,mf=3,ne=100)

Wall time: 2min 37s


Unnamed: 0,train,test,oot
0,0.76703,0.69863,0.418994
1,0.762857,0.738889,0.407821
2,0.751085,0.724868,0.391061
3,0.753501,0.722892,0.430168
4,0.750696,0.746914,0.435754
5,0.780627,0.713483,0.346369
6,0.738372,0.71875,0.435754
7,0.744957,0.77957,0.374302
8,0.75565,0.767442,0.357542
9,0.755334,0.700565,0.435754


In [None]:
%%time
multipltrun(a=20,v=15,md=5,mf=3,ne=500)

In [None]:
%%time
multipltrun(a=20,v=15,md=7,mf=3,ne=500)

In [11]:
%%time
multipltrun(a=20,v=15,md=5,mf=3,ne=100)

Wall time: 2min 38s


Unnamed: 0,train,test,oot
0,0.76703,0.69863,0.418994
1,0.762857,0.738889,0.407821
2,0.751085,0.724868,0.391061
3,0.753501,0.722892,0.430168
4,0.750696,0.746914,0.435754
5,0.780627,0.713483,0.346369
6,0.738372,0.71875,0.435754
7,0.744957,0.77957,0.374302
8,0.75565,0.767442,0.357542
9,0.755334,0.700565,0.435754


In [13]:
%%time
multipltrun(a=20,v=15,mf=3,ne=100)

Wall time: 6min 54s


Unnamed: 0,train,test,oot
0,1.0,0.924658,0.469274
1,1.0,0.911111,0.502793
2,1.0,0.904762,0.513966
3,1.0,0.909639,0.463687
4,1.0,0.901235,0.50838
5,1.0,0.910112,0.480447
6,1.0,0.854167,0.553073
7,1.0,0.94086,0.463687
8,1.0,0.924419,0.497207
9,1.0,0.892655,0.52514


In [None]:
%%time
multipltrun(a=20,v=15,md=5,mf=3,ne=200)