### **1. Importing Liabraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
import time

### **2. Loading Datasets**

In [2]:
train_data=pd.read_csv("../data/train_data.csv",index_col=0)
test_data=pd.read_csv("../data/test_data.csv",index_col=0)

In [3]:
#convert category to dummy variables in both Train and Test datasets

train_data=pd.get_dummies(train_data, drop_first=True)
test_data=pd.get_dummies(test_data, drop_first=True)

### **3. Split the Train and Test Datasets into X_train,y_train,X_test,y_test**

In [4]:
# Since our data is already splitted in Train and Test data thats why we are not using splitting function to split the data instaed we will mnaully do that  
X_train = train_data.drop('is_fraud', axis = 1)
y_train = train_data['is_fraud'] 

X_test = test_data.drop('is_fraud', axis = 1) 
y_test = test_data['is_fraud'] 

In [5]:
X_train

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,age,trans_month,...,state_WI,state_WV,state_WY,label_Train,trans_week_days_Monday,trans_week_days_Saturday,trans_week_days_Sunday,trans_week_days_Thursday,trans_week_days_Tuesday,trans_week_days_Wednesday
0,2703186189652095,4.97,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,30.0,1,...,0,0,0,1,0,0,0,0,1,0
1,630423337322,107.23,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,40.0,1,...,0,0,0,1,0,0,0,0,1,0
2,38859492057661,220.11,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,56.0,1,...,0,0,0,1,0,0,0,0,1,0
3,3534093764340240,45.00,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,51.0,1,...,0,0,0,1,0,0,0,0,1,0
4,375534208663984,41.96,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,32.0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,30560609640617,43.77,40.4931,-91.8912,519,1388534347,39.946837,-91.333331,54.0,12,...,0,0,0,0,0,0,0,1,0,0
1852390,3556613125071656,111.84,29.0393,-95.4401,28739,1388534349,29.661049,-96.186633,21.0,12,...,0,0,0,0,0,0,0,1,0,0
1852391,6011724471098086,86.88,46.1966,-118.9017,3684,1388534355,46.658340,-119.715054,39.0,12,...,0,0,0,0,0,0,0,1,0,0
1852392,4079773899158,7.99,44.6255,-116.4493,129,1388534364,44.470525,-117.080888,55.0,12,...,0,0,0,0,0,0,0,1,0,0


### **4. Resampling Datasets**

In [6]:
method= SMOTE()
X_resampled, y_resampled = method.fit_resample(X_train, y_train)

In [7]:
X_resampled.shape

(3685486, 82)

In [8]:
y_resampled.shape

(3685486,)

### **5. Explore RandomForestClassifier through Cross-Validation**

In [10]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=4)
cross_val_score(rf, X_resampled, y_resampled, cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.00837989, 0.06028727, 0.99979813, 0.99984046])

In [None]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_resampled, y_resampled, cv=k_fold, scoring='accuracy', n_jobs=-1)

### **6. Exploring parameter settings using GridSearchCV**

In [11]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 100, 150],
        'max_depth': [10, 20, 30, None]}

gs = GridSearchCV(rf, param,cv=5, n_jobs=-1)
gs_fit = gs.fit(X_resampled, y_resampled)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

### **7. Build our own Grid-search**

In [20]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [21]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 0.982 / Recall: 0.407 / Accuracy: 0.997
Est: 10 / Depth: 20 ---- Precision: 0.998 / Recall: 0.754 / Accuracy: 0.999
Est: 10 / Depth: 30 ---- Precision: 1.0 / Recall: 0.916 / Accuracy: 1.0
Est: 10 / Depth: None ---- Precision: 1.0 / Recall: 0.959 / Accuracy: 1.0
Est: 50 / Depth: 10 ---- Precision: 0.999 / Recall: 0.287 / Accuracy: 0.996
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.728 / Accuracy: 0.999
Est: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.93 / Accuracy: 1.0
Est: 50 / Depth: None ---- Precision: 1.0 / Recall: 0.997 / Accuracy: 1.0
Est: 100 / Depth: 10 ---- Precision: 0.996 / Recall: 0.283 / Accuracy: 0.996
Est: 100 / Depth: 20 ---- Precision: 0.999 / Recall: 0.723 / Accuracy: 0.999
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.941 / Accuracy: 1.0
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0


### **8. Final evaluation**

In [11]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)


start = time.time()
y_pred = rf_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 105.734 / Predict time: 4.712 ---- Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0
