In [None]:
#Author : Hari Thapliyal
#Project: Credit Card Fraud Detection
#Client: Kaggle Competition
#Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
#github repo: https://github.com/dasarpai/CapProject-CreditFraud-Detection

#The best results of many experiments
#KNN: AUC 0.92, Recall 0.82, Precision 0.74, F1 0.78
#RFC: AUC 0.98, Recall 0.89, Precision 0.04, F1 0.07
#LGBM: AUC 0.93, Recall 0.83, Precision 0.09, F1 0.16

# Learnings

# Workflow in this Notebook

In [None]:
#!pip install -U pip
#!pip install -U imblearn
#!pip install importlib_metadata
#!pip install -U scikit-learn
#!pip install Catboost
#from importlib_metadata import version
#version('scikit-learn')

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Matplotlib library to plot the charts
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns

# Import Various Classical ML Algorithm Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import statsmodels.api as sm

#Import Tree Based Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBRFClassifier as xgbc
from catboost import CatBoostClassifier
import lightgbm as lgbmc

#Import Deep Learning Libraries
import tensorflow as tf
from keras import backend as K
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Flatten, BatchNormalization, Activation
from keras.layers import MaxPool1D, AvgPool1D, GlobalAvgPool1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras import optimizers
from keras.optimizers import Adam, SGD
from keras.regularizers import l2, L1L2
from sklearn.linear_model import Perceptron


#Import Libraries for Model Evaluation
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

#Import Helping Libraries
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from imblearn import over_sampling
from collections import Counter
import itertools
from timeit import default_timer as timer

In [None]:
#So that matplot produces all the graphs in the cell below from where the graph is called. 
#Otherwise it will open in the different window.
%matplotlib inline 

# Show up to 31 columns so that pandas does not elide the middle ones with "..." when displaying a frame.
pd.options.display.max_columns = 31 

sns.set_style("white")

# Shuffled, seeded 5-fold stratified splitter: each fold keeps approximately the same
# normal/fraud class proportions as the data passed to it (not equal counts of both classes).
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

## <font color=red>Run the expensive, time- and resource-consuming cross-validation?</font>

In [None]:
# Running cross-validation every time is slow.
# Set this to False to skip the grid searches and run only with the
# already-optimized hyperparameters; set it to True to run the *_cv functions in full.

run_cv=False

## <font color=blue> Section 1: Exploratory data analysis </font>

#### Loading the dataset from Kaggle into the notebook

In [None]:
# Relative path to the CSV (the ../input layout looks like a Kaggle kernel — adjust when running elsewhere).
# The commented-out lines are alternative sources kept for reference.
#!wget "https://www.kaggle.com/mlg-ulb/creditcardfraud"
filename = r'../input/creditcardfraud/creditcard.csv'
#filename=r'D:\01-Works\00-Doc-to-Sync\0-Download\Creditcard\creditcard.csv'

df = pd.read_csv(filename)
print (df.shape)

In [None]:
# Look at the first three rows to see which feature types are present in the data.
df.head(3)

In [None]:
# Fraction of all rows that belongs to each value of Class.
df['Class'].value_counts()/len(df)

In [None]:
df.info()
# Observation from the output: no null values, and the fields are numeric (float).

In [None]:
df.describe().T
# The ranges of the fields are not the same. If the data is used as given, some variables will
# have more influence on the model than others, so scaling needs to be done.

In [None]:
# Can Time be used as a primary-key field?
# Count the rows per Time value and list the values that occur more than once.
# (The original also called reset_index() and discarded its result, which had no effect.)
time_counts = df.groupby('Time')['Class'].count()
print(time_counts[time_counts > 1].sort_values(ascending=False))
del time_counts  # temporary helper, not needed by later cells

# The Time field cannot be treated as a unique identifier because it has many duplicate entries.
# It may still be useful as a feature for predicting fraud.

In [None]:
df['Amount'].describe(percentiles=[.25,.50,.75,.90,.95,.99])
# Observation from the output: 99% of transactions are less than $1018.

In [None]:
# Split the data at the 99th-percentile amount read off the describe() output above:
# df99 holds transactions up to that amount, df01 the remaining high-value ones.
amt99 = 1018
at_or_below_p99 = df['Amount'] <= amt99
above_p99 = df['Amount'] > amt99
df99 = df.loc[at_or_below_p99]
df01 = df.loc[above_p99]

In [None]:
# Distribution of each class along Time, drawn separately for the transactions up to the
# 99th-percentile amount (df99) and for the high-value remainder (df01).
start = timer()

# A single figure with two panels. The original additionally called plt.figure(), which
# produced a second, empty figure in the cell output.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 4))

panels = [
    (ax1, df99, "Distribution of class along the time - 99 Percentile of Transactions"),
    (ax2, df01, "Distribution of class along the time - 1 Percentile of Transactions"),
]
for ax, subset, title in panels:
    # Filter each subset with its own Class column rather than with a mask built from df.
    class_0 = subset.loc[subset['Class'] == 0, "Time"]
    class_1 = subset.loc[subset['Class'] == 1, "Time"]
    sns.distplot(class_0, hist=False, rug=False, label='Not Fraud', color="red", ax=ax,
                 kde_kws=dict(linewidth=5)).set(xlim=0)
    sns.distplot(class_1, hist=False, rug=False, label='Fraud', color="blue", ax=ax)
    sns.distplot(subset.Time, hist=False, rug=False, label='All-Transactions', color="yellow", ax=ax)
    ax.set_title(title)
    ax.legend(loc='upper left')

plt.show()

end = timer()
print("Duration ", end - start)

#The distributions of fraud and non-fraud transactions are different.
#Normal transactions and the total of all transactions follow the same distribution, because fraud transactions are very few.
#We have two days of data, so normal transactions show two peaks.
#I am not able to identify any significant pattern in fraud transactions from two days of data.

In [None]:
# Distribution of each class over Amount, drawn separately for the transactions up to the
# 99th-percentile amount (df99) and for the high-value remainder (df01).
start = timer()

# A single figure with two panels. The original additionally called plt.figure(), which
# produced a second, empty figure in the cell output.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 4))

panels = [
    (ax1, df99, "Distribution of class with Amount - 99 Percentile of Transactions"),
    (ax2, df01, "Distribution of class with Amount - 1 Percentile of Transactions"),
]
for ax, subset, title in panels:
    # Filter each subset with its own Class column rather than with a mask built from df.
    class_0 = subset.loc[subset['Class'] == 0, "Amount"]
    class_1 = subset.loc[subset['Class'] == 1, "Amount"]
    sns.distplot(class_0, hist=False, rug=False, label='Not Fraud', color="red", ax=ax,
                 kde_kws=dict(linewidth=5)).set(xlim=0)
    sns.distplot(class_1, hist=False, rug=False, label='Fraud', color="blue", ax=ax)
    sns.distplot(subset.Amount, hist=False, rug=False, label='All-Transactions', color="yellow", ax=ax)
    ax.set_title(title)
    ax.legend(loc='upper right')

plt.show()

end = timer()
print("Duration ", end - start)

#The distributions of fraud and non-fraud transactions are different.
#Normal transactions and the total of all transactions follow the same distribution, because fraud transactions are very few.
#In the 99-percentile subset we see that most of the transactions are less than $200.
#In the top 1 percent we see a right-tailed bell curve with a mean around $3000.
#Interestingly, for this top 1 percent of the data we see a spike of fraud transactions around this mean.

#### Analysis of Fraud against the Transaction Value

In [None]:
# Bin Amount into 100-wide buckets up to 1600 plus one catch-all bucket that reaches
# slightly beyond the largest amount (max * 1.04, rounded to the nearest thousand).
max_amount = int(round(max(df.Amount) * 1.04, -3))
bins = list(range(0, 1601, 100))
bins.append(max_amount)
# include_lowest=True keeps transactions with Amount == 0 in the first bucket; with the default
# (left-open first interval) they would become NaN and drop out of the per-bucket statistics.
df['Amt'] = pd.cut(df.Amount, bins, include_lowest=True)

In [None]:
# Count transactions per amount bucket and class. fill_value=0 makes a bucket without any
# transaction of one class count as 0 instead of NaN, so the 'All' total below stays valid.
all_trans = df.pivot_table(index="Amt", columns="Class", values="V1", aggfunc=len, fill_value=0)
all_trans['All'] = all_trans[0] + all_trans[1]
all_trans = all_trans.drop(0, axis=1)
all_trans.columns = ['Fraud', 'All']
# Share of each bucket in the total number of fraud rows and in the total number of rows.
all_trans['Fraud %'] = all_trans['Fraud'] / len(df[df.Class == 1]) * 100
all_trans['All %'] = all_trans['All'] / len(df.Class) * 100
all_trans
#79.5% of the transactions are less than $100 but fraud in that category is 68%.
#There are 9.57% transactions between $200 and 1600 but fraud is 16.86%

In [None]:
# Line plot of the per-bucket share of all transactions against the per-bucket share of fraud.
all_trans[['All %','Fraud %']].plot(figsize=(20, 4))
plt.show()

#### Let us see distribution of various columns

In [None]:
start = timer()

# One small distribution panel for each of the first 30 columns, laid out on a 3 x 11 grid.
plt.figure(figsize=(20, 8))
for panel, col in enumerate(df.columns[0:30], start=1):
    plt.subplot(3, 11, panel)
    sns.distplot(df[col])
plt.tight_layout()

end = timer()
print("Duration ", end - start)

#Except Time and Amount, all fields look like single-peak bell curves, although some of the fields are skewed to the right
#and some look skewed to the left. So we can use the power transformer (yeo-johnson) to handle this issue.

#### Here we will observe the distribution of our classes

In [None]:
# classes=df['Class'].value_counts()
# normal_share=classes[0]/df['Class'].count()*100
# fraud_share=classes[1]/df['Class'].count()*100

# amt=100*df.groupby("Class").sum()["Amount"] / sum(df.Amount)

# print ("Normal Transaction {:.2f}  Fraud Transaction {:.2f}  ".format (classes[0],classes[1]))
# print ("Normal Transaction {:.2f}%  Fraud Transaction {:.2f}%  ".format (normal_share,fraud_share))
# print ("Value of Normal Transaction {:.2f}%  Fraud Transaction {:.2f}%  ".format (amt[0],amt[1]))

# fraud_amt=df.loc[df['Class'] == 1]["Amount"]
# print ("\nAverage Value {:.2f} Min Value {:.2f}  Max Value {:.2f} Fraud Transactions".format( np.average(fraud_amt), np.min(fraud_amt), np.max(fraud_amt)))

# fraud_0amt_trans=len ( df.loc[  (df['Class'] == 1) & (df['Amount']==0)  ] )
# print ("\n# Fraud Transactions of 0 Value = ",fraud_0amt_trans)

#### Let us Check Correlation Between Different Variables

In [None]:
# Correlate only the numeric columns: at this point df still carries the categorical 'Amt'
# bucket column created for binning, which corr() cannot handle on recent pandas versions.
df_corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(16,8))
sns.heatmap(df_corr, cmap="YlGnBu") # Displaying the Heatmap
#sns.set(font_scale=.5,style='white')

plt.title('Heatmap correlation')
plt.show()

#Since this is PCA data, there appears to be no relationship between the given variables, except that Time and Amount have
#some sort of relationship with other fields.

#### Is there a relationship between the variables and Class?

In [None]:

# Table of every variable's correlation with Class: the absolute strength plus its sign,
# displayed from the strongest to the weakest relation.
class_corr = df_corr['Class']
rank = pd.DataFrame({
    "Degree of Relation with Class": class_corr.abs(),
    "Relationship": np.where(class_corr > 0, "+", "-"),
})
rank.sort_values("Degree of Relation with Class", ascending=False)

#Some variables show a positive relationship with fraud and some a negative one. For example, V17 and fraud share a negative
#relation: the higher the value of V17, the lower the chance of the transaction being fraud.
#V11 has a positive relationship: the higher the value of V11, the higher the chance that the transaction is fraud.
#The value of the transaction (Amount) has a very weak relation with Class (fraud / non-fraud transaction), only .005 or .5 %
#The time of the transaction also has a very weak relation with Class (fraud / non-fraud transaction), only .012 or 1.2 %

#### <font color=red>Note 1: Because I don't have enough computing resources available (I tried nimblebox, Google Colab and Kaggle, but those were also too slow), I am NOT using the full given dataset for model building.</font> <br>

#### <font color=red>Note 2: We need modular code to run the same models with different parameters and different data-imbalance treatments; otherwise it will be extremely difficult to maintain the code. Therefore I have taken a different approach, which you can observe while scanning and running the code.
</font>

In [None]:
# 'Amt' was created only for binning and is not required for modeling.
# errors="ignore" keeps this cell re-runnable once the column has already been removed.
df = df.drop(columns="Amt", errors="ignore")

In [None]:
# Training on the full dataset takes too long, so initially work with 5% of the normal
# transactions. The fraud class is very small, so all of its rows are kept.
# Once the code works and predicts reasonably - or if you have enough resources - disable this cell.
#
# The 5% sample is drawn from the non-fraud rows only. Sampling from the whole frame and then
# appending every fraud row would add the fraud rows that happened to be sampled a second time.
# NOTE(review): re-running this cell samples again from the already reduced frame.
is_fraud = df.Class == 1
df = pd.concat([df.loc[~is_fraud].sample(frac=.05, random_state=1), df.loc[is_fraud]])
print (df.shape)

#### Let us see Distribution of data for the given 2 Classes

In [None]:

start = timer()

# One boxplot per column, split by Class, on an 8 x 4 grid.
color = sns.color_palette("Set1", 6)
plt.figure(figsize=(20, 20))

for i, col in enumerate(df.columns, start=1):
    plt.subplot(8, 4, i)
    ax = sns.boxplot(x=df['Class'], y=df[col], palette=color)

    for p in ax.patches:
        # Only patches that expose get_height() (bar-like rectangles) can be labelled this way.
        # On newer matplotlib the box artists themselves appear in ax.patches and have no
        # get_height(), which made the unguarded loop fail with AttributeError.
        if not hasattr(p, "get_height"):
            continue
        ax.annotate(format(p.get_height(), '.2f'),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center',
                    fontsize=8,
                    xytext=(0, 10), textcoords='offset points')
plt.tight_layout()

end = timer()
print("Duration ", end - start)

#Almost all fields have outlier values.
#The median of the data for both classes is almost the same for all variables.
#The IQR is significantly different for some of the variables, like V1, V4, V5, V12, V14, V16, V17, V18.

#### Creating scatter plots
#### To observe the relation between Amount and the other variables for both classes.

In [None]:

start = timer()

# Split the rows by class once; the original re-filtered df four times for every column.
normal_rows = df[df.Class == 0]
fraud_rows = df[df.Class == 1]

# One panel per column on a 16 x 2 grid: Amount against that column, with a fitted line
# for non-fraud (green) and fraud (red) rows.
plt.figure(figsize=(20, 80))
for i, col in enumerate(df.columns, start=1):
    plt.subplot(16, 2, i)

    sns.regplot(y=normal_rows.Amount, x=normal_rows[col], color="g")
    ax = sns.regplot(y=fraud_rows.Amount, x=fraud_rows[col], color="r")
    ax.tick_params(labelsize=15)
    ax.set_xlabel(col, fontsize=15)
plt.tight_layout()

end = timer()
print("Duration ", end - start)

#For most of the variables, the fitted line of Amount for non-fraud transactions has some slope,
#but there is almost no slope for fraud transactions.

### Standard Scale All the Fields Including Time and Amount

In [None]:
# Every column except the target 'Class' is a feature to be scaled.
cols2Scale = list(df.columns.drop("Class"))

In [None]:
# Standardize every feature column (everything except Class) and write the result back into df.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test split made later in
# this notebook, so test rows influence the scaling statistics — consider fitting on train only.
sc=StandardScaler()
t=sc.fit_transform(df[ cols2Scale ])
df[cols2Scale] = t
df.head(5)

In [None]:
# Summary statistics after scaling, one row per column.
df.describe().T

### Plotting the distribution of a variable- After Scaling

In [None]:
# Plot the distribution of every scaled variable to inspect the remaining skewness.
start = timer()

plt.figure(figsize=(20, 8))
# Same column slice as the pre-scaling grid (first 30 columns). The original slice [1:30]
# skipped the first column, so the two grids were not comparable.
for i, col in enumerate(df.columns[0:30], start=1):
    plt.subplot(3, 11, i)
    sns.distplot(df[col])
plt.tight_layout()

end = timer()
print("Duration ", end - start)

### Some fields are still skewed, so we use a power transform to fix that issue.
- The <b>power transform</b> from the <b>preprocessing module of sklearn</b> makes the distributions more Gaussian

In [None]:
# Reduce skewness by applying sklearn's power_transform to all feature columns of df.
# 'yeo-johnson' [1] works with positive and negative values
# 'box-cox' [2] only works with strictly positive values
# NOTE(review): the transform is fitted on the full frame before the train/test split — TODO confirm that is intended.

df[cols2Scale] = power_transform(df[cols2Scale], method='yeo-johnson')

### Plotting the distribution of a variable - After Correcting the Skewness Issue

In [None]:
# Re-plot the distribution of every variable after the power transform to check the skewness again.
start = timer()

plt.figure(figsize=(20, 8))
# Same column slice as the pre-scaling grid (first 30 columns). The original slice [1:30]
# skipped the first column, so the two grids were not comparable.
for i, col in enumerate(df.columns[0:30], start=1):
    plt.subplot(3, 11, i)
    sns.distplot(df[col])
plt.tight_layout()

end = timer()
print("Duration ", end - start)

In [None]:
# Re-check the correlation between the variables on the transformed data,
# annotated with the coefficients rounded to one decimal.
df_corr = df.corr()
plt.figure(figsize=(16, 8))
heatmap_options = dict(annot=True, cbar=False, cmap="YlGnBu")
sns.heatmap(df_corr.round(1), **heatmap_options)

plt.title('Heatmap correlation')
plt.show()

#After scaling and normalising the variables, it looks like there is some kind of relationship between different fields.

### Check the outliers after fixing skewness & scale issue

In [None]:
# Distribution of the data across the 2 classes after scaling and the power transform.
start = timer()

# One boxplot per column, split by Class, on an 8 x 4 grid.
color = sns.color_palette("Set1", 6)
plt.figure(figsize=(20, 20))

for i, col in enumerate(df.columns, start=1):
    plt.subplot(8, 4, i)
    ax = sns.boxplot(x=df['Class'], y=df[col], palette=color)

    for p in ax.patches:
        # Only patches that expose get_height() (bar-like rectangles) can be labelled this way.
        # On newer matplotlib the box artists themselves appear in ax.patches and have no
        # get_height(), which made the unguarded loop fail with AttributeError.
        if not hasattr(p, "get_height"):
            continue
        ax.annotate(format(p.get_height(), '.2f'),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center',
                    fontsize=8,
                    xytext=(0, 10), textcoords='offset points')
plt.tight_layout()

end = timer()
print("Duration ", end - start)

#Now the dataset looks quite balanced between fraud and non-fraud transactions:
#almost the same IQR width and almost the same median value for fraud/non-fraud for each variable.

# <font color=blue> Section 2: Splitting the data into Train & Test</font>

In [None]:
def split_data_normal(df, test_size=0.25, random_state=100):
    """Split a frame into stratified train and test sets.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the target column ``Class``.
    test_size : share of rows placed in the test set (default 0.25, the original value).
    random_state : seed for the shuffle (default 100, the original value).

    Returns
    -------
    tuple ``(X_train, X_test, y_train, y_test)``; the split is stratified on ``Class`` so
    both parts keep the class proportions of ``df``.
    """
    # The original passed axis=0 together with columns=, which is contradictory; columns= alone
    # is what selects the column to drop.
    X = df.drop(columns=["Class"])
    y = df.Class  # class variable

    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state)
    return (X_train1, X_test1, y_train1, y_test1)

def split_data_ros(X_train1, X_test1, y_train1, y_test1):
    """Balance the training set with RandomOverSampler (seed 100).

    Only the training data is resampled; the test data is returned untouched.
    Returns ``(X_train_resampled, X_test1, y_train_resampled, y_test1)``.
    """
    sampler = over_sampling.RandomOverSampler(random_state=100)
    X_balanced, y_balanced = sampler.fit_resample(X_train1, y_train1)
    return (X_balanced, X_test1, y_balanced, y_test1)

def split_data_smote(X_train1, X_test1, y_train1, y_test1):
    """Balance the training set with SMOTE (seed 100).

    Only the training data is resampled; the test data is returned untouched.
    Returns ``(X_train_resampled, X_test1, y_train_resampled, y_test1)``.
    """
    sampler = over_sampling.SMOTE(random_state=100)
    X_balanced, y_balanced = sampler.fit_resample(X_train1, y_train1)
    return (X_balanced, X_test1, y_balanced, y_test1)

def split_data_adasyn(X_train1, X_test1, y_train1, y_test1):
    """Balance the training set with ADASYN (seed 100).

    Only the training data is resampled; the test data is returned untouched.
    Returns ``(X_train_resampled, X_test1, y_train_resampled, y_test1)``.
    """
    sampler = over_sampling.ADASYN(random_state=100)
    X_balanced, y_balanced = sampler.fit_resample(X_train1, y_train1)
    return (X_balanced, X_test1, y_balanced, y_test1)

In [None]:
# Build the four dataset variants: the plain stratified split, then three oversampled versions
# of its training part (random oversampling, SMOTE, ADASYN). All variants share the same test set.
X_train1, X_test1, y_train1, y_test1  = split_data_normal(df)
X_train_ros, X_test_ros, y_train_ros, y_test_ros              = split_data_ros(X_train1, X_test1, y_train1, y_test1)
X_train_smote, X_test_smote, y_train_smote, y_test_smote      = split_data_smote(X_train1, X_test1, y_train1, y_test1)
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn  = split_data_adasyn(X_train1, X_test1, y_train1, y_test1)

In [None]:
# Class counts in the plain (not oversampled) train and test targets.
y_train1.value_counts(), y_test1.value_counts()

# <font color=blue>Section 3: Visualise Results of Various Oversampling Methods</font>

### Random Oversampling

In [None]:
# Rows beyond the original training length are taken as the artificially added ones
# (assumes the sampler appends new rows after the original rows — TODO confirm).
X_train_ros_1 = X_train_ros[X_train1.shape[0]:]

# Actual training rows of each class as plain arrays.
X_train_1 = X_train1.to_numpy()[(y_train1 == 1.0).to_numpy()]
X_train_0 = X_train1.to_numpy()[(y_train1 == 0.0).to_numpy()]
n_fraud = X_train_1.shape[0]

plt.rcParams['figure.figsize'] = [20, 10]
fig, (ax_actual, ax_artificial, ax_classes) = plt.subplots(1, 3)

# Panel 1: actual class-1 rows, first feature against second feature.
ax_actual.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_actual.legend()

# Panel 2: actual class-1 rows against an equal number of artificial class-1 rows.
ax_artificial.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_artificial.scatter(X_train_ros_1.iloc[:n_fraud, 0],
                      X_train_ros_1.iloc[:n_fraud, 1],
                      label='Artificial ROS Class-1 Examples')
ax_artificial.legend()

# Panel 3: actual class-1 rows against an equal number of actual class-0 rows.
ax_classes.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_classes.scatter(X_train_0[:n_fraud, 0],
                   X_train_0[:n_fraud, 1], label='Actual Class-0 Examples')
ax_classes.legend()

### SMOTE : Synthetic Minority Over-sampling Technique

In [None]:
# Rows beyond the original training length are taken as the artificially added ones
# (assumes the sampler appends new rows after the original rows — TODO confirm).
X_train_smote_1 = X_train_smote[X_train1.shape[0]:]

# Actual training rows of each class as plain arrays.
X_train_1 = X_train1.to_numpy()[(y_train1 == 1.0).to_numpy()]
X_train_0 = X_train1.to_numpy()[(y_train1 == 0.0).to_numpy()]
n_fraud = X_train_1.shape[0]

plt.rcParams['figure.figsize'] = [20, 10]
fig, (ax_actual, ax_artificial, ax_classes) = plt.subplots(1, 3)

# Panel 1: actual class-1 rows, first feature against second feature.
ax_actual.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_actual.legend()

# Panel 2: actual class-1 rows against an equal number of artificial class-1 rows.
ax_artificial.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_artificial.scatter(X_train_smote_1.iloc[:n_fraud, 0],
                      X_train_smote_1.iloc[:n_fraud, 1],
                      label='Artificial SMOTE Class-1 Examples')
ax_artificial.legend()

# Panel 3: actual class-1 rows against an equal number of actual class-0 rows.
ax_classes.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_classes.scatter(X_train_0[:n_fraud, 0], X_train_0[:n_fraud, 1], label='Actual Class-0 Examples')
ax_classes.legend()

### ADASYN : Adaptive Synthetic Sampling Method

In [None]:
# Rows beyond the original training length are taken as the artificially added ones
# (assumes the sampler appends new rows after the original rows — TODO confirm).
X_train_adasyn_1 = X_train_adasyn[X_train1.shape[0]:]

# Actual training rows of each class as plain arrays.
X_train_1 = X_train1.to_numpy()[(y_train1 == 1.0).to_numpy()]
X_train_0 = X_train1.to_numpy()[(y_train1 == 0.0).to_numpy()]
n_fraud = X_train_1.shape[0]

plt.rcParams['figure.figsize'] = [20, 10]
fig, (ax_actual, ax_artificial, ax_classes) = plt.subplots(1, 3)

# Panel 1: actual class-1 rows, first feature against second feature.
ax_actual.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_actual.legend()

# Panel 2: actual class-1 rows against an equal number of artificial class-1 rows.
ax_artificial.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_artificial.scatter(X_train_adasyn_1.iloc[:n_fraud, 0],
                      X_train_adasyn_1.iloc[:n_fraud, 1],
                      label='Artificial ADASYN Class-1 Examples')
ax_artificial.legend()

# Panel 3: actual class-1 rows against an equal number of actual class-0 rows.
ax_classes.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
ax_classes.scatter(X_train_0[:n_fraud, 0], X_train_0[:n_fraud, 1], label='Actual Class-0 Examples')
ax_classes.legend()

# <font color=blue> Section 4: Select the Dataset Imbalance Treatment Method</font>

In [None]:
model_with_data_list=['Normal','RandomOverSampler','Smote','AdaSyn']
def select_dataset(option=-1):
    """Return the train/test split that belongs to an imbalance-treatment option.

    Parameters
    ----------
    option : int
        Index into ``model_with_data_list``: 0 = plain split, 1 = RandomOverSampler,
        2 = SMOTE, 3 = ADASYN. The default -1 is not a valid choice.

    Returns
    -------
    tuple ``(X_train, X_test, y_train, y_test)`` taken from the module-level variables created
    by the split cells.

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported indices. The original fell through every
        branch, printed a header for the wrong dataset and then failed with UnboundLocalError.
    """
    # Validate first, before any module-level dataset is touched or anything is printed.
    if option not in range(len(model_with_data_list)):
        raise ValueError(
            "option must be one of 0..{} ({}), got {!r}".format(
                len(model_with_data_list) - 1, ", ".join(model_with_data_list), option))

    if option==0:
        X_train, X_test, y_train, y_test = X_train1, X_test1, y_train1, y_test1
    elif option==1:
        X_train, X_test, y_train, y_test = X_train_ros, X_test_ros, y_train_ros , y_test_ros
    elif option==2:
        X_train, X_test, y_train, y_test = X_train_smote, X_test_smote, y_train_smote , y_test_smote
    else:
        X_train, X_test, y_train, y_test = X_train_adasyn, X_test_adasyn, y_train_adasyn , y_test_adasyn

    print ("\n\nRunning Model with **",model_with_data_list[option],"Data")

    # Summing the 0/1 targets counts the fraud rows (including rows added by oversampling).
    print('Transaction Records in Train',len(y_train))
    print('Transaction Records in Test',len(y_test))
    print('Total Fraud Transaction Records',np.sum(y_train) + np.sum(y_test))
    print('Fraud Transaction Records in Train',np.sum(y_train))
    print('Fraud Transaction Records in Test',np.sum(y_test))

    return (X_train, X_test, y_train, y_test)


# <font color=blue> Section 5: Helper Functions for Model Building </font>

In [None]:
# Plotting cv results
def draw_cv_results(cv_df, param_name, metric_name, title_name, log_true):
    """Plot mean train and test CV scores against one hyperparameter.

    cv_df       : frame built from ``GridSearchCV.cv_results_`` (needs the columns
                  ``mean_train_score`` and ``mean_test_score`` plus ``param_name``).
    param_name  : column holding the hyperparameter values (x axis).
    metric_name : label of the score, used for the y axis and the legend.
    title_name  : plot title.
    log_true    : when truthy, the x axis shows log10 of the parameter values.
    """
    plt.figure(figsize=(10,4))

    param_values = list(cv_df[param_name])
    x_axis = np.log10(param_values) if log_true else param_values
    x_axis_title = ("Log Value " + param_name) if log_true else param_name

    for score_column in ('mean_train_score', 'mean_test_score'):
        plt.plot(x_axis, cv_df[score_column])

    plt.xlabel(x_axis_title)
    plt.ylabel(metric_name)
    plt.title(title_name)
    legend_labels = ['Train ' + metric_name + ' score', 'Test ' + metric_name + ' score']
    plt.legend(legend_labels, loc='upper left')


In [None]:
# Function to plot the confusion Matrix
def draw_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.tab10):
    """Draw a confusion matrix on the current axes.

    cm      : 2-D integer array of counts (rows = true labels, columns = predicted labels).
    classes : tick labels for both axes.
    title   : plot title.
    cmap    : colormap handed to ``imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0, fontsize=20)
    plt.yticks(positions, classes, fontsize=20)

    half_of_max = cm.max() / 2.

    # Walk the cells row by row and write each count at the centre of its cell.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            cell_color = "red" if count < half_of_max else "blue"
            plt.fill(col, row, facecolor=cell_color, edgecolor='b', linewidth=2)

            plt.text(col, row, format(count, 'd'), fontsize=20, weight="bold",
                     verticalalignment='center',
                     horizontalalignment="center",
                     color="white",
                     bbox=dict(facecolor='red', alpha=0.8))

    plt.tight_layout()
    plt.ylabel('True label',fontsize=18)
    plt.xlabel('Predicted label', fontsize=18)

In [None]:
class_names=[0,1]
def draw_roc( actual, probs, prob_values=True, Threshold_limit=0.5 ):
    """Plot the ROC curve and confusion matrix and return the test metrics.

    Parameters
    ----------
    actual : true 0/1 labels.
    probs : predicted scores for class 1 (any array-like), or hard 0/1 predictions when
        ``prob_values`` is False.
    prob_values : when True, ``probs`` is converted to labels with the threshold that maximises
        ``tpr - fpr`` on the ROC curve; when False, ``probs`` is used as the labels directly.
    Threshold_limit : not used by the function; kept so existing calls stay valid.

    Returns
    -------
    tuple ``(auc_score, recall, precision, f1)``, each rounded to two decimals.

    Notes
    -----
    Reads the module-level names ``option`` and ``model_with_data_list`` for the printed header,
    so ``option`` must be set before this is called.
    """
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                          drop_intermediate = False )

    # Threshold with the largest gap between true-positive and false-positive rate.
    threshold = thresholds[np.argmax(tpr-fpr)]

    if prob_values:
        # roc_curve computes tpr/fpr for predictions with score >= threshold, so the labels must
        # use >= as well. With a strict > every row scoring exactly the threshold is flipped to
        # negative, which moves the result away from the selected operating point (very visible
        # for models that emit only a few distinct probabilities).
        pred = (np.asarray(probs) >= threshold).astype(int)
    else:
        pred= probs

    auc_score = round( metrics.roc_auc_score( actual, probs) ,2)

    recall    = round( metrics.recall_score(actual,pred),2)
    precision = round(metrics.precision_score(actual, pred),2)
    f1= round(metrics.f1_score(actual,pred),2)
    print ("This Model Result is for ", model_with_data_list[option], " Data")
    print ("ROC AUC Score on Test:",auc_score," Threshold:{:.5f}".format(threshold))

    # Left panel: ROC curve with the chance diagonal.
    plt.figure(figsize=(20, 5))
    plt.subplot(1,2,1)
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]',fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('Receiver operating characteristic example',fontsize=20)
    plt.legend(loc="lower right")

    # Right panel: confusion matrix of the thresholded predictions.
    plt.subplot(1,2,2)
    cm = confusion_matrix(actual, pred)
    draw_confusion_matrix(cm,class_names)
    plt.show()

    return auc_score,recall,precision,f1

# <font color=blue> Section 6: Model Building </font>

### Model 1: Logistic Regression

In [None]:
# Cross validation using different values of C. Let's check which value of C gives best result
def logistic_cv():
    """Grid-search the regularisation strength C of LogisticRegression with ROC AUC scoring.

    Runs only when the module-level flag ``run_cv`` is true; otherwise just the (near-zero)
    duration is printed. Uses the module-level ``X_train``, ``y_train`` and ``folds``.
    Prints the CV results, the duration and the best estimator, and plots the scores over
    log10(C). Returns None.
    """
    start=timer()
    if run_cv:
        hyper_params = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }]

        # GridSearchCV fits clones of the estimator itself, so the separate lr.fit() call the
        # original made beforehand was one extra, unused training run.
        lr = LogisticRegression(max_iter=1000, random_state=100)

        model_cv_logistic = GridSearchCV(estimator=lr, param_grid=hyper_params, \
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
        model_cv_logistic.fit(X_train,  y_train)

        cv_results_lr = pd.DataFrame(model_cv_logistic.cv_results_)
        print(cv_results_lr)

    end = timer()
    print("Duration ",end - start)

    if run_cv:
        draw_cv_results(cv_results_lr, 'param_C', 'ROC AUC', 'Optimal C', True)
        # A bare expression inside a function displays nothing; print it like the other *_cv helpers.
        print(model_cv_logistic.best_estimator_)

In [None]:
# Let's see whether we can manage a good result with fewer variables than X_train provides.
def logistic_with_rfe(n_features=15):
    """Select the most important features with recursive feature elimination (RFE).

    The logistic-regression estimator used for the elimination takes its C from the
    module-level ``option``: 0.1 for options 0 and 1, 10 for option 2, 100 otherwise.
    Uses the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    n_features : number of features to keep (default 15, the original value).

    Returns
    -------
    The entries of ``X_train.columns`` that RFE kept.
    """
    # Value of C & max_iter from earlier steps
    if option==0 or option==1:
        c_value = .1    # for normal / random oversampling
    elif option==2:
        c_value = 10    # for smote
    else:
        c_value = 100   # for adasyn
    lr = LogisticRegression(C=c_value, max_iter=1000, random_state=100)

    # n_features_to_select must be passed by keyword: recent scikit-learn versions reject it
    # as a positional argument.
    rfe = RFE(lr, n_features_to_select=n_features)
    rfe.fit(X_train,y_train)
    useful_cols = X_train.columns[rfe.support_]

    # check the ranking of these variables
    print('Important Variables Identified in RFE')
    print(list(zip(X_train.columns, rfe.support_, rfe.ranking_)))
    return useful_cols

In [None]:
def logistic():
    """Train logistic regression on the RFE-selected features and evaluate it on the test set.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Returns the ``(auc, recall, precision, f1)`` tuple produced by ``draw_roc``.

    NOTE(review): C is fixed at 0.1 here, while ``logistic_with_rfe`` picks C depending on
    ``option`` — confirm whether the final model should follow the same choice.
    """
    selected = logistic_with_rfe()

    lr = LogisticRegression(C=.1,max_iter=1000, random_state=100)
    lr.fit(X_train[selected], y_train)

    # Second column of predict_proba is the score for class 1.
    fraud_prob = lr.predict_proba(X_test[selected])[:, 1]
    scored = pd.DataFrame({'Class':y_test.values, 'Class_Prob':fraud_prob })

    # check ROC_AUC Score on Test
    return draw_roc(scored.Class, scored.Class_Prob, True)

### Model 2: GLM

In [None]:
def glm():
    """Fit a binomial GLM (statsmodels) on the RFE-selected features and evaluate it on the test set.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``. Prints the model
    summary and returns the ``(auc, recall, precision, f1)`` tuple produced by ``draw_roc``.
    """
    selected = logistic_with_rfe()

    # statsmodels does not add an intercept on its own, so add the constant column to both sets.
    design_train = sm.add_constant(X_train[selected])
    design_test = sm.add_constant(X_test[selected])

    fitted = sm.GLM(y_train, design_train, family = sm.families.Binomial()).fit()

    scored = pd.DataFrame({'Class':y_test.values, 'Class_Prob':fitted.predict(design_test) })

    # check ROC_AUC Score on Test
    scores = draw_roc(scored.Class, scored.Class_Prob)

    print (fitted.summary())
    return scores

### Model 3: KNeighborsClassifier

In [None]:
# Cross validation using different values of n_neighbors.
def knn_cv():
    """Grid-search ``n_neighbors`` of KNeighborsClassifier with ROC AUC scoring.

    Runs only when the module-level flag ``run_cv`` is true; otherwise just the (near-zero)
    duration is printed. Uses the module-level ``X_train``, ``y_train`` and ``folds``.
    Prints the ranked CV results, the duration and the best estimator, and plots the scores.
    Returns None.
    """
    start = timer()

    if not run_cv:
        print("Duration ", timer() - start)
        return

    search = GridSearchCV(estimator=KNeighborsClassifier(),
                          param_grid=[{'n_neighbors': range(2,15,2) }],
                          cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    search.fit(X_train, y_train)

    results = pd.DataFrame(search.cv_results_)
    print(results.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    draw_cv_results(results, 'param_n_neighbors', 'ROC AUC', 'Optimal n_neighbours', False)
    print(search.best_estimator_)

In [None]:
# KNeighborsClassifier with the tuned hyperparameters
def knn():
    """Train a 4-nearest-neighbour classifier and evaluate it on the test set.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Returns the ``(auc, recall, precision, f1)`` tuple produced by ``draw_roc``.
    """
    model = KNeighborsClassifier(n_neighbors = 4, leaf_size=30, p=2)
    model.fit(X_train, y_train)

    # Second column of predict_proba is the score for class 1.
    fraud_prob = model.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class':y_test.values, 'Class_Prob':fraud_prob })

    return draw_roc(scored.Class, scored.Class_Prob)


### Model 4: RandomForestClassifier

In [None]:
# Cross validation using different values of n_estimators and max_depth.
def rfc_cv():
    """Grid-search ``n_estimators`` and ``max_depth`` of RandomForestClassifier with ROC AUC scoring.

    Runs only when the module-level flag ``run_cv`` is true; otherwise just the (near-zero)
    duration is printed. Uses the module-level ``X_train``, ``y_train`` and ``folds``.
    Prints the ranked CV results, the duration and the best estimator, and plots test and train
    AUC over ``n_estimators`` with one line per ``max_depth``. Returns None.
    """
    start=timer()
    if run_cv:
        hyper_params=[{'n_estimators':range(4,20,2),'max_depth':range(8,25,2)}]

        # Seed the forest (same seed as rfc()) so repeated searches give the same ranking;
        # the original left it unseeded, making the CV scores change from run to run.
        rfc = RandomForestClassifier(random_state=100)
        model_cv_rfc = GridSearchCV(estimator=rfc, param_grid=hyper_params, \
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
        model_cv_rfc.fit(X_train,  y_train)

        cv_results_rfc = pd.DataFrame(model_cv_rfc.cv_results_)
        print(cv_results_rfc.sort_values('rank_test_score'))

    end = timer()
    print("Duration ",end - start)

    if run_cv:
        # Cast max_depth to float before it is used as the hue variable of the line plots.
        cv_results_rfc.param_max_depth = cv_results_rfc.param_max_depth.astype("float")
        plt.figure(figsize=(20,4))
        plt.subplot(1,2,1)
        ax1=sns.lineplot(x='param_n_estimators',  y='mean_test_score', hue='param_max_depth', data=cv_results_rfc)
        ax1.set_title("Test AUC Score")
        plt.subplot(1,2,2)
        ax2= sns.lineplot(x='param_n_estimators', y='mean_train_score', hue='param_max_depth',data=cv_results_rfc, ci=0)
        ax2.set_title("Train AUC Score")
        plt.show()

        print(model_cv_rfc.best_estimator_)

In [None]:
# RandomForestClassifier with the selected hyperparameters
def rfc():
    """Fit a random forest on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    forest = RandomForestClassifier(n_estimators=16, criterion="gini", max_depth=6, random_state=100)
    forest.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = forest.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'])
    return (auc, recall, precision, f1)

### Model 5: DecisionTreeClassifier

In [None]:
# Grid-search cross validation over max_depth, min_samples_leaf and min_samples_split.
def dtc_cv():
    """Tune a DecisionTreeClassifier with GridSearchCV and plot the AUC curves.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()

    if not run_cv:
        print("Duration ", timer() - start)
        return

    # scikit-learn requires an integer min_samples_split to be >= 2, so the grid
    # starts at 2; a value of 1 makes every fit for that candidate fail.
    hyper_params = [{'max_depth': range(10, 20, 2),
                     'min_samples_leaf': range(1, 5, 1),
                     'min_samples_split': range(2, 5, 1)}]

    dtc = DecisionTreeClassifier(random_state=100)
    model_cv_dtc = GridSearchCV(estimator=dtc, param_grid=hyper_params,
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_dtc.fit(X_train, y_train)

    cv_results_dtc = pd.DataFrame(model_cv_dtc.cv_results_)
    print(cv_results_dtc.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    # max_depth is cast to float before it is used as the hue variable
    cv_results_dtc['param_max_depth'] = cv_results_dtc['param_max_depth'].astype("float")

    plt.figure(figsize=(20, 4))
    # (subplot position, x column, score column, title)
    panels = (
        (1, 'param_min_samples_leaf', 'mean_test_score', "Test AUC Score"),
        (2, 'param_min_samples_leaf', 'mean_train_score', "Train AUC Score"),
        (3, 'param_min_samples_split', 'mean_test_score', "Test AUC Score"),
        (4, 'param_min_samples_split', 'mean_train_score', "Train AUC Score"),
    )
    for position, x_col, score_col, title in panels:
        plt.subplot(1, 4, position)
        axis = sns.lineplot(x=x_col, y=score_col, hue='param_max_depth',
                            data=cv_results_dtc, ci=0)
        axis.set_title(title)
    plt.show()

    print(model_cv_dtc.best_estimator_)

In [None]:
# DecisionTreeClassifier with the selected hyperparameters
def dtc():
    """Fit a decision tree on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The extra ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning
    is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    tree = DecisionTreeClassifier(max_depth=16, min_samples_leaf=1, min_samples_split=2, random_state=100)
    tree.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = tree.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True)
    return (auc, recall, precision, f1)


### Model 6: LGBM

In [None]:
# Grid-search cross validation over n_estimators for LGBMClassifier.
def lgbm_cv():
    """Tune ``n_estimators`` for LightGBM with GridSearchCV and report the outcome.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'n_estimators': range(10, 110, 10)}]

    lgbm_clf = lgbmc.LGBMClassifier(random_state=100)
    model_cv_lgbm = GridSearchCV(estimator=lgbm_clf, param_grid=hyper_params,
                                 cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_lgbm.fit(X_train, y_train)

    cv_results_lgbm = pd.DataFrame(model_cv_lgbm.cv_results_)
    # A bare expression inside a function is never displayed, so print the
    # ranked table explicitly (same as knn_cv / rfc_cv / dtc_cv).
    print(cv_results_lgbm.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    draw_cv_results(cv_results_lgbm, 'param_n_estimators', 'ROC AUC', 'Optimal n_estimator', False)
    print(model_cv_lgbm.best_estimator_)

In [None]:
# LGBMClassifier with the selected hyperparameters
def lgbm():
    """Fit LightGBM on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The extra ``True, .5`` arguments are forwarded to ``draw_roc`` unchanged; their
    meaning is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    booster = lgbmc.LGBMClassifier(n_estimators=100, random_state=42)
    booster.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = booster.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True, .5)
    return (auc, recall, precision, f1)

### Model 7: Perceptron

In [None]:
# Grid-search cross validation over n_iter_no_change for Perceptron.
def perceptron_cv():
    """Tune ``n_iter_no_change`` for a Perceptron with GridSearchCV and report the outcome.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'n_iter_no_change': [5, 6, 7, 8, 9]}]

    percept = Perceptron(random_state=42)
    model_cv_percept = GridSearchCV(estimator=percept, param_grid=hyper_params,
                                    cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_percept.fit(X_train, y_train)

    cv_results_percept = pd.DataFrame(model_cv_percept.cv_results_)
    # A bare expression inside a function is never displayed, so print the
    # ranked table explicitly (same as knn_cv / rfc_cv / dtc_cv).
    print(cv_results_percept.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    # title names the parameter actually being tuned (was a copy-pasted 'n_estimator')
    draw_cv_results(cv_results_percept, 'param_n_iter_no_change', 'ROC AUC',
                    'Optimal n_iter_no_change', False)
    print(model_cv_percept.best_estimator_)

In [None]:
# Perceptron with the selected hyperparameters
def perceptron():
    """Fit a Perceptron on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    ``predict`` yields hard class labels, so the ``Class_Prob`` column holds labels
    rather than probabilities here. The ``False`` argument is forwarded to
    ``draw_roc`` unchanged; its meaning is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    model = Perceptron(alpha=.00001, n_iter_no_change=7, random_state=42, penalty="l2")
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test).reshape(-1)
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': predicted_labels})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], False)
    return (auc, recall, precision, f1)

### Model 8: SVC

In [None]:
# Grid-search cross validation over C for SVC.
# (An earlier, wider grid was: hyper_params=[{'C': range(1,30,1) }])
def svc_cv():
    """Tune the regularisation parameter ``C`` for SVC with GridSearchCV.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'C': range(10, 30, 2)}]

    svmc = SVC(random_state=100)
    model_cv_svm = GridSearchCV(estimator=svmc, param_grid=hyper_params,
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_svm.fit(X_train, y_train)

    cv_results_svm = pd.DataFrame(model_cv_svm.cv_results_)
    # A bare expression inside a function is never displayed, so print the
    # ranked table explicitly (same as knn_cv / rfc_cv / dtc_cv).
    print(cv_results_svm.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    # title names the parameter actually being tuned (was a copy-pasted 'n_estimator')
    draw_cv_results(cv_results_svm, 'param_C', 'ROC AUC', 'Optimal C', False)

    print(model_cv_svm.best_estimator_)

In [None]:
# SVC with the selected hyperparameters
def svc():
    """Fit an SVC with probability estimates and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning is
    defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    svmc = SVC(C=20.0, random_state=100, probability=True).fit(X_train, y_train)

    # Only the probabilities are needed; the former extra svmc.predict(X_test)
    # call was overwritten immediately and just cost an additional pass over X_test.
    y_test_pred_svm = svmc.predict_proba(X_test)[:, 1:2]
    y_test_pred_svm = pd.DataFrame({'Class': y_test.values, 'Class_Prob': y_test_pred_svm.reshape(-1)})

    svm_auc_test, svm_recall_test, svm_precision_test, svm_f1_test = \
                        draw_roc(y_test_pred_svm.Class, y_test_pred_svm.Class_Prob, True)

    return (svm_auc_test, svm_recall_test, svm_precision_test, svm_f1_test)

### Model 9: XGBoost

In [None]:
# Grid-search cross validation over max_depth and n_estimators for the XGBoost classifier.
def xgb_cv():
    """Tune ``max_depth`` x ``n_estimators`` for the ``xgbc`` classifier and plot AUC curves.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'max_depth': range(10, 15, 1), 'n_estimators': range(95, 120, 2)}]

    # `xgbc` is the imported classifier alias; the bare name `xgb` is the notebook's
    # zero-argument model-runner function and cannot be used as an estimator.
    xgb_clf = xgbc(random_state=100)
    model_cv_xgb = GridSearchCV(estimator=xgb_clf, param_grid=hyper_params,
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_xgb.fit(X_train, y_train)

    cv_results_xgb = pd.DataFrame(model_cv_xgb.cv_results_)
    # A bare expression inside a function is never displayed, so print the
    # ranked table explicitly (same as knn_cv / rfc_cv / dtc_cv).
    print(cv_results_xgb.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    # both grid parameters are cast to float before plotting
    cv_results_xgb['param_max_depth'] = cv_results_xgb['param_max_depth'].astype("float")
    cv_results_xgb['param_n_estimators'] = cv_results_xgb['param_n_estimators'].astype("float")

    plt.figure(figsize=(20, 4))
    for position, score_col, title in ((1, 'mean_test_score', "Test AUC Score"),
                                       (2, 'mean_train_score', "Train AUC Score")):
        plt.subplot(1, 2, position)
        axis = sns.lineplot(x='param_n_estimators', y=score_col, hue='param_max_depth',
                            data=cv_results_xgb)
        axis.set_title(title)
    plt.show()

    print(model_cv_xgb.best_estimator_)

In [None]:
# XGBoost classifier (the imported `xgbc` alias) with the selected hyperparameters
def xgb():
    """Fit the ``xgbc`` classifier on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning is
    defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    booster = xgbc(max_depth=10, n_estimators=95, learning_rate=.01, random_state=100)
    booster = booster.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = booster.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True)
    return (auc, recall, precision, f1)

### Model 10: Adaboost

In [None]:
# Grid-search cross validation over learning_rate and n_estimators for AdaBoostClassifier.
def adaboost_cv():
    """Tune ``learning_rate`` x ``n_estimators`` for AdaBoost and plot the AUC curves.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    # was misspelled `sart`, which left `start` undefined (or stale) for the duration print
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'learning_rate': range(1, 5, 1), 'n_estimators': range(40, 71, 10)}]

    adbc = AdaBoostClassifier(random_state=100)
    model_cv_adbc = GridSearchCV(estimator=adbc, param_grid=hyper_params,
                                 cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_adbc.fit(X_train, y_train)

    cv_results_adbc = pd.DataFrame(model_cv_adbc.cv_results_)
    print(cv_results_adbc.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    # both grid parameters are cast to float before plotting
    cv_results_adbc['param_n_estimators'] = cv_results_adbc['param_n_estimators'].astype("float")
    cv_results_adbc['param_learning_rate'] = cv_results_adbc['param_learning_rate'].astype("float")

    plt.figure(figsize=(20, 4))
    for position, score_col, title in ((1, 'mean_test_score', "Test AUC Score"),
                                       (2, 'mean_train_score', "Train AUC Score")):
        plt.subplot(1, 2, position)
        axis = sns.lineplot(x='param_n_estimators', y=score_col, hue='param_learning_rate',
                            data=cv_results_adbc)
        axis.set_title(title)
    plt.show()

    print(model_cv_adbc.best_estimator_)

In [None]:
def adaboost():
    """Fit AdaBoost (selected hyperparameters) and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The extra ``True, .5`` arguments are forwarded to ``draw_roc`` unchanged; their
    meaning is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    booster = AdaBoostClassifier(learning_rate=1, random_state=100)
    booster.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = booster.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True, .5)
    return (auc, recall, precision, f1)

### Model 11: CatboostClassifier

In [None]:
# CatBoostClassifier with the selected hyperparameters
def catboost():
    """Fit CatBoost silently on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The extra ``True, .5`` arguments are forwarded to ``draw_roc`` unchanged; their
    meaning is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    booster = CatBoostClassifier(learning_rate=1, random_state=100)
    booster.fit(X_train, y_train, verbose=False)

    # second predict_proba column = probability of the second class
    second_class_prob = booster.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True, .5)
    return (auc, recall, precision, f1)

### Model 12: Naive Bayes

In [None]:
# Gaussian Naive Bayes (default settings)
def naiveb():
    """Fit GaussianNB on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning is
    defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    bayes = GaussianNB()
    bayes.fit(X_train, y_train)

    # second predict_proba column = probability of the second class in classes_
    second_class_prob = bayes.predict_proba(X_test)[:, 1]
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': second_class_prob})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True)
    return (auc, recall, precision, f1)

### Model 13: Stochastic Gradient Descent Classifier

In [None]:
# Grid-search cross validation over alpha and l1_ratio for SGDClassifier.
def sgdc_cv():
    """Tune ``alpha`` x ``l1_ratio`` for an elastic-net SGDClassifier and plot AUC curves.

    Reads the notebook globals ``run_cv``, ``folds``, ``X_train`` and ``y_train``.
    When ``run_cv`` is falsy nothing is fitted and only the elapsed time is printed.
    """
    start = timer()
    if not run_cv:
        print("Duration ", timer() - start)
        return

    hyper_params = [{'alpha': [10 ** x for x in range(-3, 1)],
                     'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.9, 0.95, 1]}]
    sgd = SGDClassifier(random_state=100, class_weight='balanced',
                        loss='hinge', penalty='elasticnet')

    model_cv_sgd = GridSearchCV(estimator=sgd, param_grid=hyper_params,
                                cv=folds, scoring="roc_auc", return_train_score=True, verbose=True)
    model_cv_sgd.fit(X_train, y_train)

    cv_results_sgd = pd.DataFrame(model_cv_sgd.cv_results_)
    # A bare expression inside a function is never displayed, so print the
    # ranked table explicitly (same as knn_cv / rfc_cv / dtc_cv).
    print(cv_results_sgd.sort_values('rank_test_score'))

    print("Duration ", timer() - start)

    print(model_cv_sgd.best_estimator_)

    # both grid parameters are cast to float before plotting
    cv_results_sgd['param_l1_ratio'] = cv_results_sgd['param_l1_ratio'].astype("float")
    cv_results_sgd['param_alpha'] = cv_results_sgd['param_alpha'].astype("float")

    # alpha spans several orders of magnitude, so it is plotted on a (natural) log scale
    log_alpha = np.log(cv_results_sgd['param_alpha'])

    plt.figure(figsize=(20, 4))
    for position, score_col, title in ((1, 'mean_test_score', "Test AUC Score"),
                                       (2, 'mean_train_score', "Train AUC Score")):
        plt.subplot(1, 2, position)
        axis = sns.lineplot(x=log_alpha, y=score_col, hue='param_l1_ratio', data=cv_results_sgd)
        # set_xlabel must be called; assigning to it replaces the method and sets no label
        axis.set_xlabel("Log Param_Alpha")
        axis.set_title(title)
    plt.show()

In [None]:
# SGDClassifier with the selected hyperparameters
def sgdc():
    """Fit an elastic-net, hinge-loss SGDClassifier and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    ``predict`` yields hard class labels, so the ``Class_Prob`` column holds labels
    rather than probabilities here. The ``False`` argument is forwarded to
    ``draw_roc`` unchanged; its meaning is defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    model = SGDClassifier(
        max_iter=1000,
        alpha=0.0001,
        l1_ratio=0.2,
        random_state=100,
        penalty="elasticnet",
        class_weight='balanced',
        loss='hinge',
    )
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': predicted_labels})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], False)
    return (auc, recall, precision, f1)

### Model 14:  Dense Neural Network

In [None]:
def create_dnn(indput_dim, dropout=0.2):
    """Build an uncompiled dense network: 16 -> 16 -> 1 with dropout after each hidden layer.

    Args:
        indput_dim: number of input features, passed as ``input_dim`` of the first layer.
        dropout: dropout rate applied after each of the two hidden layers.

    Returns:
        The Keras ``Sequential`` model with a single sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(units=16, input_dim=indput_dim, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(units=16, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    return model

def dnn():
    """Train the dense network on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning is
    defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    net = create_dnn(indput_dim=X_train.shape[1], dropout=0.2)
    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    net.fit(X_train, y_train, batch_size=100, epochs=50)

    # the sigmoid output is flattened to a 1-D score vector
    sigmoid_scores = net.predict(X_test).ravel()
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': sigmoid_scores})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True)
    return (auc, recall, precision, f1)


### Model 15:  Convolution Neural Network

In [None]:
def create_cnn(indput_dim, dropout=0.2):
    """Build an uncompiled 1-D CNN: Conv1D -> global max pool -> batch norm -> dense -> sigmoid.

    Args:
        indput_dim: number of features per sample; each sample is expected with
            shape ``(indput_dim, 1)``. Previously this was ignored and the input
            shape was hard-coded to ``(30, 1)``.
        dropout: accepted for signature parity with ``create_dnn``; this
            architecture contains no dropout layer, so the value is unused.

    Returns:
        The Keras ``Sequential`` model with a single sigmoid output unit.
    """
    cnn = Sequential()
    cnn.add(Conv1D(128, kernel_size=5, activation='relu', padding="same",
                   input_shape=(indput_dim, 1)))
    cnn.add(layers.GlobalMaxPool1D())
    cnn.add(BatchNormalization())

    cnn.add(Dense(30, activation='relu'))
    cnn.add(Dense(1, activation='sigmoid'))
    return cnn

def cnn():
    """Train the 1-D CNN on the active train split and score it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``.
    The ``True`` argument is forwarded to ``draw_roc`` unchanged; its meaning is
    defined by that helper.

    Returns:
        tuple: the four values (AUC, recall, precision, F1) produced by ``draw_roc``.
    """
    # add a trailing channel axis: (rows, features) -> (rows, features, 1)
    train_3d = X_train.values[:, :, np.newaxis]
    test_3d = X_test.values[:, :, np.newaxis]

    net = create_cnn(indput_dim=train_3d.shape[1], dropout=0.2)
    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    net.fit(train_3d, y_train, batch_size=5000, epochs=50)

    # the sigmoid output is flattened to a 1-D score vector
    sigmoid_scores = net.predict(test_3d).ravel()
    scored = pd.DataFrame({'Class': y_test.values, 'Class_Prob': sigmoid_scores})

    auc, recall, precision, f1 = draw_roc(scored['Class'], scored['Class_Prob'], True)
    return (auc, recall, precision, f1)

# <font color=blue> Section 7: Run All Models and Save Results </font>

In [None]:
# Keys of the models that have a cross-validation counterpart.
models_cv = [
    'logistic', 'knn', 'rfc', 'dtc', 'lgbm',
    'perceptron', 'svc', 'xgb', 'adaboost', 'sgdc',
]

# Names of the model-runner functions, in execution order.
# models[i] is looked up by name and reported under models_name[i],
# so the two lists must stay index-aligned.
models = [
    'logistic',
    'glm',
    'knn',
    'rfc',
    'dtc',
    'lgbm',
    'perceptron',
    'svc',
    'xgb',
    'adaboost',
    'catboost',
    'naiveb',
    'sgdc',
    'dnn',
    'cnn',
]

# Display names, same order as `models`.
models_name = [
    'Logistic Regression',
    'GLM',
    'KNN',
    'Random Forest',
    'Decision Tree',
    'LGBM',
    'Perceptron',
    'SVC',
    'XGBoost',
    'AdaBoost',
    'CatBoost',
    'Naive Bayes',
    'SGD',
    "DNN",
    "CNN",
]

In [None]:
# Run every model listed in `models` and record its scores
def run_all_models(option):
    """Execute each model-runner function and append its metrics to ``results``.

    Args:
        option: index into the global ``options`` list; its label is stored
            with every result row.

    Each runner is resolved by name through ``globals()`` and must return
    four values (AUC, recall, precision, F1). One row per model is appended to
    the global ``results`` list.
    """
    for position, runner_name in enumerate(models):
        print (f"Running Model {runner_name}")
        auc, recall, precision, f1 = globals()[runner_name]()
        row = [options[option], models_name[position], auc, recall, precision, f1]
        results.append(row)

In [None]:
# Accumulator for one [data variant, model, AUC, recall, precision, F1] row per run;
# run_all_models() appends to it. Re-running this cell resets the collected results.
# (The former module-level `global ...` statement was removed: at top level every
# name is already global, so it had no effect.)
results = []

# Labels of the data variants; an experiment's `option` is an index into this list.
options = ['Normal', 'RandomOverSampler', 'Smote', 'AdaSyn']

In [None]:
# Experiment with option 0 ('Normal' in `options`).
# Index reference: 0-Normal, 1-RandomOverSampler, 2-Smote, 3-AdaSyn
# The four returned frames become the globals that every model function reads.
# Note: run_all_models appends to `results`, so re-running this cell adds duplicate rows.
option=0
X_train, X_test, y_train, y_test = select_dataset(option)
run_all_models(option)

In [None]:
# Experiment with option 1 ('RandomOverSampler' in `options`).
# Index reference: 0-Normal, 1-RandomOverSampler, 2-Smote, 3-AdaSyn
# The four returned frames become the globals that every model function reads.
# Note: run_all_models appends to `results`, so re-running this cell adds duplicate rows.
option=1
X_train, X_test, y_train, y_test = select_dataset(option)
run_all_models(option)

In [None]:
# Experiment with option 2 ('Smote' in `options`).
# Index reference: 0-Normal, 1-RandomOverSampler, 2-Smote, 3-AdaSyn
# The four returned frames become the globals that every model function reads.
# Note: run_all_models appends to `results`, so re-running this cell adds duplicate rows.
option=2
X_train, X_test, y_train, y_test = select_dataset(option)
run_all_models(option)

In [None]:
# Experiment with option 3 ('AdaSyn' in `options`).
# Index reference: 0-Normal, 1-RandomOverSampler, 2-Smote, 3-AdaSyn
# The four returned frames become the globals that every model function reads.
# Note: run_all_models appends to `results`, so re-running this cell adds duplicate rows.
option=3
X_train, X_test, y_train, y_test = select_dataset(option)
run_all_models(option)

In [None]:
# Turn the rows collected by run_all_models (one per data variant and model)
# into a single table and display it.
final_results=pd.DataFrame(results, columns=['Data','Models','AUC','Recall','Precision','F1'])
final_results

# <font color=blue> Section 8: Comparing All the Models and all Data Imbalancing Methods</font>

#### Merge All Models and All Metrics Results Together

In [None]:
# Top 6 rows of final_results ranked by test AUC (highest first).
final_results.sort_values('AUC',ascending=False).head(6)
# Author's recorded observation: the best test AUC with any model is .99.
# AUC: the best models are based on RF, LogR, CatBoost, SVC, GLM.

In [None]:
# Top 6 rows of final_results ranked by test recall (highest first).
final_results.sort_values('Recall',ascending=False).head(6) 

# Author's recorded observation: the best test recall with any model is .94.
# Recall: the best models are based on RF, SGD, Perceptron.

In [None]:
# Top 6 rows of final_results ranked by test precision (highest first).
final_results.sort_values('Precision',ascending=False).head(6)
# Author's recorded observation: the best test precision with any model is .96.
# Precision: the best model is based on KNN.

In [None]:
# Top 6 rows of final_results ranked by test F1 (highest first).
final_results.sort_values('F1',ascending=False).head(6)
# Author's recorded observation: the best test F1 with any model is .93.
# F1: the best models are based on KNN, SGD.

# <font color=blue> Section 9: Final Conclusion </font>

In [None]:
# Summary of the comparison above (author's conclusions):
#   AUC:       the best models are based on RF, LogR, CatBoost, SVC, GLM
#   Recall:    the best models are based on RF, SGD, Perceptron
#   Precision: the best model is based on KNN
#   F1:        the best models are based on KNN, SGD
#   Adaptive Synthetic sampling (ADASYN) is a good oversampling technique
option = 3  # index 3 in `options` -> 'AdaSyn'

# Runner-function names of the short-listed models ...
models_final = [
    'logistic',
    'knn',
    'rfc',
]
# ... and their display names, in the same order.
models_final_name = [
    'Logistic Regression',
    'KNN',
    'Random Forest',
]

### Section 9.1 Now train the model with LGBM algo with full dataset.

In [None]:
#Load full dataset
# `filename` is defined outside this cell.
df1 = pd.read_csv(filename)

# Every column except the target "Class" is scaled and transformed below.
cols2Scale = list(df1.columns)
cols2Scale.remove("Class")

#scale fields
# NOTE(review): the scaler and the power transform are fitted on the whole frame
# before split_data_normal() is called; if that helper performs the train/test
# split, test-set statistics leak into the training features - confirm this is intended.
sc=StandardScaler()
t=sc.fit_transform(df1[ cols2Scale ])
df1[cols2Scale] = t
# Not the last expression of the cell, so this preview is not displayed.
df1.head(5)

#Transform fields
df1[cols2Scale] = power_transform(df1[cols2Scale], method='yeo-johnson')

# X / y are not used again in this cell; the split helper below receives df1 directly.
X= df1.drop(columns=["Class"], axis=0)
y= df1.Class #class variable

#Split the data.
#options = ['0-Normal', '1-RandomOverSampler', '2-Smote', '3-AdaSyn']
X_train1, X_test1, y_train1, y_test1  = split_data_normal(df1)
# Only the AdaSyn variant is rebuilt here; the other two resamplers stay disabled.
#X_train_ros, X_test_ros, y_train_ros, y_test_ros              = split_data_ros(X_train1, X_test1, y_train1, y_test1)
#X_train_smote, X_test_smote, y_train_smote, y_test_smote      = split_data_smote(X_train1, X_test1, y_train1, y_test1)
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn  = split_data_adasyn(X_train1, X_test1, y_train1, y_test1)

# `option` was set to 3 in the previous cell. select_dataset() is defined outside this
# cell; presumably it picks up the *_adasyn globals assigned above - TODO confirm.
X_train, X_test, y_train, y_test = select_dataset(option)


### Section 9.2 Develop Models using Selected Algorithm and Oversampler

In [None]:
# Logistic Regression on the split selected in Section 9.1; logistic() yields (AUC, recall, precision, F1).
auc_test, recall_test, precision_test, f1_test =logistic()
print (f"AUC {auc_test}, Recall {recall_test}, Precision {precision_test}, F1 {f1_test}")

In [None]:
# KNN on the split selected in Section 9.1; knn() yields (AUC, recall, precision, F1).
auc_test, recall_test, precision_test, f1_test =knn()
print (f"AUC {auc_test}, Recall {recall_test}, Precision {precision_test}, F1 {f1_test}")

In [None]:
# Random Forest on the split selected in Section 9.1; rfc() yields (AUC, recall, precision, F1).
auc_test, recall_test, precision_test, f1_test =rfc()
print (f"AUC {auc_test}, Recall {recall_test}, Precision {precision_test}, F1 {f1_test}")

In [None]:
# LGBM on the split selected in Section 9.1; lgbm() yields (AUC, recall, precision, F1).
auc_test, recall_test, precision_test, f1_test =lgbm()
print (f"AUC {auc_test}, Recall {recall_test}, Precision {precision_test}, F1 {f1_test}")

### Print the important features of the best model to understand the dataset
- This will not give much explanation on the already transformed dataset
- But it will help us in understanding if the dataset is not PCA transformed

In [None]:
# Refit the KNN model at top level so that `clf` stays available to later cells.
# NOTE(review): KNeighborsClassifier exposes no feature_importances_, which the
# commented-out cell that follows relies on - confirm which model was intended.
clf = KNeighborsClassifier(n_neighbors=4, leaf_size=30, p=2).fit(X_train, y_train)

# second predict_proba column = probability of the second class in classes_
y_test_pred_knn = pd.DataFrame({
    'Class': y_test.values,
    'Class_Prob': clf.predict_proba(X_test)[:, 1],
})

(knn_auc_test, knn_recall_test,
 knn_precision_test, knn_f1_test) = draw_roc(y_test_pred_knn['Class'], y_test_pred_knn['Class_Prob'])


In [None]:
# var_imp = []
# for i in clf.feature_importances_:
#     var_imp.append(i)

# top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-1])
# second_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-2])
# third_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-3])

# print('Top var =', var_imp.index(np.sort(clf.feature_importances_)[-1])+1, X_train.columns[top_var_index])
# print('2nd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-2])+1, X_train.columns[second_top_var_index])
# print('3rd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-3])+1, X_train.columns[third_top_var_index])

# X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
# X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

# np.random.shuffle(X_train_0)

# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.rcParams['figure.figsize'] = [10, 5]

# plt.scatter(X_train_0[:X_train_1.shape[0], top_var_index], X_train_0[:X_train_1.shape[0], second_top_var_index], 
#             label='Actual Class-0 Examples', alpha=.5)
# plt.scatter( X_train_1[:, top_var_index], X_train_1[:, second_top_var_index], label='Actual Class-1 Examples', alpha=.1)
# plt.xlabel(X_train.columns[top_var_index], fontsize=14)
# plt.ylabel(X_train.columns[second_top_var_index], fontsize=14)
# plt.title("Top 2 Variables Relationship", fontsize=20)
# plt.legend()

# <font color=blue> Section 10: Financial Benefits of the Model </font>

In [None]:
#Test Data: 71079 (normal) + 123 (fraud)
#KNN:      AUC 0.92, Recall 0.82, Precision 0.74, F1 0.78
#Logistic: AUC 0.97, Recall 0.89, Precision 0.03, F1 0.07
#RFC:      AUC 0.98, Recall 0.89, Precision 0.04, F1 0.07
#LGBM:     AUC 0.93, Recall 0.83, Precision 0.09, F1 0.16
    
#Total fraud transactions in 2 days are 384, i.e. 192 fraud transactions per day (average).
#Total normal transactions in 2 days are 284807, i.e. 142,403 normal transactions per day (average).
#The average fraud transaction is $122. In 6 months, fraud transactions worth $122*6*30*192 = 42,16,320, i.e. $4.2 million,
#can happen in the bank.

#Next Six Month Approx Fraud Transactions: 34,560
#Next Six Month Approx Normal Transactions: 25,632,540 (142,403 per day * 180 days)

#Recall score is 92% means 8% is False Negative.
#it means everyday 16 (8% of 192) fraud will be marked as normal transactions. 
#It also means 176 (92% of 192) fraud transactions can be caught using our model.

#i.e. out of 34560 fraud transactions our model can detect 31680 correctly in 6 months.
#And 2880 transactions will be False Negatives. These are fraud transactions which our system fails to detect.

#A precision score of 97% means 3% are False Positives, i.e. for every 100 fraud transactions identified by the system
#3 are normal transactions. This can lead to customer dissatisfaction, because 3% of customers feel that
#their normal transaction was kept on hold.

#If the bank calls the customer for every transaction that is detected as fraud, then only 3% of cases will be
#an irritating call for the customer, because he/she feels it is a genuine transaction.
#(176/.97)=> 181 calls every day=> 181 * 180 (days) =>  32580 calls (6 months)
#If the cost of each call is Rs 10 (assuming a call centre in India),
#then the cost of making these calls is Rs. 3,25,800. It can solve 92% of fraud-related problems.
#For other normal transactions the bank can call customers as per its policy, based on random sampling and calling.
#In this way they will be able to identify some of those 16 transactions which the model couldn't detect.