In [None]:
#importing all the modules
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Importing the DataSet from the relative input path
data_set = pd.read_csv('../input/creditcard/creditcard.csv')
# Preview only the first rows; the full shape is reported in a separate cell
data_set.head()

***Exploratory Analysis***

In [None]:
# Checking the shape of the Dataset: (number of rows, number of columns)
data_set.shape

In [None]:
# Percentage of null/missing values per column, largest first
missing_share = data_set.isnull().sum() / len(data_set)
missing_share.mul(100).round(2).sort_values(ascending=False)

In [None]:
# Summary statistics of the dataset columns (pandas' default describe() output)
data_set.describe()

In [None]:
# Making Histogram for each feature
# All histograms are drawn as subplots of a single 20x20-inch figure
data_set.hist(figsize=(20,20))
plt.show()

In [None]:
# Number of rows for each class label, i.e. {1 : Fraud, 0 : Not Fraud}
data_set['Class'].value_counts()

In [None]:
# Getting a 10% sample from the DataSet; random_state is fixed so the same rows
# are drawn on every run.
# NOTE(review): the original comment broke off at "Since" - presumably the
# sample keeps plotting and model fitting fast; confirm the intended reason.
data = data_set.sample(frac=0.1, random_state = 1)
data.shape

In [None]:
def plot_2d_space(X, y, label='Classes'):
    """Scatter-plot the first two columns of ``X``, one series per class in ``y``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; only columns 0 and 1 are drawn.
    y : array-like of shape (n_samples,)
        Class label of each row of ``X``.
    label : str, default 'Classes'
        Title of the figure.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Accept DataFrames, Series and lists as well as ndarrays: the boolean-mask
    # indexing below only works on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # The first two entries keep the original look for binary problems. The
    # extra entries, together with the modulo below, make sure additional
    # classes are still drawn instead of being silently dropped.
    colors = ['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']
    markers = ['o', 's', '^', 'D', 'v', 'P']

    for idx, cls in enumerate(np.unique(y)):
        mask = y == cls
        plt.scatter(
            X[mask, 0],
            X[mask, 1],
            c=colors[idx % len(colors)],
            label=cls,
            marker=markers[idx % len(markers)],
        )
    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Bar chart of the class frequencies in the sample, drawn on a log-scaled y-axis
ax = sns.countplot(data=data, x='Class')
ax.set_yscale('log')

In [None]:
# Pairwise correlation of every column in the sample.
# Correlation is a signed, continuous quantity, so it is drawn with a diverging
# colormap centred on 0 over the fixed range [-1, 1]; a qualitative palette
# would give neighbouring values unrelated colours.
plt.figure(figsize=(30,10))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0)
plt.show()

In [None]:
# Separate the target column from the feature columns
y = data['Class']
x = data.drop(['Class'], axis=1)

# 2-D view of the (imbalanced) sample using its first two feature columns
plot_2d_space(np.array(x), np.array(y), 'Imbalanced Data Sample')

# Stratified 70/30 split. random_state is fixed so that a fresh
# "Restart & Run All" reproduces the same partition and therefore the same
# downstream metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=1
)

In [None]:
# Resample the training split only; the test split is left as drawn.
# random_state is fixed so the synthetic samples are identical across runs.
stomek = SMOTETomek(sampling_strategy="all", random_state=1)
x_stomek, y_stomek = stomek.fit_resample(x_train, y_train)
x_stomek.shape, y_stomek.shape

In [None]:
# Same 2-D view, now for the resampled training data
plot_2d_space(np.asarray(x_stomek), np.asarray(y_stomek), label='SMOTE + Tomek Links')

***Logistic Regression with Bagging Classifier***

In [None]:
# Bagging ensemble of 50 logistic-regression models fitted on the resampled
# training data and evaluated on the untouched test split.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version before upgrading.
bblr = BalancedBaggingClassifier(
    base_estimator=LogisticRegression(),
    n_estimators=50,
    random_state=0,
    n_jobs=3,
)
bblr.fit(x_stomek, y_stomek)
y_pred_bb = bblr.predict(x_test)
cm = confusion_matrix(y_test, y_pred_bb)

# fmt='d' prints the cell counts as plain integers; seaborn's default number
# format switches large counts to scientific notation.
# Rows of the confusion matrix are the true labels, columns the predictions.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=['No Fraud', 'Fraud'],
    yticklabels=['No Fraud', 'Fraud'],
).set(title='Balanced Bagging - LR', xlabel='Predicted label', ylabel='True label');

In [None]:
# Per-class report, then the overall accuracy as a percentage
lr_report = classification_report(y_test, y_pred_bb)
print(lr_report)
lr_accuracy_pct = accuracy_score(y_test, y_pred_bb) * 100
print('Accuracy ==> ', lr_accuracy_pct)

In [None]:
# Diagnostic rates for the logistic-regression ensemble, derived from `cm`.
# Flattening the 2x2 matrix row by row yields cm[0,0], cm[0,1], cm[1,0], cm[1,1],
# i.e. true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = cm.ravel()

# Each ratio is computed once and reused in the report below.
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
positive_predictive_value = TP / (TP + FP)
negative_predictive_value = TN / (TN + FN)

# The accuracy label now carries the parentheses the formula needs:
# the value was always (TP+TN)/(TP+TN+FP+FN), only the printed text was wrong.
print('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN) =', accuracy, '\n',
      'The Missclassification = 1-Accuracy \t\t=', 1 - accuracy, '\n',
      'Sensitivity or True Positive Rate = TP/(TP+FN) =', sensitivity, '\n',
      'Specificity or True Negative Rate = TN/(TN+FP) =', specificity, '\n',
      'Positive Predictive value = TP/(TP+FP) \t=', positive_predictive_value, '\n',
      'Negative predictive Value = TN/(TN+FN) \t=', negative_predictive_value, '\n',
      'Positive Likelihood Ratio = Sensitivity/(1-Specificity) = ', sensitivity / (1 - specificity), '\n',
      'Negative likelihood Ratio = (1-Sensitivity)/Specificity = ', (1 - sensitivity) / specificity)

***K-Nearest Neighbors with Bagging Classifier***

In [None]:
# Bagging ensemble of 50 three-nearest-neighbour models fitted on the resampled
# training data and evaluated on the untouched test split.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version before upgrading.
bbknn = BalancedBaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=3),
    n_estimators=50,
    random_state=0,
    n_jobs=3,
)
bbknn.fit(x_stomek, y_stomek)
y_pred_bbk = bbknn.predict(x_test)
cm1 = confusion_matrix(y_test, y_pred_bbk)

# fmt='d' prints the cell counts as plain integers; seaborn's default number
# format switches large counts to scientific notation.
# Rows of the confusion matrix are the true labels, columns the predictions.
sns.heatmap(
    cm1,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=['No Fraud', 'Fraud'],
    yticklabels=['No Fraud', 'Fraud'],
).set(title='Balanced Bagging - KNN', xlabel='Predicted label', ylabel='True label');

In [None]:
# Per-class report, then the overall accuracy as a percentage
knn_report = classification_report(y_test, y_pred_bbk)
print(knn_report)
knn_accuracy_pct = accuracy_score(y_test, y_pred_bbk) * 100
print('Accuracy ==> ', knn_accuracy_pct)

In [None]:
# Diagnostic rates for the K-nearest-neighbours ensemble, derived from `cm1`.
# Flattening the 2x2 matrix row by row yields cm1[0,0], cm1[0,1], cm1[1,0], cm1[1,1],
# i.e. true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = cm1.ravel()

# Each ratio is computed once and reused in the report below.
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
positive_predictive_value = TP / (TP + FP)
negative_predictive_value = TN / (TN + FN)

# The accuracy label now carries the parentheses the formula needs:
# the value was always (TP+TN)/(TP+TN+FP+FN), only the printed text was wrong.
print('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN) =', accuracy, '\n',
      'The Missclassification = 1-Accuracy \t\t=', 1 - accuracy, '\n',
      'Sensitivity or True Positive Rate = TP/(TP+FN) =', sensitivity, '\n',
      'Specificity or True Negative Rate = TN/(TN+FP) =', specificity, '\n',
      'Positive Predictive value = TP/(TP+FP) \t=', positive_predictive_value, '\n',
      'Negative predictive Value = TN/(TN+FN) \t=', negative_predictive_value, '\n',
      'Positive Likelihood Ratio = Sensitivity/(1-Specificity) = ', sensitivity / (1 - specificity), '\n',
      'Negative likelihood Ratio = (1-Sensitivity)/Specificity = ', (1 - sensitivity) / specificity)

***Random Forest Algorithm***

In [None]:
# Plain random forest (100 trees) fitted on the resampled training data and
# evaluated on the untouched test split. random_state is fixed so the forest,
# and with it the reported metrics, is reproducible across runs.
rfc_model = RandomForestClassifier(n_estimators=100, random_state=0)
rfc_model.fit(x_stomek, y_stomek)
y_pred_rfc = rfc_model.predict(x_test)
cm2 = confusion_matrix(y_test, y_pred_rfc)

# fmt='d' prints the cell counts as plain integers; seaborn's default number
# format switches large counts to scientific notation.
# Rows of the confusion matrix are the true labels, columns the predictions.
# The title names the model actually used here (no Balanced Bagging wrapper).
sns.heatmap(
    cm2,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=['No Fraud', 'Fraud'],
    yticklabels=['No Fraud', 'Fraud'],
).set(title='Random Forest', xlabel='Predicted label', ylabel='True label');

In [None]:
# Per-class report, then the overall accuracy as a percentage
rfc_report = classification_report(y_test, y_pred_rfc)
print(rfc_report)
rfc_accuracy_pct = accuracy_score(y_test, y_pred_rfc) * 100
print('Accuracy ==> ', rfc_accuracy_pct)

In [None]:
# Diagnostic rates for the random-forest model, derived from `cm2`.
# Flattening the 2x2 matrix row by row yields cm2[0,0], cm2[0,1], cm2[1,0], cm2[1,1],
# i.e. true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = cm2.ravel()

# Each ratio is computed once and reused in the report below.
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
positive_predictive_value = TP / (TP + FP)
negative_predictive_value = TN / (TN + FN)

# The accuracy label now carries the parentheses the formula needs:
# the value was always (TP+TN)/(TP+TN+FP+FN), only the printed text was wrong.
print('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN) =', accuracy, '\n',
      'The Missclassification = 1-Accuracy \t\t=', 1 - accuracy, '\n',
      'Sensitivity or True Positive Rate = TP/(TP+FN) =', sensitivity, '\n',
      'Specificity or True Negative Rate = TN/(TN+FP) =', specificity, '\n',
      'Positive Predictive value = TP/(TP+FP) \t=', positive_predictive_value, '\n',
      'Negative predictive Value = TN/(TN+FN) \t=', negative_predictive_value, '\n',
      'Positive Likelihood Ratio = Sensitivity/(1-Specificity) = ', sensitivity / (1 - specificity), '\n',
      'Negative likelihood Ratio = (1-Sensitivity)/Specificity = ', (1 - sensitivity) / specificity)

**Final Results**
> Accuracy (fraction of correctly classified test transactions)
* Logistic Regression ==> 0.9852545348156817
* K-Nearest Neighbors ==> 0.8482153306026916
* Random Forest ==> 0.9992978349912229

> Best algorithm for the Credit Card Fraud Detection System, judged by accuracy ==> Random Forest
