## Credit Card Fraud Detection

In this notebook we will learn
- Treating Imbalanced Dataset using SMOTE, ADASYN
- 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import warnings
from imblearn.over_sampling import SMOTE, ADASYN
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

## Treating Imbalance in the data

In [3]:
# Download dataset from https://drive.google.com/file/d/1aOrE9cD1qo6SfamSWrYJJBuLhvM_C8-Y/view?usp=sharing
df = pd.read_csv("data/creditcard.csv")

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
y = df.pop('Class')
X = df
y.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [7]:
X_train,X_test,y_train,y_test =train_test_split(X,y, stratify=y, random_state=42)
print(np.sum(y))
print(np.sum(y_train))
print(np.sum(y_test))

492
369
123


## Training with Logistic Regression

In [8]:
num_cv_splits = 3
num_C = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
auc = np.zeros((len(num_C), num_cv_splits))
skf = StratifiedKFold(n_splits=num_cv_splits, random_state=0, shuffle=True)
cv_num=0
for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]
    for c_id, c_val in enumerate(num_C):
        lr = LogisticRegression(C=c_val, random_state=0, solver='lbfgs', max_iter=1000, n_jobs=-1)
        lr.fit(X_train_cv, y_train_cv)
        auc[c_id, cv_num] = metrics.roc_auc_score(y_true = y_test_cv, y_score = lr.predict_proba(X_test_cv)[:,1])
        print('C_val =', c_val, '; auc =', auc[c_id, cv_num], '; cv_num =', cv_num)
    cv_num += 1
mean_auc = np.mean(auc, axis=1)
print('Best C:', num_C[np.argmax(mean_auc)])
print('Best auc corresponding to Best C:', mean_auc[np.argmax(mean_auc)])



C_val = 0.001 ; auc = 0.9812518236607681 ; cv_num = 0
C_val = 0.01 ; auc = 0.9635138595930762 ; cv_num = 0
C_val = 0.1 ; auc = 0.9627261182078752 ; cv_num = 0
C_val = 1.0 ; auc = 0.9574895310004887 ; cv_num = 0
C_val = 10.0 ; auc = 0.9621422036193097 ; cv_num = 0
C_val = 100.0 ; auc = 0.9616811341371337 ; cv_num = 0
C_val = 1000.0 ; auc = 0.962404250303424 ; cv_num = 0
C_val = 0.001 ; auc = 0.9853903540512635 ; cv_num = 1
C_val = 0.01 ; auc = 0.9339944321656528 ; cv_num = 1
C_val = 0.1 ; auc = 0.93778398637403 ; cv_num = 1
C_val = 1.0 ; auc = 0.9395143409079809 ; cv_num = 1
C_val = 10.0 ; auc = 0.9330388939731208 ; cv_num = 1
C_val = 100.0 ; auc = 0.9401262788215607 ; cv_num = 1
C_val = 1000.0 ; auc = 0.9524119332697146 ; cv_num = 1
C_val = 0.001 ; auc = 0.914452163739961 ; cv_num = 2
C_val = 0.01 ; auc = 0.9367049413480713 ; cv_num = 2
C_val = 0.1 ; auc = 0.9403779930761968 ; cv_num = 2
C_val = 1.0 ; auc = 0.9407851948746562 ; cv_num = 2
C_val = 10.0 ; auc = 0.9408927144506538 ; cv_nu

## Training with XGBoost

In [11]:
tree_ranges = range(2,50,5)
score1 = []
score2 = []
auc = np.zeros((len(tree_ranges), num_cv_splits))
cv_num = 0
for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]
    X_train_cv, y_train_cv = ADASYN().fit_resample(X_train_cv, y_train_cv)
    for tree_id, tree in enumerate(tree_ranges):
        classifier = XGBClassifier(n_estimators=tree)
        classifier.fit(X_train_cv, y_train_cv)
        auc[tree_id, cv_num] = metrics.roc_auc_score(y_true = y_test_cv, y_score = classifier.predict_proba(X_test_cv)[:,1])
        print('num_trees =', tree, '; auc =', auc[tree_id, cv_num], '; cv_num =', cv_num)
    cv_num += 1
mean_auc = np.mean(auc, axis=1)
print('Best num_trees:', tree_ranges[np.argmax(mean_auc)])
print('Best auc corresponding to Best num_trees :', mean_auc[np.argmax(mean_auc)])

num_trees = 2 ; auc = 0.9557044451970708 ; cv_num = 0
num_trees = 7 ; auc = 0.9659545196304535 ; cv_num = 0
num_trees = 12 ; auc = 0.9672911750431817 ; cv_num = 0
num_trees = 17 ; auc = 0.9642214199544603 ; cv_num = 0
num_trees = 22 ; auc = 0.965666108144642 ; cv_num = 0
num_trees = 27 ; auc = 0.967267097859853 ; cv_num = 0
num_trees = 32 ; auc = 0.9741979524214268 ; cv_num = 0
num_trees = 37 ; auc = 0.9764016723862845 ; cv_num = 0
num_trees = 42 ; auc = 0.9765484802950843 ; cv_num = 0
num_trees = 47 ; auc = 0.978283295684854 ; cv_num = 0
num_trees = 2 ; auc = 0.9317083007490693 ; cv_num = 1
num_trees = 7 ; auc = 0.9477318664209307 ; cv_num = 1
num_trees = 12 ; auc = 0.956512603576211 ; cv_num = 1
num_trees = 17 ; auc = 0.9670069956513518 ; cv_num = 1
num_trees = 22 ; auc = 0.9703886675046214 ; cv_num = 1
num_trees = 27 ; auc = 0.9725778039023795 ; cv_num = 1
num_trees = 32 ; auc = 0.9776206870244113 ; cv_num = 1
num_trees = 37 ; auc = 0.9755509643054899 ; cv_num = 1
num_trees = 42 ; a

In [12]:
xgb = XGBClassifier(n_estimators=47)
X_train_smote, y_train_smote = SMOTE(random_state=0).fit_resample(X_train, y_train)
xgb.fit(X_train_smote, y_train_smote)
xgb.score(X_test, y_test)
metrics.roc_auc_score(y_true = y_test, y_score = classifier.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = metrics.roc_curve(y_train_smote, classifier.predict_proba(X_train_smote)[:,1])
threshold = thresholds[np.argmax(tpr-fpr)]
print(threshold)

0.020382658
