#### This notebook explores the credit card fraud dataset available on Kaggle:
https://www.kaggle.com/mlg-ulb/creditcardfraud
    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from itertools import product

In [None]:
# Load the credit card transactions CSV (expected in the working directory)
# into a DataFrame.
fraud = pd.read_csv('creditcard.csv')

In [None]:
# (rows, columns) of the loaded data.
fraud.shape

In [None]:
# Preview the first rows.
fraud.head()

In [None]:
# Share of ALL transactions labelled as fraud (Class == 1), as a percentage.
# Dividing the fraud count by the normal count would give the odds, not the
# percentage of the dataset that the printed label promises.
print('percent fraud in dataset', round((fraud.Class == 1).mean() * 100, 2), 'percent')

In [None]:
# Names of the columns at positions 2 and 3. `.ix` was removed from pandas,
# so slice the column index positionally instead.
v_features = fraud.columns[2:4]

In [None]:
# Overlay the fraud and normal distributions of every selected feature,
# one subplot per feature. The grid and figure height follow the number of
# selected features rather than a fixed 28 rows, which left the figure mostly
# empty when only a few features are selected.
n_features = len(v_features)
plt.figure(figsize=(12, n_features * 4))
gs = gridspec.GridSpec(n_features, 1)
for i, cn in enumerate(v_features):
    ax = plt.subplot(gs[i])
    # NOTE: sns.distplot is deprecated in recent seaborn releases; kept here
    # so the notebook still runs on the seaborn version it was written for.
    sns.distplot(fraud[cn][fraud.Class == 1], bins=50, label='fraud', ax=ax)
    sns.distplot(fraud[cn][fraud.Class == 0], bins=50, label='normal', ax=ax)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
    # Without a legend the two overlaid distributions cannot be told apart.
    ax.legend()
plt.show()

In [None]:
# Split the transactions by label: Class == 1 rows are fraud, Class == 0 rows
# are normal.
fraud_samples = fraud.loc[fraud['Class'] == 1]
normal = fraud.loc[fraud['Class'] == 0]

In [None]:
# Training sample: 3% of each class, drawn separately with a fixed seed and
# then stacked row-wise (fraud rows first).
sample = pd.concat(
    [
        fraud_samples.sample(frac=0.03, random_state=0),
        normal.sample(frac=0.03, random_state=0),
    ],
    axis=0,
)

In [None]:
# Rows of each class whose index was NOT drawn into `sample`; the test set is
# drawn from these so it cannot overlap the training sample.
rest_fraud = fraud_samples.loc[~fraud_samples.index.isin(sample.index)]
rest_normal = normal.loc[~normal.index.isin(sample.index)]

In [None]:
# Test sample: 3% of the remaining rows of each class, drawn with a fixed
# seed and stacked row-wise (fraud rows first).
test = pd.concat(
    [
        rest_fraud.sample(frac=0.03, random_state=0),
        rest_normal.sample(frac=0.03, random_state=0),
    ],
    axis=0,
)

In [None]:
# Size of the training sample and the share of it labelled as fraud.
# The share is fraud / all rows; fraud / normal would be the odds instead.
print('total number of samples:', len(sample),
      '\npercent fraud in dataset', round((sample.Class == 1).mean() * 100, 2), 'percent')

In [None]:
# Size of the test sample and the share of it labelled as fraud.
# The share is fraud / all rows; fraud / normal would be the odds instead.
print('total number of samples:', len(test),
      '\npercent fraud in dataset', round((test.Class == 1).mean() * 100, 2), 'percent')

In [None]:
y = sample.pop('Class')
X = sample

In [None]:
X = X[['V2', 'V3']]

In [None]:
y_test = test.pop('Class')
X_test = test

In [None]:
X_test = X_test[['V2', 'V3']]

## not a very intelligent classifier

In [None]:
from sklearn.svm import SVC

In [None]:
clf = SVC(kernel='rbf', probability=True)

In [None]:
print('fitting SVM..')
clf.fit(X, y)

In [None]:
# Plotting decision regions
# Bounding box of the two training features, padded by 1 on every side.
coords = X.values
x_min, y_min = coords.min(axis=0) - 1
x_max, y_max = coords.max(axis=0) + 1

# Regular grid over the box with a spacing of 0.1 along both axes.
grid_x = np.arange(x_min, x_max, 0.1)
grid_y = np.arange(y_min, y_max, 0.1)
xx, yy = np.meshgrid(grid_x, grid_y)

# Classify every grid point and fold the labels back into the grid's shape.
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = clf.predict(grid_points).reshape(xx.shape)

# Filled contours show the predicted class per region; the training points
# are drawn on top, coloured by their true label.
f, ax = plt.subplots(figsize=(8, 8))
ax.contourf(xx, yy, Z, alpha=0.4)
ax.scatter(coords[:, 0], coords[:, 1], c=y, s=20, edgecolor='k')
ax.set_title('Kernel SVM')

plt.show()

## Look at the confusion matrix and ROC curve

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, classification_report
from sklearn.metrics import auc

In [None]:
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))

In [None]:
plt.figure()
sns.heatmap(conf_matrix, annot=True)

In [None]:
# NOTE(review): presumably the accuracy of always predicting "normal", taking
# 0.0017 as the fraud share of the data - confirm where this constant comes from.
1-0.0017

In [None]:
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))
print('recall:', recall_score(y_test, clf.predict(X_test)))
print('precision:', precision_score(y_test, clf.predict(X_test)))

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
# Compute fpr, tpr, thresholds and roc auc
# Score each test row with the second predict_proba column (index 1).
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test.values.ravel(), positive_scores)
roc_auc = auc(fpr, tpr)

In [None]:
# Plot ROC
# Classifier curve (labelled with its AUC) and the dashed chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc, alpha=0.5)
plt.plot([0, 1], [0, 1], 'r--')

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Small margin below zero so the curve's starting point stays visible.
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])

plt.legend(loc='lower right')
plt.show()

## Resampling using random oversampling and SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [None]:
ros = RandomOverSampler(random_state=0)
sm = SMOTE(k_neighbors=1, ratio='minority')

In [None]:
kind = [ros, sm]

In [None]:
def plot_decision_boundaries(clf, X_sampled, step=0.1):
    """Evaluate a fitted classifier on a regular grid covering the data.

    The grid spans the first two columns of ``X_sampled``, padded by 1 on
    every side, so the result can be passed straight to ``contourf``.

    Parameters
    ----------
    clf : object
        Fitted estimator with a ``predict`` method that accepts an
        ``(n_points, 2)`` array.
    X_sampled : array-like with at least two columns
        The data whose extent defines the grid. Anything ``np.asarray`` can
        convert is accepted, including a pandas DataFrame.
    step : float, default 0.1
        Grid spacing along both axes; must be positive.

    Returns
    -------
    xx, yy, Z : numpy.ndarray
        Mesh-grid coordinates and the predicted label at each grid point,
        all of the same 2-D shape.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')

    # Positional slicing below needs a plain array; a DataFrame would reject
    # `[:, 0]`.
    points = np.asarray(X_sampled)

    # Plotting decision regions
    x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
    y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Predict on the flattened grid, then restore the grid shape.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return xx, yy, Z

In [None]:
# One row of axes per sampler: decision regions over the original training
# points on the left, the resampled training points on the right.
fig, axs = plt.subplots(len(kind), 2, figsize=(12, 6 * len(kind)), squeeze=False)
fig.subplots_adjust(hspace = .5, wspace=.001)

for (ax_regions, ax_resampled), each in zip(axs, kind):
    print(each)
    # fit_resample is the current name of fit_sample, which newer
    # imbalanced-learn releases no longer provide.
    X_sampled, y_sampled = each.fit_resample(X, y)
    # The resampled features may come back as a DataFrame when the input is
    # one; the positional slicing below needs a plain array.
    X_sampled = np.asarray(X_sampled)

    print('fitting SVM..')
    # NOTE: clf is re-fitted in place, so after the loop it holds the model
    # trained on the output of the LAST sampler in `kind`.
    clf.fit(X_sampled, y_sampled)

    xx, yy, Z = plot_decision_boundaries(clf, X_sampled)
    ax_regions.contourf(xx, yy, Z, alpha=0.4)
    ax_regions.scatter(X.values[:, 0], X.values[:, 1], c=y, s=20, edgecolor='k', alpha=0.2)
    ax_regions.set_title('Kernel SVM ')

    ax_resampled.scatter(X_sampled[:, 0], X_sampled[:, 1], c=y_sampled, s=20, edgecolor='k', alpha=0.2)
    ax_resampled.set_title('Kernel SVM ')

In [None]:
conf_matrix = confusion_matrix(y_test, clf.predict(X_test))

In [None]:
plt.figure()
sns.heatmap(conf_matrix, annot=True)

In [None]:
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))
print('recall:', recall_score(y_test, clf.predict(X_test)))
print('precision:', precision_score(y_test, clf.predict(X_test)))

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
# Compute fpr, tpr, thresholds and roc auc
# Score each test row with the second predict_proba column (index 1).
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test.values.ravel(), positive_scores)
roc_auc = auc(fpr, tpr)

In [None]:
# Plot ROC
# Classifier curve (labelled with its AUC) and the dashed chance diagonal.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc, alpha=0.5)
plt.plot([0, 1], [0, 1], 'r--')

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Small margins around the unit square so the curve's end points stay visible.
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.1])

plt.legend(loc='lower right')
plt.show()