# Credit Card Fraud Detection using Machine Learning

### Import libraries

In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import keras
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Dropout, Flatten, Activation, Input
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import matplotlib as mpl

# Every figure produced below is drawn on a 10x10 inch canvas.
mpl.rcParams.update({'figure.figsize': (10, 10)})

# Colours of the active matplotlib property cycle; the plotting cells index
# into this list so each model keeps the same colour in every figure.
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

### Import dataset (https://www.kaggle.com/mlg-ulb/creditcardfraud)

In [2]:
# Read the raw CSV, discard rows containing missing values, then remove the
# 'Time' column so it is not used as a model feature.
df = (
    pd.read_csv('../creditcard.csv')
    .dropna()
    .drop(columns='Time')
)

### Investigate Class Sizes

In [3]:
# Partition the rows by label (1 = fraud, 0 = not fraud) and report what
# percentage of the whole dataset each label represents.
groups = df.groupby('Class')
total_rows = len(df)

fraud = (len(groups.get_group(1)) / total_rows) * 100
non_fraud = (len(groups.get_group(0)) / total_rows) * 100

print(f'Percent Fraud: {fraud}%')
print(f'Percent Not Fraud {non_fraud}%')

Percent Fraud: 0.1727485630620034%
Percent Not Fraud 99.82725143693798%


### Split data into a train and holdout set

In [4]:
# Positional 70/30 split: the leading rows (in file order) become the training
# set and the trailing rows the holdout set. No shuffling is performed.
df_size = len(df)
test_size = int(df_size * .3)
train_size = df_size - test_size

train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

# Separate the 'Class' label from the feature columns in each partition.
X_train, Y_train = train_df.drop(columns='Class'), train_df['Class']
X_test, Y_test = test_df.drop(columns='Class'), test_df['Class']

### Apply a standard scaler to our data

In [None]:
# Standardise each feature column independently. The scaler is fitted on the
# training column only and then reused for the holdout column, so no holdout
# statistics leak into the transformation.
for feat in X_train.columns:
    ss = StandardScaler()
    train_column = X_train[feat].to_numpy().reshape(-1, 1)
    test_column = X_test[feat].to_numpy().reshape(-1, 1)
    ss.fit(train_column)
    X_train[feat] = ss.transform(train_column)
    X_test[feat] = ss.transform(test_column)

### Fit Random Forest Classifier

In [None]:
# Train a random forest with library-default hyperparameters.
rf = RandomForestClassifier().fit(X_train, Y_train)

# predict_proba yields one column per class; column 1 is kept as the
# fraud score used for the ROC analysis.
probabilities = rf.predict_proba(X_test)
y_pred_rf = probabilities[:, 1]

### Evaluate Performance

In [None]:
# ROC curve and area under it for the random forest scores on the holdout set.
fpr_rf, tpr_rf, thresholds_rf = roc_curve(Y_test, y_pred_rf)
auc_rf = auc(fpr_rf, tpr_rf)

# Draw the curve in percent units, zoomed to FPR 0-30% and TPR 60-100%.
ax = plt.gca()
ax.plot(
    100*fpr_rf,
    100*tpr_rf,
    label='Random Forest (area = {:.3f})'.format(auc_rf),
    linewidth=2,
    color=colors[0],
)
ax.set_xlabel('False positives [%]')
ax.set_ylabel('True positives [%]')
ax.set_xlim([0, 30])
ax.set_ylim([60, 100])
ax.grid(True)
ax.set_aspect('equal')
ax.set_title('Random Forest Model Performance')
ax.legend(loc='best')

### Fit CatBoost Classifier

In [None]:
# Train a CatBoost classifier with library-default hyperparameters.
cat = CatBoostClassifier().fit(X_train, Y_train)

# Request the raw (untransformed) model output as the ranking score; the ROC
# analysis only depends on the ordering of the scores.
y_pred_cat = cat.predict(X_test, prediction_type='RawFormulaVal')

### Evaluate Performance

In [None]:
# ROC curve and area under it for the CatBoost scores on the holdout set.
fpr_cat, tpr_cat, thresholds_cat = roc_curve(Y_test, y_pred_cat)
auc_cat = auc(fpr_cat, tpr_cat)

# Draw the curve in percent units, zoomed to FPR 0-30% and TPR 60-100%.
ax = plt.gca()
ax.plot(
    100*fpr_cat,
    100*tpr_cat,
    label='CatBoost (area = {:.3f})'.format(auc_cat),
    linewidth=2,
    color=colors[1],
)
ax.set_xlabel('False positives [%]')
ax.set_ylabel('True positives [%]')
ax.set_xlim([0, 30])
ax.set_ylim([60, 100])
ax.grid(True)
ax.set_aspect('equal')
ax.set_title('CatBoost Model Performance')
ax.legend(loc='best')

### Design and fit Deep Neural Network

In [None]:
# Design and compile the model: a fully connected binary classifier with one
# input per feature column, two 100-unit ReLU layers (each followed by 50%
# dropout), a 10-unit ReLU layer and a single sigmoid output unit.
DNN = Sequential()
DNN.add(Input(shape=(X_train.shape[1],)))
DNN.add(Dense(100, activation='relu'))
DNN.add(Dropout(0.5))
DNN.add(Dense(100, activation='relu'))
DNN.add(Dropout(0.5))
DNN.add(Dense(10, activation='relu'))
DNN.add(Dense(1, activation='sigmoid'))

# `metrics` must be a list: Model.compile documents it as a list of metrics,
# and newer Keras releases reject a bare metric object.
DNN.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[keras.metrics.AUC(name='auc')],
)

# Fit the model for 10 epochs on the training data.
DNN.fit(X_train, Y_train, epochs=10)

# Generate prediction probabilities on the test data; ravel() flattens the
# model output into the 1-D score vector that roc_curve is given later.
y_pred_DNN = DNN.predict(X_test).ravel()

### Evaluate Performance

In [None]:
# ROC curve and area under it for the neural network scores on the holdout set.
fpr_DNN, tpr_DNN, thresholds_DNN = roc_curve(Y_test, y_pred_DNN)
auc_DNN = auc(fpr_DNN, tpr_DNN)

# Draw the curve in percent units, zoomed to FPR 0-30% and TPR 60-100%.
ax = plt.gca()
ax.plot(
    100*fpr_DNN,
    100*tpr_DNN,
    label='DNN (area = {:.3f})'.format(auc_DNN),
    linewidth=2,
    color=colors[2],
)
ax.set_xlabel('False positives [%]')
ax.set_ylabel('True positives [%]')
ax.set_xlim([0, 30])
ax.set_ylim([60, 100])
ax.grid(True)
ax.set_aspect('equal')
ax.set_title('Deep Neural Network Model Performance')
ax.legend(loc='best')

### Fit Isolation Forest

In [None]:
# Fit an isolation forest on the training features only; the labels are not
# passed to the model.
iforest = IsolationForest().fit(X_train)

# The decision function is negated before being used as the fraud score.
# NOTE(review): presumably because decision_function assigns lower values to
# more anomalous samples -- confirm against the scikit-learn documentation.
y_pred_iforest = -iforest.decision_function(X_test)

### Evaluate Performance

In [None]:
# ROC curve and area under it for the isolation forest scores on the holdout set.
fpr_iforest, tpr_iforest, thresholds__iforest = roc_curve(Y_test, y_pred_iforest)
auc_iforest = auc(fpr_iforest, tpr_iforest)

# Draw the curve in percent units, zoomed to FPR 0-30% and TPR 60-100%.
ax = plt.gca()
ax.plot(
    100*fpr_iforest,
    100*tpr_iforest,
    label='iForest (area = {:.3f})'.format(auc_iforest),
    linewidth=2,
    color=colors[3],
)
ax.set_xlabel('False positives [%]')
ax.set_ylabel('True positives [%]')
ax.set_xlim([0, 30])
ax.set_ylim([60, 100])
ax.grid(True)
ax.set_aspect('equal')
ax.set_title('Isolation Forest Model Performance')
ax.legend(loc='best')

### Compare performance across all models

In [None]:
# Overlay the four ROC curves computed in the earlier cells on a single axes,
# in the same order and colours as the individual plots.
ax = plt.gca()
roc_results = [
    (fpr_rf, tpr_rf, 'Random Forest (area = {:.3f})'.format(auc_rf), colors[0]),
    (fpr_cat, tpr_cat, 'CatBoost (area = {:.3f})'.format(auc_cat), colors[1]),
    (fpr_DNN, tpr_DNN, 'DNN (area = {:.3f})'.format(auc_DNN), colors[2]),
    (fpr_iforest, tpr_iforest, 'iForest (area = {:.3f})'.format(auc_iforest), colors[3]),
]
for curve_fpr, curve_tpr, curve_label, curve_color in roc_results:
    ax.plot(100*curve_fpr, 100*curve_tpr, label=curve_label, linewidth=2, color=curve_color)

# Percent units, zoomed to FPR 0-30% and TPR 60-100%.
ax.set_xlabel('False positives [%]')
ax.set_ylabel('True positives [%]')
ax.set_xlim([0, 30])
ax.set_ylim([60, 100])
ax.grid(True)
ax.set_aspect('equal')
ax.set_title('Model Comparison')
ax.legend(loc='best')