## Autoencoder

In [None]:
import pandas as pd 
import numpy as np 
import pickle 
import matplotlib.pyplot as plt 
import tensorflow as tf 

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report, f1_score, precision_recall_fscore_support

In [None]:
%matplotlib inline 
sns.set(style = 'darkgrid', palette = 'pastel', font_scale = 1.5)
LABELS = ['Non-Fraud', 'Fraud']

### Data

In [None]:
# Load the transactions and standardize the Amount column (zero mean, unit variance).
data = pd.read_csv('data_creditCardFraud.csv')

# StandardScaler expects a 2-D array, hence the reshape to a single column.
amount_column = data['Amount'].values.reshape(-1, 1)
data['Amount'] = StandardScaler().fit_transform(amount_column)
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split made in the next cell, so test rows influence
# the scaling statistics.

data.head()

In [None]:
# Split the data into train, validation and test sets.
# The Time column is dropped together with the label, so it is never used as a
# model input.
y = data['Class']
X = data.drop(columns=['Class', 'Time'])

# Step 1: hold out 20% of all rows as the test set.
# Step 2: hold out 20% of the remaining rows as the validation set.
# NOTE(review): neither split passes `stratify`, so class proportions may
# differ slightly between the three subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=777
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=777
)

### Model

The autoencoder uses 4 fully connected layers:

- 14 neurons (encoder)
- 7 neurons (encoder)
- 7 neurons (decoder)
- 29 neurons (decoder)

L1 activity regularization is applied to the first encoder layer.

**Training the model**

- number of epochs = 100
- batch_size = 2048

In [None]:
# Build the baseline autoencoder: input -> 14 -> 7 -> 7 -> input_dim.
# Each intermediate tensor is printed right after it is created.
input_dim = X_train.shape[1]  # number of feature columns (29 according to the notebook text)
encoding_dim = 14
input_layer = Input(shape=(input_dim,))

# Encoder, layer 1: tanh units with an L1 penalty on the activations.
# 1e-4 is the same float as the literal 10e-5.
encoder = Dense(
    units=encoding_dim,
    activation='tanh',
    activity_regularizer=regularizers.l1(1e-4),
)(input_layer)
print(encoder)

# Encoder, layer 2 (bottleneck): half of encoding_dim.
encoder = Dense(units=encoding_dim // 2, activation='relu')(encoder)
print(encoder)

# Decoder, layer 1: same width as the bottleneck.
decoder = Dense(units=encoding_dim // 2, activation='tanh')(encoder)
print(decoder)

# Decoder, layer 2: back to the input width.
# NOTE(review): a ReLU output can never be negative, but the standardized
# Amount column contains negative values, so those can never be reconstructed
# exactly. Consider a linear output activation -- and change all five models
# together so they stay comparable.
decoder = Dense(units=input_dim, activation='relu')(decoder)
print(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
print(autoencoder)

In [None]:
# Training configuration for the baseline autoencoder.
nb_epochs = 100
batch_size = 2048

# Reconstruction is optimized with Adam on mean squared error.
# NOTE(review): 'accuracy' is a classification metric; for a reconstruction
# target it is unlikely to be meaningful -- consider dropping it.
autoencoder.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Checkpoint callback: writes the model to disk only when it improves.
checkpointer = ModelCheckpoint(
    filepath='autoencoder_creditCardFraud.h5',
    verbose=0,
    save_best_only=True,
)

# TensorBoard callback: logs the graph and images to ./logs, no histograms.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train the autoencoder to reproduce its own input: X_train is both the input
# and the target, and X_val plays the same double role for validation.
# NOTE(review): y_train is not used to filter X_train, so any fraud rows in the
# training split are learned as well.
training_run = autoencoder.fit(
    x=X_train,
    y=X_train,
    epochs=nb_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val, X_val),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the `.history` attribute of the object returned by fit().
history = training_run.history

In [None]:
# Print the layer-by-layer summary of the baseline autoencoder.
autoencoder.summary()

In [None]:
# Persist the baseline model as it is after the fit() call above. This is a
# separate file from the one written by the ModelCheckpoint callback.
autoencoder.save('creditCardFraud_autoencoder_originalData.h5')

In [None]:
# Reconstruct the held-out test features with the baseline autoencoder.
predictions = autoencoder.predict(X_test)

In [None]:
# Per-row reconstruction error: mean of the squared differences across features.
squared_diff = np.power(X_test - predictions, 2)
mse = squared_diff.mean(axis=1)

# Pair each row's error with its true label.
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})

# ROC curve with the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'], error_df['reconstruction_error']
)
roc_auc = auc(fpr, tpr)  # computed here; not displayed in this cell

error_df.describe()

In [None]:
# Flag a row as fraud (1) when its reconstruction error is strictly above the
# hand-picked threshold, otherwise non-fraud (0).
threshold = 2.5
y_pred = (
    (error_df['reconstruction_error'].values > threshold).astype(int).tolist()
)

In [None]:
# Report accuracy and recall of the thresholded predictions on the test set.
# NOTE(review): if fraud is rare in y_test, accuracy says little on its own --
# confirm the class balance before relying on it.
for metric_label, metric_fn in (("Accuracy: ", accuracy_score),
                                ("Recall: ", recall_score)):
    print(metric_label, metric_fn(y_test, y_pred))

### Undersampled

In [None]:
# Load the undersampled training features and labels, then attach the labels
# as a Class column.
# NOTE(review): und_y is a whole DataFrame; this presumably relies on the
# label CSV holding exactly one column -- verify.
und = pd.read_csv("cardFraud_X_train_undersampled.csv")
und_y = pd.read_csv("cardFraud_Y_train_undersampled.csv")
und = und.assign(Class=und_y)

# Row count and class distribution, followed by a preview of the frame.
print(und.shape[0])
print(und['Class'].value_counts())
und.head()

In [None]:
# Separate the label from the features and hold out 20% of the undersampled
# rows for validation.
y_und = und['Class']
X_und = und.drop(columns=['Class'])
X_train_und, X_val_und, y_train_und, y_val_und = train_test_split(
    X_und, y_und, test_size=0.2, random_state=777
)

In [None]:
# Build the autoencoder for the undersampled data: input -> 14 -> 7 -> 7 -> input_dim.
# Each intermediate tensor is printed right after it is created.
input_dim = X_train_und.shape[1]  # expected to be 29 -- TODO confirm against the CSV
encoding_dim = 14
input_layer_und = Input(shape=(input_dim,))

# Encoder, layer 1: tanh units with an L1 activity penalty (1e-4 == 10e-5).
encoder_und = Dense(
    units=encoding_dim,
    activation='tanh',
    activity_regularizer=regularizers.l1(1e-4),
)(input_layer_und)
print(encoder_und)

# Encoder, layer 2 (bottleneck): half of encoding_dim.
encoder_und = Dense(units=encoding_dim // 2, activation='relu')(encoder_und)
print(encoder_und)

# Decoder, layer 1: same width as the bottleneck.
decoder_und = Dense(units=encoding_dim // 2, activation='tanh')(encoder_und)
print(decoder_und)

# Decoder, layer 2: back to the input width.
# NOTE(review): a ReLU output cannot reproduce negative feature values.
decoder_und = Dense(units=input_dim, activation='relu')(decoder_und)
print(decoder_und)

autoencoder_und = Model(inputs=input_layer_und, outputs=decoder_und)
print(autoencoder_und)

In [None]:
# Training configuration for the undersampled autoencoder.
# A much smaller batch size than the other runs is used here.
nb_epochs = 100
batch_size = 64
autoencoder_und.compile(optimizer = 'adam', loss = 'mean_squared_error',
                   metrics = ['accuracy'])
# Bug fix: this callback previously wrote to 'autoencoder_creditCardFraud.h5',
# the same file as the baseline model's checkpoint, so training this model
# overwrote the baseline's best checkpoint. It now has its own file, named
# like the checkpoints of the other resampled runs.
checkpointer = ModelCheckpoint(filepath = 'creditCardFraud_und.h5',
                              verbose = 0,
                              save_best_only = True)
# TensorBoard callback: logs the graph and images to ./logs, no histograms.
tensorboard = TensorBoard(log_dir = './logs',
                         histogram_freq = 0,
                         write_graph = True,
                         write_images = True)

In [None]:
# Train the undersampled autoencoder on its own input (input == target),
# validating on the held-out undersampled rows.
# NOTE(review): labels are not used to filter X_train_und, so every class
# present in the undersampled set is learned.
training_run_und = autoencoder_und.fit(
    x=X_train_und,
    y=X_train_und,
    epochs=nb_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val_und, X_val_und),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the `.history` attribute; this rebinds the name used by the previous run.
history = training_run_und.history

In [None]:
# Persist the undersampled-data model as it is after the fit() call above.
autoencoder_und.save('creditCardFraud_autoencoder_und.h5')

In [None]:
# Reconstruct the test features from the original-data split with the
# undersampled-data model.
# NOTE(review): confirm that the undersampled CSV shares X_test's columns and
# Amount scaling, and that it contains no rows that also appear in X_test.
predictions_und = autoencoder_und.predict(X_test)

In [None]:
# Per-row reconstruction error of the undersampled-data model on the test set.
squared_diff = np.power(X_test - predictions_und, 2)
mse = squared_diff.mean(axis=1)

# Pair each row's error with its true label (rebinds the earlier error_df).
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})

# ROC curve with the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'], error_df['reconstruction_error']
)
roc_auc = auc(fpr, tpr)  # computed here; not displayed in this cell

error_df.describe()

In [None]:
# Flag rows whose reconstruction error is strictly above the threshold as
# fraud (1), then report accuracy and recall against the true test labels.
threshold = 2.5
y_pred = (
    (error_df['reconstruction_error'].values > threshold).astype(int).tolist()
)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score),
                                ("Recall: ", recall_score)):
    print(metric_label, metric_fn(y_test, y_pred))

### Oversampled

In [None]:
# Load the oversampled training features and labels, then attach the labels
# as a Class column.
# NOTE(review): over_y is a whole DataFrame; this presumably relies on the
# label CSV holding exactly one column -- verify.
over = pd.read_csv("cardFraud_X_train_oversampled.csv")
over_y = pd.read_csv("cardFraud_Y_train_oversampled.csv")
over = over.assign(Class=over_y)

# Row count and class distribution, followed by a preview of the frame.
print(over.shape[0])
print(over['Class'].value_counts())
over.head()

In [None]:
# Separate the label from the features and hold out 20% of the oversampled
# rows for validation.
y_over = over['Class']
X_over = over.drop(columns=['Class'])
X_train_over, X_val_over, y_train_over, y_val_over = train_test_split(
    X_over, y_over, test_size=0.2, random_state=777
)

In [None]:
# Build the autoencoder for the oversampled data: input -> 14 -> 7 -> 7 -> input_dim.
# Each intermediate tensor is printed right after it is created.
input_dim = X_train_over.shape[1]  # expected to be 29 -- TODO confirm against the CSV
encoding_dim = 14
input_layer_over = Input(shape=(input_dim,))

# Encoder, layer 1: tanh units with an L1 activity penalty (1e-4 == 10e-5).
encoder_over = Dense(
    units=encoding_dim,
    activation='tanh',
    activity_regularizer=regularizers.l1(1e-4),
)(input_layer_over)
print(encoder_over)

# Encoder, layer 2 (bottleneck): half of encoding_dim.
encoder_over = Dense(units=encoding_dim // 2, activation='relu')(encoder_over)
print(encoder_over)

# Decoder, layer 1: same width as the bottleneck.
decoder_over = Dense(units=encoding_dim // 2, activation='tanh')(encoder_over)
print(decoder_over)

# Decoder, layer 2: back to the input width.
# NOTE(review): a ReLU output cannot reproduce negative feature values.
decoder_over = Dense(units=input_dim, activation='relu')(decoder_over)
print(decoder_over)

autoencoder_over = Model(inputs=input_layer_over, outputs=decoder_over)
print(autoencoder_over)

In [None]:
# Training configuration for the oversampled autoencoder.
nb_epochs = 100
batch_size = 2048

# Reconstruction is optimized with Adam on mean squared error.
# NOTE(review): 'accuracy' is a classification metric; for a reconstruction
# target it is unlikely to be meaningful -- consider dropping it.
autoencoder_over.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Checkpoint callback: writes the model to disk only when it improves.
checkpointer = ModelCheckpoint(
    filepath='creditCardFraud_over.h5',
    verbose=0,
    save_best_only=True,
)

# TensorBoard callback: logs the graph and images to ./logs, no histograms.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train the oversampled autoencoder on its own input (input == target),
# validating on the held-out oversampled rows.
# NOTE(review): labels are not used to filter X_train_over, so every class
# present in the oversampled set is learned.
training_run_over = autoencoder_over.fit(
    x=X_train_over,
    y=X_train_over,
    epochs=nb_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val_over, X_val_over),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the `.history` attribute; this rebinds the name used by the previous run.
history = training_run_over.history

In [None]:
# Persist the oversampled-data model as it is after the fit() call above.
autoencoder_over.save('creditCardFraud_autoencoder_over.h5')

In [None]:
# Reconstruct the test features from the original-data split with the
# oversampled-data model.
# NOTE(review): confirm that the oversampled CSV shares X_test's columns and
# Amount scaling, and that it contains no rows that also appear in X_test.
predictions_over = autoencoder_over.predict(X_test)

In [None]:
# Per-row reconstruction error of the oversampled-data model on the test set.
squared_diff = np.power(X_test - predictions_over, 2)
mse = squared_diff.mean(axis=1)

# Pair each row's error with its true label (rebinds the earlier error_df).
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})

# ROC curve with the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'], error_df['reconstruction_error']
)
roc_auc = auc(fpr, tpr)  # computed here; not displayed in this cell

error_df.describe()

In [None]:
# Flag rows whose reconstruction error is strictly above the threshold as
# fraud (1), then report accuracy and recall against the true test labels.
threshold = 2.5
y_pred = (
    (error_df['reconstruction_error'].values > threshold).astype(int).tolist()
)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score),
                                ("Recall: ", recall_score)):
    print(metric_label, metric_fn(y_test, y_pred))

### Oversampled v2

In [None]:
# Load the second oversampled training set (features and labels), then attach
# the labels as a Class column.
# NOTE(review): over2_y is a whole DataFrame; this presumably relies on the
# label CSV holding exactly one column -- verify.
over2 = pd.read_csv("cardFraud_X_train_oversampled_v2.csv")
over2_y = pd.read_csv("cardFraud_Y_train_oversampled_v2.csv")
over2 = over2.assign(Class=over2_y)

# Row count and class distribution, followed by a preview of the frame.
print(over2.shape[0])
print(over2['Class'].value_counts())
over2.head()

In [None]:
# Separate the label from the features and hold out 20% of the rows for
# validation.
y_over2 = over2['Class']
X_over2 = over2.drop(columns=['Class'])
X_train_over2, X_val_over2, y_train_over2, y_val_over2 = train_test_split(
    X_over2, y_over2, test_size=0.2, random_state=777
)

In [None]:
# Build the autoencoder for the second oversampled set: input -> 14 -> 7 -> 7 -> input_dim.
# Each intermediate tensor is printed right after it is created.
input_dim = X_train_over2.shape[1]  # expected to be 29 -- TODO confirm against the CSV
encoding_dim = 14
input_layer_over2 = Input(shape=(input_dim,))

# Encoder, layer 1: tanh units with an L1 activity penalty (1e-4 == 10e-5).
encoder_over2 = Dense(
    units=encoding_dim,
    activation='tanh',
    activity_regularizer=regularizers.l1(1e-4),
)(input_layer_over2)
print(encoder_over2)

# Encoder, layer 2 (bottleneck): half of encoding_dim.
encoder_over2 = Dense(units=encoding_dim // 2, activation='relu')(encoder_over2)
print(encoder_over2)

# Decoder, layer 1: same width as the bottleneck.
decoder_over2 = Dense(units=encoding_dim // 2, activation='tanh')(encoder_over2)
print(decoder_over2)

# Decoder, layer 2: back to the input width.
# NOTE(review): a ReLU output cannot reproduce negative feature values.
decoder_over2 = Dense(units=input_dim, activation='relu')(decoder_over2)
print(decoder_over2)

autoencoder_over2 = Model(inputs=input_layer_over2, outputs=decoder_over2)
print(autoencoder_over2)

In [None]:
# Training configuration for the second oversampled autoencoder.
nb_epochs = 100
batch_size = 2048

# Reconstruction is optimized with Adam on mean squared error.
# NOTE(review): 'accuracy' is a classification metric; for a reconstruction
# target it is unlikely to be meaningful -- consider dropping it.
autoencoder_over2.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Checkpoint callback: writes the model to disk only when it improves.
checkpointer = ModelCheckpoint(
    filepath='creditCardFraud_over2.h5',
    verbose=0,
    save_best_only=True,
)

# TensorBoard callback: logs the graph and images to ./logs, no histograms.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train the second oversampled autoencoder on its own input (input == target),
# validating on the held-out rows of the same set.
# NOTE(review): labels are not used to filter X_train_over2, so every class
# present in this set is learned.
training_run_over2 = autoencoder_over2.fit(
    x=X_train_over2,
    y=X_train_over2,
    epochs=nb_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val_over2, X_val_over2),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the `.history` attribute; this rebinds the name used by the previous run.
history = training_run_over2.history

In [None]:
# Persist the second oversampled-data model as it is after the fit() call above.
autoencoder_over2.save('creditCardFraud_autoencoder_over2.h5')

In [None]:
# Reconstruct the test features from the original-data split with the second
# oversampled-data model.
# NOTE(review): confirm that this CSV shares X_test's columns and Amount
# scaling, and that it contains no rows that also appear in X_test.
predictions_over2 = autoencoder_over2.predict(X_test)

In [None]:
# Per-row reconstruction error of the second oversampled-data model on the test set.
squared_diff = np.power(X_test - predictions_over2, 2)
mse = squared_diff.mean(axis=1)

# Pair each row's error with its true label (rebinds the earlier error_df).
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})

# ROC curve with the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'], error_df['reconstruction_error']
)
roc_auc = auc(fpr, tpr)  # computed here; not displayed in this cell

error_df.describe()

In [None]:
# Flag rows whose reconstruction error is strictly above the threshold as
# fraud (1), then report accuracy and recall against the true test labels.
threshold = 2.5
y_pred = (
    (error_df['reconstruction_error'].values > threshold).astype(int).tolist()
)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score),
                                ("Recall: ", recall_score)):
    print(metric_label, metric_fn(y_test, y_pred))

### Oversampled SMOTE

In [None]:
# Load the synthetically oversampled training features and labels, then attach
# the labels as a Class column.
# NOTE(review): sin_y is a whole DataFrame; this presumably relies on the
# label CSV holding exactly one column -- verify.
sin = pd.read_csv("cardFraud_X_train_oversampled_syntethic.csv")
sin_y = pd.read_csv("cardFraud_Y_train_oversampled_syntethic.csv")
sin = sin.assign(Class=sin_y)

# Row count and class distribution, followed by a preview of the frame.
print(sin.shape[0])
print(sin['Class'].value_counts())
sin.head()

In [None]:
# Separate the label from the features and hold out 20% of the synthetic
# oversampled rows for validation.
y_sin = sin['Class']
X_sin = sin.drop(columns=['Class'])
X_train_sin, X_val_sin, y_train_sin, y_val_sin = train_test_split(
    X_sin, y_sin, test_size=0.2, random_state=777
)

In [None]:
# Build the autoencoder for the synthetic oversampled data: input -> 14 -> 7 -> 7 -> input_dim.
# Each intermediate tensor is printed right after it is created.
input_dim = X_train_sin.shape[1]  # expected to be 29 -- TODO confirm against the CSV
encoding_dim = 14
input_layer_sin = Input(shape=(input_dim,))

# Encoder, layer 1: tanh units with an L1 activity penalty (1e-4 == 10e-5).
encoder_sin = Dense(
    units=encoding_dim,
    activation='tanh',
    activity_regularizer=regularizers.l1(1e-4),
)(input_layer_sin)
print(encoder_sin)

# Encoder, layer 2 (bottleneck): half of encoding_dim.
encoder_sin = Dense(units=encoding_dim // 2, activation='relu')(encoder_sin)
print(encoder_sin)

# Decoder, layer 1: same width as the bottleneck.
decoder_sin = Dense(units=encoding_dim // 2, activation='tanh')(encoder_sin)
print(decoder_sin)

# Decoder, layer 2: back to the input width.
# NOTE(review): a ReLU output cannot reproduce negative feature values.
decoder_sin = Dense(units=input_dim, activation='relu')(decoder_sin)
print(decoder_sin)

autoencoder_sin = Model(inputs=input_layer_sin, outputs=decoder_sin)
print(autoencoder_sin)

In [None]:
# Training configuration for the synthetic oversampled autoencoder.
nb_epochs = 100
batch_size = 2048

# Reconstruction is optimized with Adam on mean squared error.
# NOTE(review): 'accuracy' is a classification metric; for a reconstruction
# target it is unlikely to be meaningful -- consider dropping it.
autoencoder_sin.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Checkpoint callback: writes the model to disk only when it improves.
checkpointer = ModelCheckpoint(
    filepath='creditCardFraud_sin.h5',
    verbose=0,
    save_best_only=True,
)

# TensorBoard callback: logs the graph and images to ./logs, no histograms.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train the synthetic oversampled autoencoder on its own input (input == target),
# validating on the held-out rows of the same set.
# NOTE(review): labels are not used to filter X_train_sin, so every class
# present in this set is learned.
training_run_sin = autoencoder_sin.fit(
    x=X_train_sin,
    y=X_train_sin,
    epochs=nb_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val_sin, X_val_sin),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the `.history` attribute; this rebinds the name used by the previous run.
history = training_run_sin.history

In [None]:
# Persist the synthetic oversampled-data model as it is after the fit() call above.
autoencoder_sin.save('creditCardFraud_autoencoder_sin.h5')

In [None]:
# Reconstruct the test features from the original-data split with the
# synthetic oversampled-data model.
# NOTE(review): confirm that this CSV shares X_test's columns and Amount
# scaling, and that it contains no rows that also appear in X_test.
predictions_sin = autoencoder_sin.predict(X_test)

In [None]:
# Per-row reconstruction error of the synthetic oversampled-data model on the test set.
squared_diff = np.power(X_test - predictions_sin, 2)
mse = squared_diff.mean(axis=1)

# Pair each row's error with its true label (rebinds the earlier error_df).
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})

# ROC curve with the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'], error_df['reconstruction_error']
)
roc_auc = auc(fpr, tpr)  # computed here; not displayed in this cell

error_df.describe()

In [None]:
# Flag rows whose reconstruction error is strictly above the threshold as
# fraud (1), then report accuracy and recall against the true test labels.
threshold = 2.5
y_pred = (
    (error_df['reconstruction_error'].values > threshold).astype(int).tolist()
)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score),
                                ("Recall: ", recall_score)):
    print(metric_label, metric_fn(y_test, y_pred))

### Explanations

The explanation algorithms are the same as those used in the creditCardFraud_tfNN notebook, because both notebooks use Keras models.