# Credit Card Fraud Detection
### Based on the Kaggle [Fraud Detection Data](https://www.kaggle.com/code/zwhjorth/dnn-svm-and-dt-for-fraud-detection)

In [None]:
# Install all requirements needed to train this model and track it in MLFlow.
# pip itself is upgraded first (-U), then the packages listed in requirements.txt are installed.
# Both commands run quietly (-q) to keep the cell output short.
!pip install pip -qU
!pip install -r requirements.txt -q

In [None]:
# Import the dependencies we need to run the code.
# Note: the scikit-learn distribution is installed as `scikit-learn` but is imported as `sklearn`;
# a hyphenated module name is not valid Python syntax.

import os
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
import tf2onnx

import mlflow
from sklearn.linear_model import LogisticRegression

import keras

### Get the external route for the MLFlow Server

In [None]:
# Read the MLFlow tracking address from the MLFLOW_ROUTE environment variable.
# The expected format is http://<route-to-mlflow>:<port>, where <route-to-mlflow> and <port>
# are replaced with the specifics of where your MLFlow instance is set up and/or exposed.
# The value is None when the variable is not set.

MLFLOW_ROUTE = os.environ.get("MLFLOW_ROUTE")

### Load the data into a pandas dataframe.

In [None]:
# Load the CSV data which we will use to train the model.
# The path is relative to the directory the notebook is run from.
# It contains the following fields:
# NOTE(review): the field names below look like they lost their underscores
# (e.g. `distance_from_home`) - confirm against the CSV header.
#   distancefromhome - The distance from home where the transaction happened.
#   distancefromlast_transaction - The distance from where the last transaction happened.
#   ratiotomedianpurchaseprice - Ratio of the purchase price compared to the median purchase price.
#   repeat_retailer - If it's from a retailer that has already been purchased from before.
#   used_chip - If the (credit card) chip was used.
#   usedpinnumber - If the PIN number was used.
#   online_order - If it was an online order.
#   fraud - If the transaction is fraudulent.

Data = pd.read_csv('../data/card_transdata.csv')
# Display the first rows as a quick sanity check of the loaded data.
Data.head()

### Split the data into training and test sets.

In [None]:
# Separate the model inputs (X) from the target (y).
# `fraud` is the only label column; every other column is used as an input feature.

X = Data.drop('fraud', axis=1)
y = Data['fraud']

# Hold out 20% of the rows as the test set, then take 20% of what remains as the validation set.
# Both splits are stratified on the label.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train
)

# Standardize the features to zero mean and unit variance, which is easier for the model to learn
# from than raw, potentially large values.
# The scaler is fitted on the training data only: fitting on all data would leak information about
# the validation/test distribution into training. The same fitted scaler is reused later to
# transform the validation and test sets.

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# The dataset is unbalanced (many more non-fraud than fraud transactions), so compute 'balanced'
# class weights that weight the rare fraudulent transactions higher during training.
# The weights are stored as a dict keyed by class position (0, 1, ...).

balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train
)
class_weights = dict(enumerate(balanced_weights))

### Build the DNN model

In [None]:
# Build a simple fully connected deep neural network: three hidden Dense layers of 32 units
# followed by a single-unit sigmoid output layer.
# Layers are created in the same order they are stacked.

layer_stack = [
    # First hidden layer; its input width is the number of feature columns.
    Dense(32, name='dense', activation='relu', input_dim=len(X.columns)),
    Dropout(0.2),
]

# The second and third hidden layers share the same structure:
# Dense -> BatchNormalization -> ReLU -> Dropout.
for hidden_name in ('dense_02', 'dense_03'):
    layer_stack.extend([
        Dense(32, name=hidden_name),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.2),
    ])

# Output layer: one sigmoid unit for the binary fraud / non-fraud target.
layer_stack.append(Dense(1, name='dense_04', activation='sigmoid'))

model = Sequential(layer_stack)
model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Configure MLFlow

In [None]:
# Point MLFlow at the tracking route read earlier and select the experiment.
# The experiment and the registered model share the name DNN-credit-card-fraud.
# Autologging is enabled so the model and standard metrics do not have to be logged manually;
# only custom metrics need explicit logging.

mlflow_name = "DNN-credit-card-fraud"

mlflow.set_tracking_uri(MLFLOW_ROUTE)
mlflow.set_experiment(mlflow_name)
mlflow.tensorflow.autolog(registered_model_name=mlflow_name)

### Train the model, plot the confusion matrix and push the artifacts to MLFlow.

In [None]:
# Train the model inside an MLFlow run so that everything below is recorded as one experiment run.
# After training, the test set is scored, a confusion matrix is plotted, and its four cells are
# logged to MLFlow as custom metrics. The model is also converted to ONNX and logged.

with mlflow.start_run():
    epochs = 2
    # Probability above which a transaction is labelled as fraud.
    threshold = 0.995

    # Validation and test data are scaled with the scaler fitted on the training data.
    X_val_scaled = scaler.transform(X_val)
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_val_scaled, y_val),
        verbose=True,
        class_weight=class_weights,
    )

    X_test_scaled = scaler.transform(X_test)
    y_pred_temp = model.predict(X_test_scaled)

    # Turn the predicted probabilities into hard 0/1 labels.
    y_pred = (y_pred_temp > threshold).astype(int)

    c_matrix = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(c_matrix, annot=True, cbar=False, cmap='Blues')
    ax.set(xlabel="Prediction", ylabel="Actual", title='Confusion Matrix')
    plt.show()

    # Flattening the 2x2 matrix row by row yields (tn, fp, fn, tp).
    t_n, f_p, f_n, t_p = c_matrix.ravel()
    for metric_key, metric_value in (("tn", t_n), ("fp", f_p), ("fn", f_n), ("tp", t_p)):
        mlflow.log_metric(metric_key, metric_value)

    # Convert the trained Keras model to ONNX and attach it to the run under "models".
    model_proto, _ = tf2onnx.convert.from_keras(model)
    mlflow.onnx.log_model(model_proto, "models")
    

### Save the model locally in ONNX format.

In [None]:
# Write the ONNX model produced in the training cell (`model_proto`) to `fraud.onnx`
# in the current working directory.
import onnx
onnx.save(model_proto, "fraud.onnx")

### Save the model locally in TensorFlow's **SavedModel** format. This is useful for serving the model with model servers such as Triton.

In [None]:
# Save the trained model under fraud/1.
# NOTE(review): Keras 3.x appears to reject save paths without a `.keras`/`.h5` extension and to
# offer `model.export()` for SavedModel output instead - confirm the Keras version installed
# from requirements.txt.
model.save(filepath="fraud/1")