In [None]:
"""
based on: https://www.kaggle.com/code/zwhjorth/dnn-svm-and-dt-for-fraud-detection
"""

In [None]:
# Install the notebook's dependencies. The %pip magic (rather than !pip) makes
# sure the packages land in the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import os
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns

import mlflow
from sklearn.linear_model import LogisticRegression

In [None]:
# Address of the MLflow tracking server, taken from the environment.
# Evaluates to None when the MLFLOW_ROUTE variable is not set.
MLFLOW_ROUTE = os.environ.get("MLFLOW_ROUTE")

In [None]:
# Read the card-transaction dataset from disk and preview its first rows.
csv_path = './data/card_transdata.csv'
Data = pd.read_csv(csv_path)
Data.head(5)

In [None]:
# Add a base-10 logarithm column for each of the three distance/ratio features.
# NOTE: these Log_* columns are dropped again when the feature matrix X is
# built, so they do not feed the model.
for log_col, source_col in (
    ('Log_home', 'distance_from_home'),
    ('Log_trans', 'distance_from_last_transaction'),
    ('Log_ratio', 'ratio_to_median_purchase_price'),
):
    Data[log_col] = np.log10(Data[source_col])

In [None]:
# Feature matrix: every column except the label and the derived Log_* columns.
X = Data.drop(columns=['fraud', 'Log_home', 'Log_trans', 'Log_ratio'])
y = Data['fraud']

# Fixed seed so that both splits below are identical on every
# "Restart Kernel -> Run All" (without it the partitions, and therefore the
# logged metrics, change from run to run).
SEED = 42

# Splitting the data into test and train (20% held out for testing),
# stratified on the label so each part keeps the same fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Carve a validation set (20% of the remaining rows) out of the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED
)

# It is important to only fit the scaler to the training data, otherwise you are leaking
# information about the global distribution of variables (which is influenced by the test set)
# into the train set.
# X_val and X_test are intentionally left unscaled here; they are passed
# through scaler.transform(...) at the point of use.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Getting class weights: 'balanced' weights computed from the training labels,
# stored as {class_index: weight} for model.fit(class_weight=...).
# The index follows the sorted order of np.unique(y_train); this lines up with
# the labels themselves assuming 'fraud' is coded 0/1 -- TODO confirm.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)
class_weights = dict(enumerate(class_weights))

In [None]:
# Fully connected binary classifier: three hidden layers of 32 units, each
# followed by 20% dropout (the 2nd and 3rd also use batch normalisation before
# the ReLU), and a single sigmoid output unit.
n_features = X.shape[1]

model = Sequential([
    # Hidden layer 1
    Dense(32, activation='relu', input_dim=n_features),
    Dropout(0.2),
    # Hidden layer 2
    Dense(32),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),
    # Hidden layer 3
    Dense(32),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),
    # Output layer
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Point MLflow at the tracking server and select the experiment; the same name
# is used for the experiment and for the registered model.
mlflow_name = "DNN-credit-fraud"

mlflow.set_tracking_uri(MLFLOW_ROUTE)
mlflow.set_experiment(mlflow_name)
# Turn on TensorFlow/Keras autologging for the training cell that follows.
mlflow.tensorflow.autolog(registered_model_name=mlflow_name)

In [None]:
# Train the network inside a single MLflow run, then evaluate it on the
# held-out test set and log the confusion-matrix counts.
with mlflow.start_run():
    # mlflow.tensorflow.autolog() is enabled in the previous cell, so the fit
    # itself is not logged by hand; only the extra evaluation metrics are.
    epochs = 2
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        # The scaler was fitted on the training data only; apply it to the
        # validation features here.
        validation_data=(scaler.transform(X_val), y_val),
        verbose=True,
        class_weight=class_weights,
    )

    # Sigmoid outputs of the network for the (scaled) test features.
    y_pred_temp = model.predict(scaler.transform(X_test))

    # Decision cut-off: an output is labelled as fraud (1) only when it is
    # strictly above this value.
    threshold = 0.995

    y_pred = np.where(y_pred_temp > threshold, 1, 0)

    # Pin the label order so the matrix is always 2x2 (rows = actual,
    # columns = predicted), even if one class is absent from both y_test and
    # y_pred; otherwise the 4-way unpacking below would fail.
    c_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # fmt='d' prints the integer counts in full instead of seaborn's default
    # short format, which shows large counts in scientific notation.
    ax = sns.heatmap(c_matrix, annot=True, fmt='d', cbar=False, cmap='Blues')
    ax.set_xlabel("Prediction")
    ax.set_ylabel("Actual")
    ax.set_title('Confusion Matrix')
    plt.show()

    # Row-major order of a 2x2 confusion matrix with labels [0, 1]:
    # true negatives, false positives, false negatives, true positives.
    t_n, f_p, f_n, t_p = c_matrix.ravel()
    mlflow.log_metric("tn", int(t_n))
    mlflow.log_metric("fp", int(f_p))
    mlflow.log_metric("fn", int(f_n))
    mlflow.log_metric("tp", int(t_p))

    
    