# **Fraud Detection Project**

### Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics, model_selection
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Import libraries used for Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

In [None]:
# Load the transaction data. Cells holding '?' are parsed as missing values,
# and every row that contains a missing value is discarded.
df = pd.read_csv("fraud_data1.csv", na_values='?').dropna()

### Correlation Matrix Between Features Using HeatMap

In [None]:
# Pairwise correlation between all columns of df.
correlation = df.corr()
# Correlation of every column with the target, strongest first. Only the last
# expression of a cell is displayed, so this series has to be printed
# explicitly; otherwise the sorted result is silently discarded.
print(correlation["Class"].sort_values(ascending=False))
# Plot the full correlation matrix as a heatmap.
plt.figure(figsize=(20, 9))
plt.title("Credit Card Transactions Features Correlation Plot", fontsize=18)
sns.heatmap(
    correlation,
    xticklabels=correlation.columns,
    yticklabels=correlation.columns,
    linewidth=2,
    cmap="Oranges",
    cbar=True,
)
plt.show()

### Eliminating features whose correlation value is greater than 0.75 

In [None]:
# Correlation above which the later feature of a pair is treated as redundant.
CORR_THRESHOLD = 0.75
# Number of features kept by the chi-squared selection step.
N_SELECTED_FEATURES = 21

# Greedy removal of highly correlated features: walk the feature columns in
# order and keep a column only if its correlation with every column kept so
# far does not exceed the threshold. The correlation matrix is computed once,
# nothing is removed from a list while it is being iterated, and the target
# column "Class" is never a candidate for removal.
feature_names = [col for col in df.columns if col != "Class"]
feature_corr = df[feature_names].corr()
kept_features = []
for column in feature_names:
    is_redundant = any(
        feature_corr.loc[column, kept] > CORR_THRESHOLD for kept in kept_features
    )
    if not is_redundant:
        kept_features.append(column)

# Keeping only the retained features plus the target column
columns = kept_features + ["Class"]
df = df[columns]

features = df.drop(["Class"], axis=1)
# Scale the features to [0, 1] with a MinMax scaler; the chi-squared test
# below requires non-negative inputs.
mm_scaler = MinMaxScaler((0,1))
X = mm_scaler.fit_transform(features)

#<--> Lines (31-33) Cited

# Using Chi Squared to keep the highest-scoring features. k is capped at the
# number of available features so SelectKBest cannot be asked for more
# columns than remain after the correlation filter.
selector = SelectKBest(chi2, k=min(N_SELECTED_FEATURES, features.shape[1]))
selector.fit(X, df["Class"])
# Boolean mask over the columns of `features` (True = selected)
filtered_columns = selector.get_support()

# Adding Class column back to dataframe
filtered_data = features.loc[:, filtered_columns]
df = filtered_data.join(df["Class"])

print("After filtering highly correlated features, dataset is ", df.shape)

### Observation
Let's observe the dataset and look for a connection between fraudulent and normal transactions.

In [None]:
# Split the transactions by label and report the share of fraudulent ones.
frauds = df[df["Class"] == 1]
normals = df[df["Class"] == 0]
total = len(df)
fraud_ratio = len(frauds) / total
fraud_percent = round(fraud_ratio * 100, 2)
print("The percentage of fraudulant transactions is: ", fraud_percent)

Because fraudulent transactions make up only a tiny fraction of the data, the dataset is strongly imbalanced, so we might apply the following: 
1. Increase the number of samples from the minority class
2. Decrease the number of samples from the majority class \
\
We can also penalize the algorithm to achieve better scores on the following metrics: Confusion Matrix, Precision, Recall, F1-score, AUC.

In [None]:
# Helper that draws one labelled line chart on the current pyplot axes
def plot_line(x, y, title="", xlabel="", ylabel="", color=""):
    """Plot ``y`` against ``x`` and label the chart.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
        title: chart title (font size 14).
        xlabel: horizontal axis label (font size 12).
        ylabel: vertical axis label (font size 12).
        color: passed to ``plt.plot`` as its third positional argument, so it
            may carry marker/line style as well (e.g. ``"g-o"``).

    Returns:
        None. Both axes are switched to scientific tick notation.
    """
    plt.plot(x, y, color)
    # (pyplot setter, text, font size) applied in order: title, x label, y label
    labelled_parts = (
        (plt.title, title, 14),
        (plt.xlabel, xlabel, 12),
        (plt.ylabel, ylabel, 12),
    )
    for apply_text, text, font_size in labelled_parts:
        apply_text(text, fontsize=font_size)
    for axis_name in ("x", "y"):
        plt.ticklabel_format(axis=axis_name, style="sci", scilimits=(0, 0))

plt.figure(figsize=(16, 7))
# One panel per class, side by side:
# (subplot position, transactions, panel title, matplotlib format string)
panels = (
    (1, normals, "Normal Transactions over Time", "g-o"),
    (2, frauds, "Fraudulent Transactions over Time", "r-o"),
)
for position, transactions, panel_title, line_format in panels:
    # Amount vs. time chart for this class of transactions
    plt.subplot(1, 2, position)
    plot_line(
        x=transactions["Time"],
        y=transactions["Amount"],
        title=panel_title,
        xlabel="Time [s]",
        ylabel="Transactions Amount",
        color=line_format,
    )

In [None]:
# Standardise the Amount column and write the result back into df.
amount_vals = df['Amount'].values
std_scaler = StandardScaler()
# The scaler works on 2-D input, so present the values as a single column.
df['Amount'] = std_scaler.fit_transform(amount_vals[:, np.newaxis])

### Splitting Data & Creating Training Structures

In [None]:
# The full dataset is large and the models take a long time to run on it, so we
# reduce it to all fraudulent rows plus 19,502 randomly chosen normal rows.
# Seed shared by every random step in this cell so that the subsample, the
# shuffle and the splits are reproducible across runs.
SEED = 42
# Number of normal (Class == 0) transactions kept in the reduced dataset.
N_NORMAL_SAMPLES = 19502

# Re-select the fraud rows from the current df. The `frauds` frame built in the
# observation cell predates the Amount standardisation, so reusing it would mix
# unscaled fraud amounts with scaled normal amounts.
frauds = df.loc[df['Class'] == 1]
indices = np.array(df.loc[df['Class'] == 0].index)
rng = np.random.default_rng(SEED)
# Never ask for more rows than exist.
rands = rng.choice(indices, min(N_NORMAL_SAMPLES, len(indices)), replace=False)
# `rands` holds index *labels*, so select with .loc. After dropna() the labels
# are no longer guaranteed to equal row positions, so .iloc could pick the
# wrong rows.
random_normals = df.loc[rands, :]
frames = [frauds, random_normals]
df = pd.concat(frames)
# Shuffle the rows so frauds and normals are interleaved.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

y = np.array(df['Class'])
X = np.array(df.loc[:, df.columns != 'Class'])

# First split: hold out 20% as a test set that is never used for tuning.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, shuffle=True
)
# Second split: carve a validation set out of the remaining training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=SEED, shuffle=True
)

print("Before oversampling the data: ")
print("X_train Shape: ", X_train.shape)
print("X_val Shape: ", X_val.shape)
print("X_test Shape: ", X_test.shape)
print("\n")

# Oversample the minority class in the training set only. RandomOverSampler
# re-draws existing minority rows (with replacement); it does not synthesise
# new samples. Validation and test sets keep their natural class balance.
ros = RandomOverSampler(sampling_strategy="minority", random_state=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

print("After oversampling the data: ")
print("X_over_train Shape: ", X_train.shape)
print("X_val Shape: ", X_val.shape)
print("X_test Shape: ", X_test.shape)

### Logistic Regression

In [None]:
# Let's start by testing logistic regression recall without regularization on
# polynomial feature expansions of degree 1 to 3.
for i in range(1, 4):
    poly = PolynomialFeatures(i)
    X_train_new = poly.fit_transform(X_train)
    # Apply the expansion fitted on the training data instead of re-fitting it
    # on the validation data.
    X_val_new = poly.transform(X_val)
    # NOTE(review): newer scikit-learn releases expect penalty=None instead of
    # the string 'none' -- confirm against the installed version.
    logreg = LogisticRegression(max_iter=100000, penalty='none')
    logreg.fit(X_train_new, y_train)
    y_pred_train = logreg.predict(X_train_new)
    y_pred_val = logreg.predict(X_val_new)
    print(f'Validation Recall of logistic regression, no regularization, and transformation X^{i}: {recall_score(y_val, y_pred_val)}')
    print(f'Train Recall of logistic regression, no regularization, and transformation X^{i}: {recall_score(y_train, y_pred_train)}')

# We see that polynomial transformation of degree 1 (i.e no transformation) performs best, 
# so will use this for the next step.

# The next step will be using K-fold cross validation to help us identify the best hyperparameters
# as well as see if we can get our model's performance to improve and decrease overfitting by
# adding regularization.

# best_score = [best validation recall, C chosen for it, regularization order (1 or 2)]
best_score = [0, 0, 0]
best_y_pred = None

print("--------------------")
C = [0.001, 0.1, 1, 10, 100, 1000, 10000]
# (regularization order used in the report, extra estimator arguments).
# The estimator's default penalty is L2; L1 needs the liblinear solver here.
penalty_settings = [
    (2, {}),
    (1, {"penalty": "l1", "solver": "liblinear"}),
]
for k in range(4):
    n_folds = k + 3
    for reg_order, extra_params in penalty_settings:
        logreg_cv = LogisticRegressionCV(Cs=C, cv=n_folds, max_iter=100000, **extra_params)
        logreg_cv.fit(X_train, y_train)
        y_pred = logreg_cv.predict(X_val)
        # The reported and compared score is recall, not accuracy.
        score = recall_score(y_val, y_pred)
        chosen_C = max(logreg_cv.C_)
        print(f'Logistic Regression Recall with K^{n_folds}, L{reg_order} Regularization and C={chosen_C} is: {score}')

        # Keep the first configuration that reaches the highest validation recall.
        if score > best_score[0]:
            best_score[0], best_score[1] = score, chosen_C
            best_score[2] = reg_order
            best_y_pred = y_pred

print("-------------")
print(f"Our best results were L{best_score[2]} Regularization with C={best_score[1]} and score = {best_score[0]}")

In [None]:
import seaborn as sns
from sklearn import metrics

# Confusion matrix of the best logistic-regression predictions on the
# validation set, drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_val, best_y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt=".3f",
    cmap='Blues_r',
)
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Recall Score: {best_score[0]}'
plt.title(all_sample_title)

In [None]:
# Validation-set summary for the best logistic-regression predictions,
# one line per metric, each rounded to three decimals.
metric_report = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('Accuracy', accuracy_score),
    ('F1-Score', f1_score),
)
for metric_label, metric_fn in metric_report:
    print('%s: %.3f' % (metric_label, metric_fn(y_val, best_y_pred)))

### Support Vector Machine (SVM)

For the SVM, we will try a variety of feature transformations and regularization techniques using sklearn's SVM library including the following:
- Linear Kernel and L1 (Lasso regression)
- Polynomial, degree 2, L2 regularization
- Gaussian RBF with L2 regularization

For each of these variations we will plot the train and validation recall across different choices of the regularization hyperparameter C.

In [None]:
# Import the kernel SVM (SVC) and linear SVM (LinearSVC) classifiers.
from sklearn.svm import SVC, LinearSVC

# Grid of values swept in each SVM experiment below. Despite the name, the
# SVM cells pass each value as the estimator's C argument.
my_lambdas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

We'll start off by testing a linear SVM with lasso (L1) regularization.

In [None]:
import warnings

# Train/validation recall of a linear SVM for each value of C.
val_results, train_results = [], []

for C in my_lambdas:
    # Silence warnings raised while fitting (e.g. non-convergence) so the
    # per-C report stays readable.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # LinearSVC defaults to an L2 penalty, so L1 has to be requested
        # explicitly. The L1 penalty is only supported together with the
        # squared hinge loss and the primal formulation (dual=False).
        svm = LinearSVC(C=C, penalty='l1', loss='squared_hinge', dual=False)
        svm.fit(X_train, y_train)

        y_pred_train = svm.predict(X_train)
        y_pred_val = svm.predict(X_val)

        train_score = recall_score(y_train, y_pred_train)
        val_score = recall_score(y_val, y_pred_val)
        print(f"C={C}, Train Recall = {train_score}, Validation Recall ={val_score}")

    train_results.append(train_score)
    val_results.append(val_score)


plt.xscale("log")
plt.plot(my_lambdas, train_results, "hotpink", label="train")
plt.plot(my_lambdas, val_results, "teal", label="val")
plt.legend(loc="lower right")
plt.xlabel("Values of C")
# The plotted scores are recall values, so label the axis accordingly.
plt.ylabel("Recall")
plt.title("SVM with Linear Kernel and L1 Lasso")
plt.show()

We can move on to testing a new kernel function: a polynomial of degree 2, since we already have lots of features. This will use L2 regularization.

In [None]:
# Train/validation recall of a degree-2 polynomial-kernel SVM for each C.
val_results, train_results = [], []

for C in my_lambdas:
    svm = SVC(C=C, kernel='poly', degree=2) # L2 regularization added by default
    svm.fit(X_train, y_train)

    y_pred_train = svm.predict(X_train)
    y_pred_val = svm.predict(X_val)

    train_score = recall_score(y_train, y_pred_train)
    val_score = recall_score(y_val, y_pred_val)
    print(f"C={C}, Train Recall = {train_score}, Validation Recall = {val_score}")

    train_results.append(train_score)
    val_results.append(val_score)

plt.xscale("log")
plt.plot(my_lambdas, train_results, "hotpink", label="train")
plt.plot(my_lambdas, val_results, "teal", label="val")
plt.legend(loc="upper left")
plt.xlabel("Values of C")
# The plotted scores are recall values, so label the axis accordingly.
plt.ylabel("Recall")
plt.title("SVM with Polynomial Kernel and L2 Ridge")
plt.show()

Finally, we'll test the Gaussian RBF kernel with ridge (L2) regularization.

In [None]:
# Train/validation recall of an RBF-kernel SVM for each C.
val_results, train_results = [], []

for C in my_lambdas:
    svm = SVC(C=C) # Default SVC uses RBF and L2
    svm.fit(X_train, y_train)

    y_pred_train = svm.predict(X_train)
    y_pred_val = svm.predict(X_val)

    train_score = recall_score(y_train, y_pred_train)
    val_score = recall_score(y_val, y_pred_val)
    print(f"C={C}, Train Recall = {train_score}, Validation Recall = {val_score}")

    train_results.append(train_score)
    val_results.append(val_score)

plt.xscale("log")
plt.plot(my_lambdas, train_results, "hotpink", label="train")
plt.plot(my_lambdas, val_results, "teal", label="val")
plt.legend(loc="upper left")
plt.xlabel("Values of C")
# The plotted scores are recall values, so label the axis accordingly.
plt.ylabel("Recall")
plt.title("SVM with RBF and L2 Ridge")
plt.show()


### Function to Plot by Passing Arguments

In [None]:
def save_figure(C, train_results, val_results, title, path):
    """Plot train/validation recall against C and save the chart to disk.

    Args:
        C: values for the (log-scaled) horizontal axis.
        train_results: one training recall per entry of ``C``.
        val_results: one validation recall per entry of ``C``.
        title: chart title; also used as the output file name.
        path: directory that receives the image. It is created if missing.

    Returns:
        None. The current pyplot figure is closed after saving.
    """
    # Local import: this notebook only imports os at module level in a later
    # cell, so the function must not rely on that.
    import os

    # savefig does not create missing directories, so make sure the target
    # exists before drawing anything.
    if path:
        os.makedirs(path, exist_ok=True)

    plt.xscale("log")
    plt.plot(C, train_results, "hotpink", label="train")
    plt.plot(C, val_results, "teal", label="val")
    plt.legend(loc="lower right")
    plt.xlabel("Values of C")
    plt.ylabel("Recall")
    plt.title(title)
    # NOTE(review): the title is used verbatim as the file name; characters
    # such as ':' are not valid in file names on every platform.
    plt.savefig(os.path.join(path, title) if path else '/' + title, dpi=500)
    plt.close()
    return None

### Artificial Neural Network (ANN)

In [None]:
# ANN sweep: an input Dense layer, an optional hidden Dense layer and a
# single-unit sigmoid output, trained for several regularization strengths.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers, metrics, backend
import os

# Declaring path to figure directory and making sure it exists, since
# savefig does not create missing directories.
curr_dir = os.getcwd()
figure_path = curr_dir+'/'+"ANN Figures"
os.makedirs(figure_path, exist_ok=True)

# Each network is regularized with strength 1/C.
C = [0.1, 10, 1000, 100000]

# One row per architecture: a description followed by the validation recall
# obtained for each value of C.
scores = pd.DataFrame(columns=["C"] + [str(c_val) for c_val in C])
best_recall = 0
# Initialised up front so later cells fail clearly (None) instead of with a
# NameError when no configuration qualifies as "best".
best_model = None
best_y_pred = None

# Width of the input layer's input, taken from the data instead of a
# hard-coded feature count.
n_features = X_train.shape[1]

# We're going to use only ReLU activation as sigmoid and others performed poorly
# Input layer params: (label, units, input size, regularizer class, regularizer label)
input_layers = [("Input 200", 200, n_features, regularizers.L2, "L2"),
                #("Input 200", 200, n_features, regularizers.L1, "L1"),
                #("Input 250", 250, n_features, regularizers.L2, "L2")
                ("Input 250", 250, n_features, regularizers.L1, "L1")
                ]
# Hidden layer params: (label, units, regularizer class, regularizer label)
hidden_layers = [("None", None, None, "None"),
                ("Hidden 100", 100,  regularizers.L1, "L1"),
                #("Hidden 150", 150,  regularizers.L1, "L1"),
                #("Hidden 100", 100,  regularizers.L2, "L2"),
                ("Hidden 150", 150,  regularizers.L2, "L2")
                ]

epochs = 2

for input_name, input_layer_size, input_size, input_reg, ireg_name in input_layers:

    for hidden_name, hidden_layer_size, hidden_reg, hreg_name in hidden_layers:
        # First entry describes the architecture; recalls are appended below.
        model_state = [str(input_layer_size)+", "+str(hidden_layer_size)+"\n"+ireg_name+" "+hreg_name]
        name = "{}, Reg: {} Hidden: {}, Reg: {}".format(input_name, ireg_name, hidden_name, hreg_name)
        val_results, train_results = [], []
        # Trying different C values
        for c_val in C:
            # Build a brand-new model for every configuration so that no
            # layers, weights or optimizer state carry over between runs.
            backend.clear_session()
            model = Sequential()
            # Adding an input layer with params from input_layers
            model.add(Dense(input_layer_size, input_shape=(input_size,), kernel_regularizer = input_reg(1/c_val), activation='relu', name="layer1"))

            if hidden_name != "None":
                # Adding hidden layer with params from hidden_layers
                model.add(Dense(hidden_layer_size,  kernel_regularizer = hidden_reg(1/c_val), activation='relu', name="layer2"))
            # Adding Output Layer
            model.add(Dense(1, activation='sigmoid', name="layer3"))

            # evaluate() returns [loss, accuracy, precision, recall] in this
            # order, so index 3 below is the recall.
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', metrics.Precision(), metrics.Recall()])
            model.fit(X_train, y_train, epochs=epochs, batch_size=10)
            print("C Value: ", c_val)
            train_results.append(model.evaluate(X_train, y_train)[3])
            results = model.evaluate(X_val, y_val)
            val_results.append(results[3])
            # Choosing best model based on recall / Full Report
            if results[3] > best_recall and results[3] < 0.9999:
                best_recall = results[3]
                best_model = [name, epochs, c_val, results]
                # The output layer is a single sigmoid unit, so class labels
                # come from thresholding the probability at 0.5. argmax over
                # the one-column output would always return class 0.
                best_y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()

            # Saving recall in model state for current configuration
            model_state.append(results[3])
        # Adding trained model's state in scores dataframe
        scores.loc[len(scores.index)] = model_state
        # Saving Figure as an Image
        save_figure(C, train_results, val_results, name, figure_path)
del model

In [None]:
from sklearn import metrics as mt

# Annotated confusion-matrix heatmap for the best ANN configuration,
# evaluated on the validation set.
cm = mt.confusion_matrix(y_val, best_y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt=".3f",
    cmap='Blues_r',
)
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
# best_model[3] is the evaluation result list; its entry 3 is used as the
# recall shown in the title.
all_sample_title = f'Recall Score: {best_model[3][3]}'
plt.title(all_sample_title)
# Write the per-architecture scores table to a CSV file, without the index.
scores.to_csv("scores_output.csv", index=False)
print(best_model)