# Fraud Detection Using Binary Classification

In [1]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import json

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Import Data
# NOTE(review): relative path — presumably expects Google Drive to be mounted
# under the current working directory (e.g. Colab); confirm before running elsewhere
df = pd.read_csv('drive/My Drive/zz_fraud_detection/fraud_data.csv')

In [3]:
# Preview the first three rows to check that the columns loaded as expected
df.head(3)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0


In [4]:
df.isna().sum() # Count missing values per column; every count below is 0, so no imputation is needed

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

# Set up data for ML

In [5]:
# Build the modelling frame without the account-name columns.
# DataFrame.drop returns a new frame, so the raw `df` stays untouched and
# re-running this cell always produces the same result (no in-place mutation).
df_temp = df.drop(columns=['nameOrig', 'nameDest'])
df_temp.head(5)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [6]:
# Turn categorical variables into numbers
for label, content in df_temp.items():
    # Numeric columns are already usable by the models; leave them as they are
    if pd.api.types.is_numeric_dtype(content):
        continue
    # pd.Categorical codes start at 0 and use -1 for missing values, so after
    # the +1 shift real categories start at 1 and a missing value becomes 0
    df_temp[label] = pd.Categorical(content).codes + 1

In [7]:
print(df_temp.head(5)) # Check the encoded data: 'type' should now hold integer codes
print(df_temp['isFlaggedFraud'].unique()) # Make sure both flag values (0 and 1) occur

   step  type    amount  ...  newbalanceDest  isFraud  isFlaggedFraud
0     1     4   9839.64  ...             0.0        0               0
1     1     4   1864.28  ...             0.0        0               0
2     1     5    181.00  ...             0.0        1               0
3     1     2    181.00  ...             0.0        1               0
4     1     4  11668.14  ...             0.0        0               0

[5 rows x 9 columns]
[0 1]


In [8]:
# Separate the label the models should predict from the feature columns
y = df_temp['isFraud']
X = df_temp.drop(columns='isFraud')

In [9]:
# Sanity-check the feature/target separation: X should no longer contain isFraud
print(X.head(3))
print(y.head(3))

   step  type   amount  ...  oldbalanceDest  newbalanceDest  isFlaggedFraud
0     1     4  9839.64  ...             0.0             0.0               0
1     1     4  1864.28  ...             0.0             0.0               0
2     1     5   181.00  ...             0.0             0.0               0

[3 rows x 8 columns]
0    0
1    0
2    1
Name: isFraud, dtype: int64


In [10]:
# Split data into training and testing sets (80% train / 20% test).
# random_state pins the shuffle so the split is identical on every run;
# stratify=y keeps the isFraud class ratio the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# Train Binary Classification Model
* It is not yet clear which model will perform best, so three different types are compared:
  * KNN, Logistic Regression, and Random Forest

In [11]:
# Candidate classifiers keyed by display name; each is created with its
# scikit-learn default hyperparameters
model_types = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
}

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train each model on the training split and score it on the test split.

    models = dict mapping a display name to a model object exposing
             fit(X, y) and score(X, y) (e.g. Scikit-Learn estimators)
    X_train = training data
    X_test = testing data
    y_train = targets for training
    y_test = targets for test data

    Returns a dict mapping each model name to the value returned by its
    score() call on the test data. The models passed in are fitted in place.
    """
    # Reset NumPy's global RNG so repeated calls start from the same state
    np.random.seed(1)

    def _train_then_evaluate(estimator):
        # Fit first, then evaluate, one model at a time
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {title: _train_then_evaluate(estimator)
            for title, estimator in models.items()}

In [12]:
# Model runtime: ~ (not recorded). Uncomment the lines below to fit and score all models.
# model_scores = fit_and_score(model_types, X_train, X_test, y_train, y_test) 
# model_scores

In [13]:
# os.chdir('drive/My Drive/zz_fraud_detection')
# with open('model_scores.json', 'w') as json_file:
#   json.dump(model_scores, json_file)

# Test Best Model

In [14]:
# TODO: Complete model evaluation
# TODO: Adjust best model