In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# IMPT: make shortcut of the BT4012 Project folder inside your google drive so the stuff below works
drive.mount("/content/drive")
trainData = pd.read_csv("/content/drive/My Drive/BT4012 Project/fraudTrain.csv")
testData = pd.read_csv("/content/drive/My Drive/BT4012 Project/fraudTest.csv")
# maybe can combine both sets of data then split on our own for more credits?
allData = pd.concat([trainData, testData])

Mounted at /content/drive


In [None]:
def age_group(row):
    rows = row["age"]
    if rows <20:
        return "less than 20"
    elif rows >=20 and rows<30:
        return "20 to 30"
    elif rows >=30 and rows<40:
        return "30 to 40"
    elif rows >=40 and rows<50:
        return "40 to 50"
    elif rows >=50 and rows<60:
        return "50 to 60"
    elif rows >=60 and rows<70:
        return "60 to 70"
    elif rows >=70 and rows<80:
        return "70 to 80"
    elif rows >=80 and rows<90:
        return "80 to 90"
    else:
        return "greater than 90"

In [None]:
from sklearn.model_selection import train_test_split

# print(allData.corr())

# age grp
allData["age"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).year-pd.DatetimeIndex(allData["dob"]).year
allData["age_group"] = allData.apply(age_group,axis=1)

# trans times / age
allData["trans_hour"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).hour
allData["trans_yrmth"] = pd.DatetimeIndex(allData['trans_date_trans_time']).strftime('%Y-%m')
allData["weekday"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).weekday
allData["date"] = pd.to_datetime(allData['trans_date_trans_time']).dt.date

# drop redundant index column
allData.drop(labels="Unnamed: 0", axis=1, inplace=True)

# calculating the distance between the merchant location and transaction location
allData["latitude_difference"] = abs(allData["lat"]-allData["merch_lat"])
allData["longitude_difference"] = abs(allData["long"]-allData["merch_long"])

# it is estimated that difference between each degree of longitude and lattitude is approximately 69 miles which is 110 km
# applying pythogoras theorem
allData["distance"] = np.sqrt(pow(allData["latitude_difference"]*110,2)+pow(allData["longitude_difference"]*110,2))
allData['distance'].describe()

allData.category = allData.category.astype("category")
allData.gender = allData.gender.astype("category")
allData.state = allData.state.astype("category")
allData.age_group = allData.age_group.astype("category")
allData.trans_yrmth = allData.trans_yrmth.astype("category")
allData.weekday = allData.weekday.astype("category")
allData.job = allData.job.astype("category")
allData.city = allData.city.astype("category")
allData.is_fraud = allData.is_fraud.astype("category")
allData.trans_hour = allData.trans_hour.astype("category")

#can ignore others, just use train, validation, test
train, test = train_test_split(allData, test_size=0.2, random_state=4012)
del(allData)

onlyFraud = train[train.is_fraud == 1]
noFraud = train[train.is_fraud != 1]

print(onlyFraud)

       trans_date_trans_time            cc_num  \
288598   2019-05-21 23:40:29  6011438889172900   
644699   2019-10-01 01:45:49   346208242862904   
366597   2019-06-19 01:11:01  6506116513503136   
621562   2019-09-21 17:41:09  3565943051129759   
10057    2019-01-07 04:34:48  3597980945245815   
...                      ...               ...   
205282   2019-04-15 03:05:13  3548710338694745   
389704   2020-11-21 03:17:38    30235268718158   
204807   2020-09-01 01:10:19  4003989662068504   
93343    2019-02-24 22:33:25   180036456789979   
351029   2019-06-14 00:17:50  6011948324228984   

                                  merchant       category     amt      first  \
288598                fraud_Abbott-Steuber  personal_care   20.17    Allison   
644699                   fraud_Collier Inc    grocery_net   10.54    Tabitha   
366597               fraud_Dooley-Thompson       misc_net  773.63   Kimberly   
621562                     fraud_Lynch Ltd   shopping_pos  905.66      David   

In [None]:
train.drop(["trans_date_trans_time", "first", "last", "unix_time", "street", "cc_num", "merchant", "age", "city_pop", "zip", "dob", "trans_num"], axis=1, inplace=True)
train.drop(["lat","long","merch_lat","merch_long","latitude_difference","longitude_difference", 'date'], axis=1, inplace=True)

test.drop(["trans_date_trans_time", "first", "last", "unix_time", "street", "cc_num", "merchant", "age", "city_pop", "zip", "dob", "trans_num"], axis=1, inplace=True)
test.drop(["lat","long","merch_lat","merch_long","latitude_difference","longitude_difference", 'date'], axis=1, inplace=True)

In [None]:
X_train = train.drop(['is_fraud'], axis=1)
y_train = train['is_fraud']

X_test = test.drop(['is_fraud'], axis=1)
y_test = test['is_fraud']

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['amt', 'distance']] = scaler.fit_transform(X_train[['amt', 'distance']])
X_test[['amt', 'distance']] = scaler.transform(X_test[['amt', 'distance']])
X_train

Unnamed: 0,category,amt,gender,city,state,job,age_group,trans_hour,trans_yrmth,weekday,distance
1089385,shopping_net,-0.383380,M,Pembroke,NC,Hospital doctor,60 to 70,7,2020-03,6,0.128891
396245,gas_transport,0.034714,F,Centerview,MO,Electronics engineer,30 to 40,1,2019-06,6,1.050038
816379,misc_net,2.015096,M,Oakland,TN,Furniture conservator/restorer,20 to 30,5,2019-12,6,-0.889407
476241,shopping_pos,0.166698,F,Lakeport,CA,Podiatrist,40 to 50,12,2020-12,0,0.763506
567642,kids_pets,-0.354002,M,San Antonio,TX,Exhibition designer,40 to 50,20,2019-08,3,0.458453
...,...,...,...,...,...,...,...,...,...,...,...
1286615,misc_pos,-0.397409,F,Sutherland,NE,Insurance broker,30 to 40,23,2020-06,1,-0.678208
926319,shopping_net,0.181167,F,Steuben,ME,Make,60 to 70,3,2020-01,3,1.222068
83164,misc_pos,-0.374762,F,Tulsa,OK,Bookseller,50 to 60,9,2020-07,0,-0.064892
301454,travel,-0.400114,F,Allentown,PA,"Scientist, research (maths)",20 to 30,23,2019-05,6,-1.963438


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.neural_network import MLPClassifier
import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [None]:
numeric_transformer = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="mean")),
    ("mean", StandardScaler()) # Standardization
])

categorical_transformer = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("numeric_transformer", numeric_transformer, make_column_selector(dtype_exclude="category")),
    ("categorical_transformer", categorical_transformer, make_column_selector(dtype_include="category"))
], remainder="passthrough")

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("logistic_regression", MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=300, activation='relu', random_state=4012))
])

time_start = time.perf_counter()

pipeline.fit(X_train, y_train)

training_time = time.perf_counter() - time_start
training_time

y_pred = pd.Series(pipeline.predict(X_test))
y_pred

# Track the performance of various models
performance_tracker = pd.DataFrame(columns=["Model Name", "Training Time", "Accuracy", "Recall", "Precision", "F1", "ROC AUC"])
performance_tracker.loc["Logistic Regression"] = ["Logistic Regression", training_time, accuracy_score(test.is_fraud, y_pred), recall_score(test.is_fraud, y_pred), precision_score(test.is_fraud, y_pred), f1_score(test.is_fraud, y_pred), roc_auc_score(test.is_fraud, y_pred)]
performance_tracker

Unnamed: 0,Model Name,Training Time,Accuracy,Recall,Precision,F1,ROC AUC
Logistic Regression,Logistic Regression,3123.473949,0.999015,0.87455,0.933589,0.903106,0.937111


In [None]:
numeric_transformer = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="mean")),
    ("mean", StandardScaler()) # Standardization
])

categorical_transformer = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("numeric_transformer", numeric_transformer, make_column_selector(dtype_exclude="category")),
    ("categorical_transformer", categorical_transformer, make_column_selector(dtype_include="category"))
], remainder="passthrough")

pipeline2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("logistic_regression", MLPClassifier(hidden_layer_sizes=(300, 300), max_iter=500, activation='relu', random_state=4012))
])

time_start = time.perf_counter()

pipeline2.fit(X_train, y_train)

training_time = time.perf_counter() - time_start
training_time

y_pred_2 = pd.Series(pipeline2.predict(X_test))
y_pred_2

# Track the performance of various models
performance_tracker = pd.DataFrame(columns=["Model Name", "Training Time", "Accuracy", "Recall", "Precision", "F1", "ROC AUC"])
performance_tracker.loc["Logistic Regression"] = ["Logistic Regression", training_time, accuracy_score(test.is_fraud, y_pred_2), recall_score(test.is_fraud, y_pred_2), precision_score(test.is_fraud, y_pred_2), f1_score(test.is_fraud, y_pred_2), roc_auc_score(test.is_fraud, y_pred_2)]
performance_tracker

Unnamed: 0,Model Name,Training Time,Accuracy,Recall,Precision,F1,ROC AUC
Logistic Regression,Logistic Regression,2759.522685,0.998796,0.89563,0.877582,0.886514,0.947485


In [None]:
pipeline3 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("logistic_regression", MLPClassifier(hidden_layer_sizes=(1000, 1000), max_iter=500, activation='relu', random_state=4012))
])

time_start = time.perf_counter()

pipeline3.fit(X_train, y_train)

training_time = time.perf_counter() - time_start
training_time

y_pred_3 = pd.Series(pipeline3.predict(X_test))
y_pred_3

# Track the performance of various models
performance_tracker = pd.DataFrame(columns=["Model Name", "Training Time", "Accuracy", "Recall", "Precision", "F1", "ROC AUC"])
performance_tracker.loc["Logistic Regression"] = ["Logistic Regression", training_time, accuracy_score(test.is_fraud, y_pred_3), recall_score(test.is_fraud, y_pred_3), precision_score(test.is_fraud, y_pred_3), f1_score(test.is_fraud, y_pred_3), roc_auc_score(test.is_fraud, y_pred_3)]
performance_tracker

Unnamed: 0,Model Name,Training Time,Accuracy,Recall,Precision,F1,ROC AUC
Logistic Regression,Logistic Regression,12575.085395,0.998858,0.899229,0.885121,0.892119,0.949306
