In [2]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# IMPT: make shortcut of the BT4012 Project folder inside your google drive so the stuff below works
drive.mount("/content/drive")
trainData = pd.read_csv("/content/drive/My Drive/BT4012 Project/fraudTrain.csv")
testData = pd.read_csv("/content/drive/My Drive/BT4012 Project/fraudTest.csv")
allData = pd.concat([trainData, testData])

#Data Pre-processing

In [3]:
def age_group(row):
    rows = row["age"]
    if rows <20:
        return "less than 20"
    elif rows >=20 and rows<30:
        return "20 to 30"
    elif rows >=30 and rows<40:
        return "30 to 40"
    elif rows >=40 and rows<50:
        return "40 to 50"
    elif rows >=50 and rows<60:
        return "50 to 60"
    elif rows >=60 and rows<70:
        return "60 to 70"
    elif rows >=70 and rows<80:
        return "70 to 80"
    elif rows >=80 and rows<90:
        return "80 to 90"
    else:
        return "greater than 90"

In [4]:
# Check null
count_na = allData.isna().sum().sum() 
print('Count of Na: ' + str(count_na)) # No null values

Count of Na: 0


In [5]:
# Check duplicates
is_false = allData.duplicated().any()
print('Any duplicates?: ' + str(is_false)) # No duplicates so no need drop

Any duplicates?: False


In [6]:
from sklearn.model_selection import train_test_split

# print(allData.corr())

# age grp
allData["age"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).year-pd.DatetimeIndex(allData["dob"]).year
allData["age_group"] = allData.apply(age_group,axis=1)

# trans times / age
allData["trans_hour"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).hour
allData["trans_yrmth"] = pd.DatetimeIndex(allData['trans_date_trans_time']).strftime('%Y-%m')
allData["weekday"] = pd.DatetimeIndex(allData["trans_date_trans_time"]).weekday
allData["date"] = pd.to_datetime(allData['trans_date_trans_time']).dt.date

# drop redundant index column
allData.drop(labels="Unnamed: 0", axis=1, inplace=True)

# calculating the distance between the merchant location and transaction location
allData["latitude_difference"] = abs(allData["lat"]-allData["merch_lat"])
allData["longitude_difference"] = abs(allData["long"]-allData["merch_long"])

# it is estimated that difference between each degree of longitude and lattitude is approximately 69 miles which is 110 km
# applying pythogoras theorem
allData["distance"] = np.sqrt(pow(allData["latitude_difference"]*110,2)+pow(allData["longitude_difference"]*110,2))
allData['distance'].describe()

allData.category = allData.category.astype("category")
allData.gender = allData.gender.astype("category")
allData.state = allData.state.astype("category")
allData.age_group = allData.age_group.astype("category")
allData.trans_yrmth = allData.trans_yrmth.astype("category")
allData.weekday = allData.weekday.astype("category")
allData.job = allData.job.astype("category")
allData.city = allData.city.astype("category")
allData.is_fraud = allData.is_fraud.astype("category")
allData.trans_hour = allData.trans_hour.astype("category")

#can ignore others, just use train, validation, test
train, others = train_test_split(allData, test_size=0.2, random_state=4012)
del(allData)
validation, test = train_test_split(others, test_size=0.5, random_state=4012)
del(others)

onlyFraud = train[train.is_fraud == 1]
noFraud = train[train.is_fraud != 1]

print(onlyFraud)

       trans_date_trans_time            cc_num  \
288598   2019-05-21 23:40:29  6011438889172900   
644699   2019-10-01 01:45:49   346208242862904   
366597   2019-06-19 01:11:01  6506116513503136   
621562   2019-09-21 17:41:09  3565943051129759   
10057    2019-01-07 04:34:48  3597980945245815   
...                      ...               ...   
205282   2019-04-15 03:05:13  3548710338694745   
389704   2020-11-21 03:17:38    30235268718158   
204807   2020-09-01 01:10:19  4003989662068504   
93343    2019-02-24 22:33:25   180036456789979   
351029   2019-06-14 00:17:50  6011948324228984   

                                  merchant       category     amt      first  \
288598                fraud_Abbott-Steuber  personal_care   20.17    Allison   
644699                   fraud_Collier Inc    grocery_net   10.54    Tabitha   
366597               fraud_Dooley-Thompson       misc_net  773.63   Kimberly   
621562                     fraud_Lynch Ltd   shopping_pos  905.66      David   

#Inspecting Data

In [7]:
train.drop(["trans_date_trans_time", "first", "last", "unix_time", "street", "cc_num", "merchant", "age", "city_pop", "zip", "dob", "trans_num"], axis=1, inplace=True)
train.drop(["lat","long","merch_lat","merch_long","latitude_difference","longitude_difference", 'date'], axis=1, inplace=True)

validation.drop(["trans_date_trans_time", "first", "last", "unix_time", "street", "cc_num", "merchant", "age", "city_pop", "zip", "dob", "trans_num"], axis=1, inplace=True)
validation.drop(["lat","long","merch_lat","merch_long","latitude_difference","longitude_difference", 'date'], axis=1, inplace=True)

test.drop(["trans_date_trans_time", "first", "last", "unix_time", "street", "cc_num", "merchant", "age", "city_pop", "zip", "dob", "trans_num"], axis=1, inplace=True)
test.drop(["lat","long","merch_lat","merch_long","latitude_difference","longitude_difference", 'date'], axis=1, inplace=True)

train

Unnamed: 0,category,amt,gender,city,state,job,is_fraud,age_group,trans_hour,trans_yrmth,weekday,distance
1089385,shopping_net,9.11,M,Pembroke,NC,Hospital doctor,0,60 to 70,7,2020-03,6,88.270416
396245,gas_transport,75.57,F,Centerview,MO,Electronics engineer,0,30 to 40,1,2019-06,6,117.115063
816379,misc_net,390.37,M,Oakland,TN,Furniture conservator/restorer,0,20 to 30,5,2019-12,6,56.383604
476241,shopping_pos,96.55,F,Lakeport,CA,Podiatrist,0,40 to 50,12,2020-12,0,108.142655
567642,kids_pets,13.78,M,San Antonio,TX,Exhibition designer,0,40 to 50,20,2019-08,3,98.590270
...,...,...,...,...,...,...,...,...,...,...,...,...
1286615,misc_pos,6.88,F,Sutherland,NE,Insurance broker,0,30 to 40,23,2020-06,1,62.997041
926319,shopping_net,98.85,F,Steuben,ME,Make,0,60 to 70,3,2020-01,3,122.501992
83164,misc_pos,10.48,F,Tulsa,OK,Bookseller,0,50 to 60,9,2020-07,0,82.202315
301454,travel,6.45,F,Allentown,PA,"Scientist, research (maths)",0,20 to 30,23,2019-05,6,22.751546


# Models

In [8]:
fullData = pd.concat([train, test])
fullData.shape

(1667155, 12)

In [9]:
catVars = ['category', 'gender', 'city', 'state', 'job', 'age_group', 'trans_yrmth']

In [10]:
#city and job too many variables prob need to reduce it somehow by grping or smthing
for x in fullData[catVars].columns:
  n = fullData[x].nunique()
  print(f'{x} has {n} unique values')

category has 14 unique values
gender has 2 unique values
city has 906 unique values
state has 51 unique values
job has 497 unique values
age_group has 9 unique values
trans_yrmth has 24 unique values


In [11]:
fullData.drop(columns = {'city', 'job'}, inplace = True)

In [12]:
catVars2 = ['category', 'gender', 'state', 'age_group', 'trans_yrmth']

In [13]:
dfEncode = pd.get_dummies(fullData, columns = catVars2)

In [14]:
dfEncode

Unnamed: 0,amt,is_fraud,trans_hour,weekday,distance,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,...,trans_yrmth_2020-03,trans_yrmth_2020-04,trans_yrmth_2020-05,trans_yrmth_2020-06,trans_yrmth_2020-07,trans_yrmth_2020-08,trans_yrmth_2020-09,trans_yrmth_2020-10,trans_yrmth_2020-11,trans_yrmth_2020-12
1089385,9.11,0,7,6,88.270416,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
396245,75.57,0,1,6,117.115063,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
816379,390.37,0,5,6,56.383604,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
476241,96.55,0,12,0,108.142655,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
567642,13.78,0,20,3,98.590270,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642346,4.08,0,7,0,64.165768,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364564,24.91,0,18,0,63.668794,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
62945,129.96,0,15,2,46.440187,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337492,211.84,0,15,2,49.819146,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
xCols = dfEncode.columns.drop('is_fraud')
xCols

Index(['amt', 'trans_hour', 'weekday', 'distance', 'category_entertainment',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness',
       ...
       'trans_yrmth_2020-03', 'trans_yrmth_2020-04', 'trans_yrmth_2020-05',
       'trans_yrmth_2020-06', 'trans_yrmth_2020-07', 'trans_yrmth_2020-08',
       'trans_yrmth_2020-09', 'trans_yrmth_2020-10', 'trans_yrmth_2020-11',
       'trans_yrmth_2020-12'],
      dtype='object', length=104)

In [16]:
X = dfEncode[xCols]
X

Unnamed: 0,amt,trans_hour,weekday,distance,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,...,trans_yrmth_2020-03,trans_yrmth_2020-04,trans_yrmth_2020-05,trans_yrmth_2020-06,trans_yrmth_2020-07,trans_yrmth_2020-08,trans_yrmth_2020-09,trans_yrmth_2020-10,trans_yrmth_2020-11,trans_yrmth_2020-12
1089385,9.11,7,6,88.270416,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
396245,75.57,1,6,117.115063,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816379,390.37,5,6,56.383604,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
476241,96.55,12,0,108.142655,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
567642,13.78,20,3,98.590270,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642346,4.08,7,0,64.165768,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364564,24.91,18,0,63.668794,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
62945,129.96,15,2,46.440187,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337492,211.84,15,2,49.819146,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
y = dfEncode['is_fraud']
y

1089385    0
396245     0
816379     0
476241     0
567642     0
          ..
642346     0
364564     0
62945      0
337492     0
183963     0
Name: is_fraud, Length: 1667155, dtype: category
Categories (2, int64): [0, 1]

In [18]:
X_train, X_other, Y_train, Y_other = train_test_split(X, y, test_size=0.2, random_state=4012)
X_validation, X_test, Y_validation, Y_test = train_test_split(X_other, Y_other, test_size=0.5, random_state=4012)

In [19]:
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

## SVM

In [20]:
from sklearn.preprocessing import MinMaxScaler
X_train_trimmed, X_not_used, Y_train_trimmed, Y_not_used = train_test_split(X, y, test_size=0.9, random_state=4012)

scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train_trimmed)
X_train_trimmed = scaling.transform(X_train_trimmed)

In [21]:
from imblearn.over_sampling import SMOTE

# oversampling using SMOTE which will create synthetic samples from the minor class
os = SMOTE(random_state=0)
balanced_X_train_trimmed, balanced_Y_train_trimmed = os.fit_resample(X_train_trimmed, Y_train_trimmed)
columns = X.columns
balanced_X_train_trimmed = pd.DataFrame(data=balanced_X_train_trimmed,columns=columns)
balanced_Y_train_trimmed = pd.DataFrame(data=balanced_Y_train_trimmed,columns=['is_fraud'])


print("length of oversampled data is ",len(balanced_X_train_trimmed))
print("Number of no subscription in oversampled data",len(balanced_Y_train_trimmed[balanced_Y_train_trimmed['is_fraud']==0]))
print("Number of subscription",len(balanced_Y_train_trimmed[balanced_Y_train_trimmed['is_fraud']==1]))
print("Proportion of no subscription data in oversampled data is ",len(balanced_Y_train_trimmed[balanced_Y_train_trimmed['is_fraud']==0])/len(balanced_X_train_trimmed))
print("Proportion of subscription data in oversampled data is ",len(balanced_Y_train_trimmed[balanced_Y_train_trimmed['is_fraud']==1])/len(balanced_X_train_trimmed))

length of oversampled data is  331686
Number of no subscription in oversampled data 165843
Number of subscription 165843
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import time


X_validation = scaling.transform(X_validation)
X_test = scaling.transform(X_test)

classifier = SVC(kernel='rbf', class_weight="balanced", random_state=4012, C=0.05, gamma=1)

time_start = time.perf_counter()

classifier.fit(balanced_X_train_trimmed, balanced_Y_train_trimmed)

training_time = time.perf_counter() - time_start
training_time

y_pred = classifier.predict(X_validation)
y_pred

# Track the performance
performance_tracker = pd.DataFrame(columns=["Model Name", "Training Time", "Accuracy", "Recall", "Precision", "F1", "ROC AUC"])
performance_tracker.loc["SVC rbf"] = ["SVC rbf", training_time, accuracy_score(Y_validation, y_pred), recall_score(Y_validation, y_pred), precision_score(Y_validation, y_pred), f1_score(Y_validation, y_pred), roc_auc_score(Y_validation, y_pred)]
performance_tracker

  y = column_or_1d(y, warn=True)


Unnamed: 0,Model Name,Training Time,Accuracy,Recall,Precision,F1,ROC AUC
SVC rbf,SVC rbf,91614.846633,0.993438,0.093349,0.201511,0.127592,0.545719


In [23]:
classifier = SVC(kernel='linear', class_weight="balanced", random_state=4012, C=0.05, gamma=1)

time_start = time.perf_counter()

classifier.fit(balanced_X_train_trimmed, balanced_Y_train_trimmed)

training_time = time.perf_counter() - time_start
training_time

y_pred = classifier.predict(X_validation)
y_pred

# Track the performance
performance_tracker.loc["SVC linear"] = ["SVC linear", training_time, accuracy_score(Y_validation, y_pred), recall_score(Y_validation, y_pred), precision_score(Y_validation, y_pred), f1_score(Y_validation, y_pred), roc_auc_score(Y_validation, y_pred)]
performance_tracker

  y = column_or_1d(y, warn=True)


Unnamed: 0,Model Name,Training Time,Accuracy,Recall,Precision,F1,ROC AUC
SVC rbf,SVC rbf,91614.846633,0.993438,0.093349,0.201511,0.127592,0.545719
SVC linear,SVC linear,12796.624279,0.907465,0.729288,0.039507,0.074954,0.818837


In [24]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(Y_validation,y_pred))
print("True Negative      False Positive" + "\n" + "False Negative     True Positive")
print('\n')
print(classification_report(Y_validation,y_pred))

[[150663  15195]
 [   232    625]]
True Negative      False Positive
False Negative     True Positive


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    165858
           1       0.04      0.73      0.07       857

    accuracy                           0.91    166715
   macro avg       0.52      0.82      0.51    166715
weighted avg       0.99      0.91      0.95    166715

