## Libraries

In [2]:
import os
import numpy as np
import pandas as pd

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

import xgboost

## Downloading

In [2]:
Data = pd.read_csv('../fraudTest.csv', index_col=0) # nrows=10000, 

## Exploratory Data Analysis

In [3]:
print(Data.columns)

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


In [4]:
display(pd.concat([Data.dtypes.to_frame('Type'),Data.nunique().to_frame('Count')], axis=1).sort_values('Type'))

Unnamed: 0,Type,Count
is_fraud,int64,2
cc_num,int64,924
unix_time,int64,544760
city_pop,int64,835
zip,int64,912
merch_lat,float64,546490
amt,float64,37256
long,float64,910
merch_long,float64,551770
lat,float64,910


In [5]:
display(Data.describe())

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [6]:
display(Data['is_fraud'].value_counts())

is_fraud
0    553574
1      2145
Name: count, dtype: int64

## Train/Test Spliting

In [None]:
X = Data.drop('is_fraud', axis = 1)
y = Data['is_fraud']

category_dict = {}
# Loop through each object-type column in X
for col in X.select_dtypes(include='object').columns:
    # Get the full set of all unique categories in this column
    category_dict[col] = pd.unique(X[col])

# Saving the dictionary to a file
with open('category_dict.pkl', 'wb') as file:
    pickle.dump(category_dict, file)
del(category_dict)

# Transform training data and save category mappings for categorical columns
X = X.apply(lambda col: col.astype('category') if col.dtype == 'object' else col, axis = 0)


# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

NameError: name 'Data' is not defined

In [8]:
print(category_dict['gender'])
print(category_dict['category'])

['M' 'F']
['personal_care' 'health_fitness' 'misc_pos' 'travel' 'kids_pets'
 'shopping_pos' 'food_dining' 'home' 'entertainment' 'shopping_net'
 'misc_net' 'grocery_pos' 'gas_transport' 'grocery_net']


## Training

In [9]:
xg_boost_model =  xgboost.XGBClassifier(enable_categorical=True, random_state=42, max_depth=8, min_child_weight=0.5, 
                                  n_estimators=100, gamma=3., reg_alpha=1., reg_lambda=1.)
xg_boost_model.fit(X_train, y_train)

y_pred = xg_boost_model.predict(X_test)

print("XG-Boost accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
print('---------------------')
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print('---------------------')
print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))

XG-Boost accuracy: 0.9985
---------------------
Confusion matrix:
[[110683     35]
 [   129    297]]
---------------------
Classification report:
              precision    recall  f1-score   support

           0     0.9988    0.9997    0.9993    110718
           1     0.8946    0.6972    0.7836       426

    accuracy                         0.9985    111144
   macro avg     0.9467    0.8484    0.8915    111144
weighted avg     0.9984    0.9985    0.9984    111144



## Saving the model

In [10]:
xg_boost_model.save_model('../models/xgb_model.json')
del(xg_boost_model)

## Loading and Using

In [11]:
xg_boost_model = xgboost.XGBClassifier()
xg_boost_model.load_model('../models/xgb_model.json')

In [12]:
# A few checks
y_ = y_test[y_test==0] 
X_ = X_test[y_test==0] 
print(confusion_matrix(y_, xg_boost_model.predict(X_)))
y_ = y_test[y_test==1] 
X_ = X_test[y_test==1] 
print(confusion_matrix(y_, xg_boost_model.predict(X_)))
del(y_, X_)

[[110683     35]
 [     0      0]]
[[  0   0]
 [129 297]]


In [13]:
# The first non-zero prediction
y_pred = xg_boost_model.predict(X)
i = np.argmax([y_pred == 1])
print(i)
print(y_pred[i])

1767
1


In [14]:
# The JSON file for a single data point
my_dict = X.iloc[i,:].to_dict()
print(my_dict)

{'trans_date_trans_time': '2020-06-21 22:32:22', 'cc_num': 6564459919350820, 'merchant': 'fraud_Rodriguez, Yost and Jenkins', 'category': 'misc_net', 'amt': 780.52, 'first': 'Douglas', 'last': 'Willis', 'gender': 'M', 'street': '619 Jeremy Garden Apt. 681', 'city': 'Benton', 'state': 'WI', 'zip': 53803, 'lat': 42.5545, 'long': -90.3508, 'city_pop': 1306, 'job': 'Public relations officer', 'dob': '1958-09-10', 'trans_num': 'ab4b379d2c0c9c667d46508d4e126d72', 'unix_time': 1371853942, 'merch_lat': 42.461127000000005, 'merch_long': -91.147148}


In [15]:
# One-row datarame with a single data point known to yield the 1-perdiction
df = pd.DataFrame([my_dict])
display(df)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long
0,2020-06-21 22:32:22,6564459919350820,"fraud_Rodriguez, Yost and Jenkins",misc_net,780.52,Douglas,Willis,M,619 Jeremy Garden Apt. 681,Benton,...,53803,42.5545,-90.3508,1306,Public relations officer,1958-09-10,ab4b379d2c0c9c667d46508d4e126d72,1371853942,42.461127,-91.147148


In [16]:
# The column types are different
print((df.dtypes == X.dtypes).all())

False


In [17]:
# Convert object columns to categorical columns
for col, categories in category_dict.items():
    if col in df.columns:
        # Convert the column in df to categorical using the categories from category_dict
        df[col] = pd.Categorical(df[col], categories=categories)

In [18]:
# The column types are the same
print((df.dtypes == X.dtypes).all())

True


In [19]:
print(xg_boost_model.predict(df)) # As expected

[1]
