## Libraries

In [28]:
import requests, json
import os
import numpy as np
import pandas as pd

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

import xgboost

### Find what columns are used in the current API request

In [29]:
url = 'https://real-time-payments-api.herokuapp.com/current-transactions'
headers = {'accept': 'application/json'}

# Send the GET request
response = requests.get(url, headers=headers)

# Check the response status and print the result
if response.status_code == 200:
    # what columns are there in an API request
    columns_in_api = json.loads(response.json())['columns']

In [30]:
print(columns_in_api)

['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'merch_lat', 'merch_long', 'is_fraud', 'current_time']


## Downloading

In [31]:
Data = pd.read_csv('../fraudTest.csv', index_col=0) # nrows=10000, 
print(Data.shape)

(555719, 22)


## Exploratory Data Analysis

In [32]:
print(sorted(Data.columns))

['amt', 'category', 'cc_num', 'city', 'city_pop', 'dob', 'first', 'gender', 'is_fraud', 'job', 'last', 'lat', 'long', 'merch_lat', 'merch_long', 'merchant', 'state', 'street', 'trans_date_trans_time', 'trans_num', 'unix_time', 'zip']


In [33]:
# Missing in the API
print(list(set(Data.columns) - set(columns_in_api)))
# Missing in the database
print(list(set(columns_in_api) - set(Data.columns)))
# Total in both
print(len(list(set(columns_in_api) & set(Data.columns))))
# is_fraud is in both
print('is_fraud' in (set(columns_in_api) & set(Data.columns)))

['trans_date_trans_time', 'unix_time']
['current_time']
20
True


In [34]:
Data = Data[list(set(columns_in_api) & set(Data.columns))]
print(Data.shape)

(555719, 20)


In [35]:
# The list of columns appearing both in the API request and in the large training dataset
with open('expected_columns.json', 'w') as file:
    json.dump(list(Data.columns), file)

In [36]:
display(pd.concat([Data.dtypes.to_frame('Type'),Data.nunique().to_frame('Count')], axis=1).sort_values('Type'))

Unnamed: 0,Type,Count
zip,int64,912
is_fraud,int64,2
city_pop,int64,835
cc_num,int64,924
long,float64,910
lat,float64,910
merch_lat,float64,546490
merch_long,float64,551770
amt,float64,37256
job,object,478


In [37]:
display(Data.describe())

Unnamed: 0,is_fraud,city_pop,amt,cc_num,merch_lat,zip,lat,long,merch_long
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,0.00386,88221.89,69.39281,4.178387e+17,38.542798,48842.628015,38.543253,-90.231325,-90.23138
std,0.062008,300390.9,156.745941,1.309837e+18,5.095829,26855.283328,5.061336,13.72178,13.733071
min,0.0,23.0,1.0,60416210000.0,19.027422,1257.0,20.0271,-165.6723,-166.671575
25%,0.0,741.0,9.63,180042900000000.0,34.755302,26292.0,34.6689,-96.798,-96.905129
50%,0.0,2408.0,47.29,3521417000000000.0,39.376593,48174.0,39.3716,-87.4769,-87.445204
75%,0.0,19685.0,83.01,4635331000000000.0,41.954163,72011.0,41.8948,-80.1752,-80.264637
max,1.0,2906700.0,22768.11,4.992346e+18,66.679297,99921.0,65.6899,-67.9503,-66.952026


In [38]:
display(Data['is_fraud'].value_counts())

is_fraud
0    553574
1      2145
Name: count, dtype: int64

## Preprocessing

In [39]:
# In general some of the string fields (aka objects or categories) MAY no impact the ML modelling, but it seems to be true only trans_num
display(pd.concat([Data.dtypes.to_frame('Type'), Data.nunique().to_frame('Count')], axis=1).sort_values('Type'))

Unnamed: 0,Type,Count
zip,int64,912
is_fraud,int64,2
city_pop,int64,835
cc_num,int64,924
long,float64,910
lat,float64,910
merch_lat,float64,546490
merch_long,float64,551770
amt,float64,37256
job,object,478


## Train/Test Spliting

In [40]:
# We drop trans_num becuase it's unique has no impact on the model, and keep is_fraud  for the labels
X = Data.drop(['trans_num','is_fraud'], axis = 1)
y = Data['is_fraud']

# Transform training data and save category mappings for categorical columns
X = X.apply(lambda col: col.astype('category') if col.dtype == 'object' else col, axis = 0)

# The categories dictioanry used for the XGBoost predictions (sse the last section)
category_dict = {}
# Loop through each object-type column in X
for col in X.select_dtypes(include='category').columns:
    # Get the full set of all unique categories in this column
    category_dict[col] = X[col].cat.categories 

# Saving the dictionary to a file
with open('category_dict.pkl', 'wb') as file:
    pickle.dump(category_dict, file)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(444575, 18)
(111144, 18)


In [41]:
# Show a few examples and delete the dictionary from the notebook
print(category_dict['gender'])
print(category_dict['category'])
del(category_dict)

Index(['F', 'M'], dtype='object')
Index(['entertainment', 'food_dining', 'gas_transport', 'grocery_net',
       'grocery_pos', 'health_fitness', 'home', 'kids_pets', 'misc_net',
       'misc_pos', 'personal_care', 'shopping_net', 'shopping_pos', 'travel'],
      dtype='object')


## Training

In [42]:
# Convert the training data to DMatrix (for enable_categorical=True)
dtrain = xgboost.DMatrix(X_train, label=y_train, enable_categorical=True)
# Train the model using xgboost.Booster
params = {'random_state':42, 'max_depth':8, 'min_child_weight':0.5, 'gamma':3., 'reg_alpha':1., 'reg_lambda':1.}
xgboost_model = xgboost.train(params, dtrain, num_boost_round=100)


## Saving the model

In [43]:
xgboost_model.save_model('xgboost_model.json')
del(xgboost_model)

## Loading and Using

In [44]:
xgboost_model = xgboost.XGBClassifier()
xgboost_model.load_model('xgboost_model.json')

## Testing

In [45]:
# Calculate the predictions
y_pred = xgboost_model.predict(X_test)
# Show the test dataset fit
print("XG-Boost accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
print('---------------------')
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print('---------------------')
print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))

XG-Boost accuracy: 0.9976
---------------------
Confusion matrix:
[[110685     33]
 [   231    195]]
---------------------
Classification report:
              precision    recall  f1-score   support

           0     0.9979    0.9997    0.9988    110718
           1     0.8553    0.4577    0.5963       426

    accuracy                         0.9976    111144
   macro avg     0.9266    0.7287    0.7976    111144
weighted avg     0.9974    0.9976    0.9973    111144



In [46]:
# A few checks: the confusion matrix split in two 
y_ = y_test[y_test==0] 
X_ = X_test[y_test==0] 
print(confusion_matrix(y_, xgboost_model.predict(X_)))
y_ = y_test[y_test==1] 
X_ = X_test[y_test==1] 
print(confusion_matrix(y_, xgboost_model.predict(X_)))
del(y_, X_)

[[110685     33]
 [     0      0]]
[[  0   0]
 [231 195]]


## How to get a prediction for a datapoint in the json format

In [47]:
# The first non-zero prediction will be used for the procedure
y_pred = xgboost_model.predict(X)
i = np.argmax([y_pred == 1])
print(i)
print(y_pred[i])
print(xgboost_model.predict_proba(X[i:i+1]))

951
1
[[0.4786266 0.5213734]]


In [48]:
# The JSON file for a single data point
my_dict = X.iloc[i,:].to_dict()
print(my_dict)

{'state': 'MD', 'street': '35822 Clayton Street Apt. 679', 'city_pop': 5927, 'amt': 3204.98, 'job': 'Art therapist', 'cc_num': 4292743669224718067, 'city': 'Great Mills', 'merch_lat': 37.480372, 'zip': 20634, 'merchant': 'fraud_Haley, Jewess and Bechtelar', 'lat': 38.2674, 'dob': '1973-06-09', 'long': -76.4954, 'first': 'Michael', 'category': 'shopping_pos', 'last': 'Williams', 'gender': 'M', 'merch_long': -77.34958}


In [49]:
# One-row datarame with a single data point known to yield the 1-perdiction
df = pd.DataFrame([my_dict])
display(df)

Unnamed: 0,state,street,city_pop,amt,job,cc_num,city,merch_lat,zip,merchant,lat,dob,long,first,category,last,gender,merch_long
0,MD,35822 Clayton Street Apt. 679,5927,3204.98,Art therapist,4292743669224718067,Great Mills,37.480372,20634,"fraud_Haley, Jewess and Bechtelar",38.2674,1973-06-09,-76.4954,Michael,shopping_pos,Williams,M,-77.34958


In [50]:
# The values are the same, but ....
print((df == X.iloc[i:i+1, :].reset_index(drop=True)).all().all())

True


In [51]:
# ... the column types are different
print((df.dtypes.sort_index() == X.dtypes.sort_index()).all())

False


In [52]:
# Load the categorical dictionary
with open('category_dict.pkl', 'rb') as file:
    category_dict = pickle.load(file)

# Convert object columns to categorical columns
for col, categories in category_dict.items():
    if col in df.columns:
        # Convert the column in df to categorical using the categories from category_dict
        df[col] = pd.Categorical(df[col], categories=categories)

In [53]:
# The column types are the same now!
print((df.dtypes.sort_index() == X.dtypes.sort_index()).all())

True


In [54]:
print(xgboost_model.predict(df)) # As expected

[1]
