## Libraries

In [1]:
import requests, json
import os
import numpy as np
import pandas as pd

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

import xgboost

### Find what columns are used in the current API request

In [2]:
url = 'https://real-time-payments-api.herokuapp.com/current-transactions'
headers = {'accept': 'application/json'}

# Send the GET request
response = requests.get(url, headers=headers)

# Check the response status and print the result
if response.status_code == 200:
    # what columns are there in an API request
    columns_in_api = json.loads(response.json())['columns']

In [3]:
print(columns_in_api)

['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'merch_lat', 'merch_long', 'is_fraud', 'current_time']


## Downloading

In [4]:
Data = pd.read_csv('../fraudTest.csv', index_col=0) # nrows=10000, 
print(Data.shape)

(555719, 22)


## Exploratory Data Analysis

In [5]:
print(sorted(Data.columns))

['amt', 'category', 'cc_num', 'city', 'city_pop', 'dob', 'first', 'gender', 'is_fraud', 'job', 'last', 'lat', 'long', 'merch_lat', 'merch_long', 'merchant', 'state', 'street', 'trans_date_trans_time', 'trans_num', 'unix_time', 'zip']


In [6]:
# Missing in the API
print(list(set(Data.columns) - set(columns_in_api)))
# Missing in the database
print(list(set(columns_in_api) - set(Data.columns)))
# Total in both
print(len(list(set(columns_in_api) & set(Data.columns))))
# is_fraud is in both
print('is_fraud' in (set(columns_in_api) & set(Data.columns)))

['trans_date_trans_time', 'unix_time']
['current_time']
20
True


In [7]:
Data = Data[list(set(columns_in_api) & set(Data.columns))]
print(Data.shape)

(555719, 20)


In [8]:
display(pd.concat([Data.dtypes.to_frame('Type'),Data.nunique().to_frame('Count')], axis=1).sort_values('Type'))

Unnamed: 0,Type,Count
is_fraud,int64,2
zip,int64,912
city_pop,int64,835
cc_num,int64,924
amt,float64,37256
merch_long,float64,551770
merch_lat,float64,546490
lat,float64,910
long,float64,910
dob,object,910


In [9]:
display(Data.describe())

Unnamed: 0,long,lat,cc_num,city_pop,amt,zip,is_fraud,merch_long,merch_lat
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,-90.231325,38.543253,4.178387e+17,88221.89,69.39281,48842.628015,0.00386,-90.23138,38.542798
std,13.72178,5.061336,1.309837e+18,300390.9,156.745941,26855.283328,0.062008,13.733071,5.095829
min,-165.6723,20.0271,60416210000.0,23.0,1.0,1257.0,0.0,-166.671575,19.027422
25%,-96.798,34.6689,180042900000000.0,741.0,9.63,26292.0,0.0,-96.905129,34.755302
50%,-87.4769,39.3716,3521417000000000.0,2408.0,47.29,48174.0,0.0,-87.445204,39.376593
75%,-80.1752,41.8948,4635331000000000.0,19685.0,83.01,72011.0,0.0,-80.264637,41.954163
max,-67.9503,65.6899,4.992346e+18,2906700.0,22768.11,99921.0,1.0,-66.952026,66.679297


In [11]:
display(Data['is_fraud'].value_counts())

is_fraud
0    553574
1      2145
Name: count, dtype: int64

## Train/Test Spliting

In [12]:
X = Data.drop('is_fraud', axis = 1)
y = Data['is_fraud']

# Transform training data and save category mappings for categorical columns
X = X.apply(lambda col: col.astype('category') if col.dtype == 'object' else col, axis = 0)

category_dict = {}
# Loop through each object-type column in X
for col in X.select_dtypes(include='category').columns:
    # Get the full set of all unique categories in this column
    category_dict[col] = X[col].cat.categories 

# Saving the dictionary to a file
with open('category_dict.pkl', 'wb') as file:
    pickle.dump(category_dict, file)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(444575, 19)
(111144, 19)


In [13]:
# Show a few examples and delete the dictionary from the notebook
print(category_dict['gender'])
print(category_dict['category'])
del(category_dict)

Index(['F', 'M'], dtype='object')
Index(['entertainment', 'food_dining', 'gas_transport', 'grocery_net',
       'grocery_pos', 'health_fitness', 'home', 'kids_pets', 'misc_net',
       'misc_pos', 'personal_care', 'shopping_net', 'shopping_pos', 'travel'],
      dtype='object')


## Training

In [None]:
# Convert the training data to DMatrix (for enable_categorical=True)
dtrain = xgboost.DMatrix(X_train, label=y_train, enable_categorical=True)
# Train the model using xgboost.Booster
params = {'random_state':42, 'max_depth':8, 'min_child_weight':0.5, 'gamma':3., 'reg_alpha':1., 'reg_lambda':1.}
xgb_model = xgboost.train(params, dtrain, num_boost_round=100)



# y_pred = xg_boost_model.predict(X_test)

# print("XG-Boost accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
# print('---------------------')
# print("Confusion matrix:")
# print(confusion_matrix(y_test, y_pred))
# print('---------------------')
# print("Classification report:")
# print(classification_report(y_test, y_pred, digits=4))

Parameters: { "n_estimators" } are not used.



## Saving the model

In [15]:
xg_boost_model.save_model('xgb_model.json')
del(xg_boost_model)

## Loading and Using

In [16]:
xg_boost_model = xgboost.XGBClassifier()
xg_boost_model.load_model('xgb_model.json')

In [17]:
# A few checks
y_ = y_test[y_test==0] 
X_ = X_test[y_test==0] 
print(confusion_matrix(y_, xg_boost_model.predict(X_)))
y_ = y_test[y_test==1] 
X_ = X_test[y_test==1] 
print(confusion_matrix(y_, xg_boost_model.predict(X_)))
del(y_, X_)

[[110678     40]
 [     0      0]]
[[  0   0]
 [157 269]]


In [18]:
# The first non-zero prediction
y_pred = xg_boost_model.predict(X)
i = np.argmax([y_pred == 1])
print(i)
print(y_pred[i])
print(xg_boost_model.predict_proba(X[i:i+1]))

1695
1
[[0.29483795 0.70516205]]


In [19]:
# The JSON file for a single data point

my_dict = X.iloc[i,:].to_dict()
print(my_dict)

{'cc_num': 4427805710168, 'amt': 890.22, 'zip': 88355, 'job': 'Licensed conveyancer', 'city_pop': 8874, 'long': -105.6933, 'street': '1742 Brandon Squares Apt. 461', 'lat': 33.3305, 'trans_num': '670e9fd071e3745d6d90b5ee5f29c64c', 'merch_long': -106.551531, 'first': 'Michelle', 'merchant': 'fraud_Lebsack and Sons', 'category': 'misc_net', 'city': 'Ruidoso', 'gender': 'F', 'dob': '2000-08-16', 'last': 'Rodriguez', 'merch_lat': 34.154798, 'state': 'NM'}


In [20]:
# One-row datarame with a single data point known to yield the 1-perdiction
df = pd.DataFrame([my_dict])
display(df)

Unnamed: 0,cc_num,amt,zip,job,city_pop,long,street,lat,trans_num,merch_long,first,merchant,category,city,gender,dob,last,merch_lat,state
0,4427805710168,890.22,88355,Licensed conveyancer,8874,-105.6933,1742 Brandon Squares Apt. 461,33.3305,670e9fd071e3745d6d90b5ee5f29c64c,-106.551531,Michelle,fraud_Lebsack and Sons,misc_net,Ruidoso,F,2000-08-16,Rodriguez,34.154798,NM


In [21]:
print((df == X.iloc[i:i+1, :].reset_index(drop=True)).all().all())

True


In [22]:
# The column types are different
print((df.dtypes.sort_index() == X.dtypes.sort_index()).all())

False


In [23]:
# Load the categorical dictionary
with open('category_dict.pkl', 'rb') as file:
    category_dict = pickle.load(file)

# Convert object columns to categorical columns
for col, categories in category_dict.items():
    if col in df.columns:
        # Convert the column in df to categorical using the categories from category_dict
        df[col] = pd.Categorical(df[col], categories=categories)

In [24]:
# The column types are the same
print((df.dtypes.sort_index() == X.dtypes.sort_index()).all())

True


In [25]:
print(xg_boost_model.predict(df)) # As expected

[0]


In [35]:
X_ = X.iloc[i:i+1, :].reset_index(drop=True)
print((df == X_).all().all())
print((df.columns == X_.columns).all())
print(df.index.equals(X_.index))

print(xg_boost_model.predict_proba(X_))
print(xg_boost_model.predict_proba(df[X_.columns]))
print(xg_boost_model.predict_proba(df))
print(xg_boost_model.predict_proba(X_.copy()))

True
True
True
[[0.29483795 0.70516205]]
[[0.9710381 0.0289619]]
[[0.9710381 0.0289619]]
[[0.29483795 0.70516205]]


In [36]:
xg_boost_model_ = xg_boost_model
print(xg_boost_model_.predict_proba(df))


[[0.9710381 0.0289619]]


In [30]:
X_.dtypes


cc_num           int64
amt            float64
zip              int64
job           category
city_pop         int64
long           float64
street        category
lat            float64
trans_num     category
merch_long     float64
first         category
merchant      category
category      category
city          category
gender        category
dob           category
last          category
merch_lat      float64
state         category
dtype: object