In [1]:
# Mount my Google Drive to get the data
from google.colab import drive
drive.mount('/content/drive')

import zipfile
import os
import pandas as pd
import numpy as np
import math
import tensorflow as tf

Mounted at /content/drive


In [None]:
# Unzips the file and extracts data
#!unzip '/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/card_transaction.v1.zip' -d '/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/card_transaction_dataset'

In [2]:
# Read and load the csv file
df = pd.read_csv('/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/card_transaction_dataset/card_transaction.v1.csv')
df = df[0:16500000]
print(df)

          User  Card  Year  Month  Day   Time   Amount            Use Chip  \
0            0     0  2002      9    1  06:21  $134.09   Swipe Transaction   
1            0     0  2002      9    1  06:42   $38.48   Swipe Transaction   
2            0     0  2002      9    2  06:22  $120.34   Swipe Transaction   
3            0     0  2002      9    2  17:45  $128.95   Swipe Transaction   
4            0     0  2002      9    3  06:23  $104.71   Swipe Transaction   
...        ...   ...   ...    ...  ...    ...      ...                 ...   
16499995  1338     1  2004      8   21  16:55   $38.26  Online Transaction   
16499996  1338     1  2004      8   22  10:49   $17.28   Swipe Transaction   
16499997  1338     1  2004      8   22  11:55   $40.29  Online Transaction   
16499998  1338     1  2004      8   23  11:50   $37.01  Online Transaction   
16499999  1338     1  2004      8   23  16:40   $27.14  Online Transaction   

                Merchant Name  Merchant City Merchant State    

In [3]:
# Set sequence length for multivariate time series
seq_length = 7

df['Merchant Name'] = df['Merchant Name'].astype(str)
df.sort_values(by=['User','Card'], inplace=True)
df.reset_index(inplace=True, drop=True)
print (df.info())

# Get first of each User-Card combination
first = df[['User','Card']].drop_duplicates()
f = np.array(first.index)

# Drop the first N transactions
drop_list = np.concatenate([np.arange(x,x + seq_length - 1) for x in f])
index_list = np.setdiff1d(df.index.values,drop_list)

# Split into 0.5 train, 0.3 validate, 0.2 test
tot_length = index_list.shape[0]
train_length = tot_length // 2
validate_length = (tot_length - train_length) * 3 // 5
test_length = tot_length - train_length - validate_length
print (tot_length,train_length,validate_length, test_length)

# Generate list of indices for train, validate, test
np.random.seed(1111)
train_indices = np.random.choice(index_list, train_length, replace=False)
tv_list = np.setdiff1d(index_list, train_indices)
validate_indices = np.random.choice(tv_list, validate_length, replace=False)
test_indices = np.setdiff1d(tv_list, validate_indices)
print(train_indices, validate_indices, test_indices)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16500000 entries, 0 to 16499999
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   object 
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(6), object(8)
memory usage: 1.8+ GB
None
16475254 8237627 4942576 3295051
[10564891  4293011 12990080 ...  7946973  8363815  6839602] [ 5898230  4785713  4951019 ...  3062006  6616067 11640215] [       6        8       27 ... 16499980 16499986 16499993]


In [4]:
# ----- CUSTOM MAPPING FUNCTIONS -----
def timeEncoder(X):
    X_hm = X['Time'].str.split(':', expand=True)
    d = pd.to_datetime(dict(year=X['Year'],month=X['Month'],day=X['Day'],hour=X_hm[0],minute=X_hm[1])).astype(int)
    return pd.DataFrame(d)

def amtEncoder(X):
    amt = X.apply(lambda x: x[1:]).astype(float).map(lambda amt: max(1,amt)).map(math.log)
    return pd.DataFrame(amt)

def decimalEncoder(X,length=5):
    dnew = pd.DataFrame()
    for i in range(length):
        dnew[i] = np.mod(X,10) 
        X = np.floor_divide(X,10)
    return dnew

def fraudEncoder(X):
    return np.where(X == 'Yes', 1, 0).astype(int)

In [5]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer

mapper = DataFrameMapper([('Is Fraud?', FunctionTransformer(fraudEncoder)),
                          (['Merchant State'], [SimpleImputer(strategy='constant'), FunctionTransformer(np.ravel),
                                               LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          (['Zip'], [SimpleImputer(strategy='constant'), FunctionTransformer(np.ravel),
                                     FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('Merchant Name', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('Merchant City', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('MCC', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          (['Use Chip'], [SimpleImputer(strategy='constant'), LabelBinarizer()]),
                          (['Errors?'], [SimpleImputer(strategy='constant'), LabelBinarizer()]),
                          (['Year','Month','Day','Time'], [FunctionTransformer(timeEncoder), MinMaxScaler()]),
                          ('Amount', [FunctionTransformer(amtEncoder), MinMaxScaler()])
                         ], input_df=True, df_out=True)
mapper.fit(df)

In [None]:
import joblib
mapper = joblib.load(open('/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/fitted_mapper.pkl','rb'))

In [None]:
mapped_sample = mapper.transform(df[:100])
mapped_size = mapped_sample.shape[-1]
print(mapped_size)

220


In [None]:
def gen_training_batch(df, mapper, index_list, batch_size):
    np.random.seed(98765)
    train_df = df.loc[index_list]
    non_fraud_indices = train_df[train_df['Is Fraud?'] == 'No'].index.values
    fraud_indices = train_df[train_df['Is Fraud?'] == 'Yes'].index.values
    fsize = fraud_indices.shape[0]
    while True:
        indices = np.concatenate((fraud_indices,np.random.choice(non_fraud_indices,fsize,replace=False)))
        np.random.shuffle(indices)
        rows = indices.shape[0]
        index_array = np.zeros((rows, seq_length), dtype=int)
        for i in range(seq_length):
            index_array[:,i] = indices + 1 - seq_length + i
        full_df = mapper.transform(df.loc[index_array.flatten()])
        target_buffer = full_df['Is Fraud?'].to_numpy().reshape(rows, seq_length, 1)
        data_buffer = full_df.drop(['Is Fraud?'],axis=1).to_numpy().reshape(rows, seq_length, -1)

        batch_ptr = 0
        while (batch_ptr + batch_size) <= rows:
            data = data_buffer[batch_ptr:batch_ptr+batch_size]
            targets = target_buffer[batch_ptr:batch_ptr+batch_size]
            batch_ptr += batch_size
            data_t = np.transpose(data, axes=(1,0,2))
            targets_t = np.transpose(targets, axes=(1,0,2))
            yield data_t,targets_t

In [None]:
class TP(tf.keras.metrics.TruePositives):
    def update_state(self, y_true, y_pred, sample_weight=None):
        super().update_state(y_true[-1,:,:], y_pred[-1,:,:], sample_weight)

class FP(tf.keras.metrics.FalsePositives):
    def update_state(self, y_true, y_pred, sample_weight=None):
        super().update_state(y_true[-1,:,:], y_pred[-1,:,:], sample_weight)

class FN(tf.keras.metrics.FalseNegatives):
    def update_state(self, y_true, y_pred, sample_weight=None):
        super().update_state(y_true[-1,:,:], y_pred[-1,:,:], sample_weight)

class TN(tf.keras.metrics.TrueNegatives):
    def update_state(self, y_true, y_pred, sample_weight=None):
        super().update_state(y_true[-1,:,:], y_pred[-1,:,:], sample_weight)

In [None]:
units = [200,200]
input_size = mapped_size - 1
output_size = 1

batch_size = 16
tf_input = ([batch_size, input_size])

lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(units[0], input_shape=tf_input, batch_size=7, time_major=True, return_sequences=True),
    #Trying to set return_sequences to false here
    tf.keras.layers.LSTM(units[1], return_sequences=True, time_major=True),
    tf.keras.layers.Dense(output_size, activation='sigmoid')
])

lstm_model.summary()
tf.keras.utils.plot_model(lstm_model, '/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/model.png', show_shapes=True)

metrics=['accuracy', 
    TP(name='TP'),
    FP(name='FP'),
    FN(name='FN'),
    TN(name='TN'),
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.TrueNegatives(name='tn')
   ]

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (7, 16, 200)              336000    
                                                                 
 lstm_9 (LSTM)               (7, 16, 200)              320800    
                                                                 
 dense_4 (Dense)             (7, 16, 1)                201       
                                                                 
Total params: 657,001
Trainable params: 657,001
Non-trainable params: 0
_________________________________________________________________


In [None]:
steps_per_epoch = 10000
checkpoint_dir = "/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/checkpoints/"
filepath = checkpoint_dir + "iter-{epoch:02d}/model.ckpt"
batch_size = 16

In [None]:
import warnings
warnings.filterwarnings("ignore")

print ("Learning...")
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, save_weights_only=True, verbose=1)
train_generate = gen_training_batch(df,mapper,train_indices,batch_size)
lstm_model.fit(train_generate, epochs=5, steps_per_epoch=steps_per_epoch, verbose=1, callbacks=[cp_callback])

Learning...
Epoch 1/5
Epoch 1: saving model to /content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/checkpoints/iter-01/model.ckpt
Epoch 2/5

In [None]:
lstm_model.save('/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/lstm_model_5_epochs_new.h5')

In [None]:
from tensorflow import keras
from keras.models import load_model

dependencies = {
    'TP': TP,
    'FP': FP,
    'TN': TN,
    'FN': FN
}

model = load_model('/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/lstm_model_5_epochs.h5', custom_objects=dependencies)

In [None]:
def gen_test_batch(df, mapper, indices, batch_size):
    rows = indices.shape[0]
    index_array = np.zeros((rows, seq_length), dtype=np.int)
    for i in range(seq_length):
        index_array[:,i] = indices + 1 - seq_length + i
    count = 0
    while (count + batch_size <= rows):        
        full_df = mapper.transform(df.loc[index_array[count:count+batch_size].flatten()])
        data = full_df.drop(['Is Fraud?'],axis=1).to_numpy().reshape(batch_size, seq_length, -1)
        targets = full_df['Is Fraud?'].to_numpy().reshape(batch_size, seq_length, 1)
        count += batch_size
        data_t = np.transpose(data, axes=(1,0,2))
        targets_t = np.transpose(targets, axes=(1,0,2))
        yield data_t, targets_t

In [None]:
batch_size = 16

print("\nFull test")
test_generate = gen_test_batch(df,mapper,test_indices,batch_size)
lstm_model.evaluate(test_generate, verbose=1)

#**ADABOOST**

In [6]:
def gen_training_batch(df, mapper, index_list, batch_size):
    np.random.seed(98765)
    train_df = df.loc[index_list]
    non_fraud_indices = train_df[train_df['Is Fraud?'] == 'No'].index.values
    fraud_indices = train_df[train_df['Is Fraud?'] == 'Yes'].index.values
    fsize = fraud_indices.shape[0]
    while True:
        indices = np.concatenate((fraud_indices,np.random.choice(non_fraud_indices,fsize,replace=False)))
        np.random.shuffle(indices)
        rows = indices.shape[0]
        index_array = np.zeros((rows, seq_length), dtype=int)
        for i in range(seq_length):
            index_array[:,i] = indices + 1 - seq_length + i
        full_df = mapper.transform(df.loc[index_array.flatten()])
        target_buffer = full_df['Is Fraud?'].to_numpy().reshape(rows, seq_length, 1)
        data_buffer = full_df.drop(['Is Fraud?'],axis=1).to_numpy().reshape(rows, seq_length, -1)

        batch_ptr = 0
        while (batch_ptr + batch_size) <= rows:
            data = data_buffer[batch_ptr:batch_ptr+batch_size]
            targets = target_buffer[batch_ptr:batch_ptr+batch_size]
            batch_ptr += batch_size
            data_t = np.transpose(data, axes=(1,0,2))
            targets_t = np.transpose(targets, axes=(1,0,2))
            yield data_t.reshape(-1, data_t.shape[-1]), targets_t.reshape(-1)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Train multiple models using Adaboost
ada = AdaBoostClassifier(n_estimators=10, learning_rate=5)
print(ada)

# Fit the classifier using the training batch
num_epochs = 50
for i in range(num_epochs):
    X_batch, y_batch = next(gen_training_batch(df, mapper, train_indices, 16))
    ada.fit(X_batch, y_batch)

AdaBoostClassifier(learning_rate=5, n_estimators=10)


In [None]:
from joblib import dump, load
print(ada)

# Save the fitted model to a file
dump(ada, '/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/adaboost_model_50epochs_10estimators_5LR.joblib')

AdaBoostClassifier(learning_rate=5, n_estimators=10)


['/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/adaboost_model_50epochs_10estimators_5LR.joblib']

In [7]:
from joblib import dump, load
from sklearn.ensemble import AdaBoostClassifier
# Load the AdaBoost model that was saved
ada = load('/content/drive/MyDrive/University of Toronto Elec & Comp Eng/Capstone/saved_models/adaboost_model_50epochs_10estimators.joblib')

In [8]:
def evaluate_model(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)
    
    return tp, tn, fp, fn

In [9]:
def gen_test_batch(df, mapper, indices, batch_size):
    rows = indices.shape[0]
    index_array = np.zeros((rows, seq_length), dtype=np.int)
    for i in range(seq_length):
        index_array[:,i] = indices + 1 - seq_length + i
    count = 0
    while (count + batch_size <= rows):        
        full_df = mapper.transform(df.loc[index_array[count:count+batch_size].flatten()])
        data = full_df.drop(['Is Fraud?'],axis=1).to_numpy().reshape(batch_size, seq_length, -1)
        targets = full_df['Is Fraud?'].to_numpy().reshape(batch_size, seq_length, 1)
        count += batch_size
        data_t = np.transpose(data, axes=(1,0,2))
        targets_t = np.transpose(targets, axes=(1,0,2))
        yield data_t.reshape(-1, data_t.shape[-1]), targets_t.reshape(-1)

# Generate test batches
test_batches = gen_test_batch(df, mapper, test_indices, batch_size=16)

# Evaluate the model on the test data
total_accuracy = 0
total_fp_rate = 0
total_tp_rate = 0
total_tn_rate = 0
total_fn_rate = 0
num_batches = 1000  # number of test batches to evaluate
total_actual_fraud = 0
total_actual_non_fraud = 0

# --------------------------- Time Metrics ---------------------------
import time
total_time_pred = 0


for i in range(num_batches):
    X_batch, y_batch = next(test_batches)
    # Make all predictions above 0.35 into positive predictions
    start_time = time.time()
    y_pred = (ada.predict_proba(X_batch)[:, 1] >= 0.5).astype(int)
    end_time = time.time()
    total_time_pred += (end_time - start_time)
    accuracy = np.mean(y_pred == y_batch)
    total_accuracy += accuracy
    tp_rate, tn_rate, fp_rate, fn_rate = evaluate_model(y_batch.reshape(-1), y_pred)

    for x in y_batch.reshape(-1):
      if x == 0:
        total_actual_non_fraud += 1
      elif x == 1:
        total_actual_fraud += 1

    total_fp_rate += fp_rate
    total_tp_rate += tp_rate
    total_tn_rate += tn_rate
    total_fn_rate += fn_rate
    print("Batch {}: accuracy = {:.4f}, true positive rate = {:.4f}, false positives rate = {:.4f}, true negative rate = {:.4f}, false negative rate = {:.4f}".format(i+1, accuracy, tp_rate, fp_rate, tn_rate, fn_rate))
    fraud_count = np.sum(y_batch)
    non_fraud_count = y_batch.shape[0] - fraud_count
    print("Batch {}: fraud count = {}, non-fraud count = {}".format(i+1, fraud_count, non_fraud_count))

avg_accuracy = total_accuracy / num_batches
avg_fp_rate = total_fp_rate / num_batches
avg_tp_rate = total_tp_rate / num_batches
avg_fn_rate = total_fn_rate / num_batches
avg_tn_rate = total_tn_rate / num_batches
print("Average accuracy on test data: {:.4f}".format(avg_accuracy))
'''
print("Average false positives rate on test data: {:.4f}".format(avg_fp_rate))
print("Average true positives rate on test data: {:.4f}".format(avg_tp_rate))
print("Average false negatives rate on test data: {:.4f}".format(avg_fn_rate))
print("Average true negatives rate on test data: {:.4f}".format(avg_tn_rate))
'''
print("The number of transactions processed per second is: ", ((16*7*num_batches) / total_time_pred))

print("The total number of false positives from the set is: ", total_fp_rate)
print("The total number of false negatives from the set is: ", total_fn_rate)
print("The total number of true positives from the set is: ", total_tp_rate)
print("The total number of true negatives from the set is: ", total_tn_rate)

precision = total_tp_rate / (total_tp_rate + total_fp_rate)
recall = total_tp_rate / (total_tp_rate + total_fn_rate)
f1_score = 2 * (precision * recall) / (precision + recall)
print("The precision score from the set is: ", precision)
print("The recall score from the set is: ", recall)
print("The f1 score from the set is: ", f1_score)

fp_rate = total_fp_rate / (total_fp_rate + total_tn_rate)
print("The false positive rate is: ", fp_rate)

fn_rate = total_fn_rate / (total_fn_rate + total_tp_rate)
print("The false negative rate is: ", fn_rate)

print('The actual number of non-fraud in this set is: ', total_actual_non_fraud)
print('The actual number of fraud in this set is: ', total_actual_fraud)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  index_array = np.zeros((rows, seq_length), dtype=np.int)


Batch 1: accuracy = 0.9821, true positive rate = 0.0000, false positives rate = 2.0000, true negative rate = 110.0000, false negative rate = 0.0000
Batch 1: fraud count = 0, non-fraud count = 112
Batch 2: accuracy = 1.0000, true positive rate = 0.0000, false positives rate = 0.0000, true negative rate = 112.0000, false negative rate = 0.0000
Batch 2: fraud count = 0, non-fraud count = 112
Batch 3: accuracy = 1.0000, true positive rate = 0.0000, false positives rate = 0.0000, true negative rate = 112.0000, false negative rate = 0.0000
Batch 3: fraud count = 0, non-fraud count = 112
Batch 4: accuracy = 0.9464, true positive rate = 0.0000, false positives rate = 6.0000, true negative rate = 106.0000, false negative rate = 0.0000
Batch 4: fraud count = 0, non-fraud count = 112
Batch 5: accuracy = 0.9554, true positive rate = 0.0000, false positives rate = 5.0000, true negative rate = 107.0000, false negative rate = 0.0000
Batch 5: fraud count = 0, non-fraud count = 112
Batch 6: accuracy = 