In [None]:
# standard snippets I use a lot

# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

In [None]:
# read the database credentials from the json file in the creds directory
with open('../creds/creds.json') as creds_file:
    creds = json.load(creds_file)

# connect to mongo using the connection string stored in the credentials
client = MongoClient(creds['connection_string'])

In [None]:
# pull every event document out of the production event collection
event_collection = client['production']['eventCollection']
all_events_json = list(event_collection.find())

In [None]:
# flatten the nested event documents so each nested field becomes its own column
all_events_flat = json_normalize(all_events_json)

# hold the flattened events in a pandas dataframe
all_events = pd.DataFrame(data=all_events_flat)

In [None]:
# save the data to a file so if the kernel crashes you don't need to re-read from the database
# need to have a data directory inside notebooks
# NOTE: to_csv returns None when it is given a path, so its result must not be assigned
# back to all_events - doing so would replace the dataframe with None
all_events.to_csv('data/all_events.csv', index=False)

In [None]:
# load the cached events from disk rather than querying the database again
all_events = pd.read_csv(
    'data/all_events.csv',
    low_memory=False,
)

In [None]:
# preview the first few rows of the events dataframe
all_events.head()

In [None]:
# older events were labelled 'bitcoin' (in any letter case); relabel them as BTC so they match
is_bitcoin = all_events['eventLabel'].str.lower().eq('bitcoin')
all_events.loc[is_bitcoin, 'eventLabel'] = 'BTC'

# parse the created column from strings into datetimes
all_events['created'] = pd.to_datetime(all_events['created'])

In [None]:
# get all of the interac requests: 'buy' / 'request' events that have an email attached
has_email = all_events['metadata.email'].notnull()
is_buy = all_events['eventCategory'] == 'buy'
is_request = all_events['eventAction'] == 'request'
interac_requests = all_events[has_email & is_buy & is_request].reset_index(drop=True)

# emails treated as fraudulent in addition to the BLOCKED entries of the blacklist collection
# NOTE(review): these are real-looking addresses hard-coded in the notebook - consider moving
# them into the blacklist collection or a config file instead
MANUALLY_FLAGGED_EMAILS = ('gaelkevin@hotmail.com', 'royer.8383@gmail.com', 'adventurous7381@gmail.com')


def flagFraudsters(df, blocked_emails=None):
    """Add a boolean 'fraud' column marking rows whose email is on the block list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'metadata.email' column. The frame is modified in place.
    blocked_emails : iterable of str, optional
        Emails to flag. When omitted, the list is built from the documents with
        level 'BLOCKED' in the production emailBlacklistCollection (read through
        the notebook-level `client`) plus MANUALLY_FLAGGED_EMAILS. When given,
        it is used as-is and the database is not queried.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the 'fraud' column added.
    """
    if blocked_emails is None:
        blacklist = client['production']['emailBlacklistCollection'].find({'level': 'BLOCKED'})
        # read the emails straight from the documents: an empty result then simply yields an
        # empty list instead of failing on a missing 'email' column
        blocked_emails = [doc['email'] for doc in blacklist if doc.get('email') is not None]
        blocked_emails.extend(MANUALLY_FLAGGED_EMAILS)

    df['fraud'] = df['metadata.email'].isin(list(blocked_emails))

    return df

def removeWhitelistRecords(df, whitelist_emails=None):
    """Drop rows that belong to test, internal or whitelisted accounts.

    A row is removed when its 'metadata.email' is missing, contains one of the
    literal fragments 'test', 'fingerfoodstudios' or 'einstein.exchange', or
    is one of the whitelisted emails.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'metadata.email' column. Not modified.
    whitelist_emails : iterable of str, optional
        Emails to remove. When omitted, they are read from the documents with
        level 'ALLOWED' in the production emailWhitelistCollection (through the
        notebook-level `client`). When given, the database is not queried.

    Returns
    -------
    pandas.DataFrame
        The remaining rows of `df`, with their original index labels.
    """
    if whitelist_emails is None:
        whitelist = client['production']['emailWhitelistCollection'].find({'level': 'ALLOWED'})
        # reading from the documents directly keeps an empty collection from raising
        whitelist_emails = [doc['email'] for doc in whitelist if doc.get('email') is not None]

    emails = df['metadata.email']

    # rows without an email were already dropped by the original `== False` comparisons;
    # make that explicit here
    drop = emails.isna() | emails.isin(list(whitelist_emails))

    # regex=False: the fragments are literal text, so the '.' in 'einstein.exchange'
    # must not match an arbitrary character
    for fragment in ('test', 'fingerfoodstudios', 'einstein.exchange'):
        drop |= emails.str.contains(fragment, regex=False, na=False)

    return df[~drop]

# flag the fraudulent records first, then remove the whitelist and test accounts
flagged_requests = flagFraudsters(interac_requests)
interac_requests = removeWhitelistRecords(flagged_requests)

# show a sample of what is left, followed by its (rows, columns) shape
display(interac_requests.head())
interac_requests.shape

In [None]:
# the fields of each interac request that are carried forward
result_columns = ['_id', 'metadata.email', 'created', 'metadata.amount', 'metadata.rate', 'value']
result = interac_requests[result_columns]

def subset_by_request(row, events=None, lookback=datetime.timedelta(hours=1)):
    """Collect the events a user generated in the window leading up to a request.

    Parameters
    ----------
    row : mapping
        One request record with the keys '_id', 'metadata.email' and 'created'.
    events : pandas.DataFrame, optional
        Events to search; needs 'metadata.email' and 'created' columns.
        Defaults to the notebook-level `all_events` frame.
    lookback : datetime.timedelta, optional
        Length of the window before the request. Defaults to one hour.

    Returns
    -------
    list of dict
        One record per event with the same email whose 'created' lies in
        [created - lookback, created] (both ends inclusive). Every record has
        an extra 'request_id' key set to the request's '_id', for later
        group-by operations. The `events` frame itself is not modified.
    """
    if events is None:
        events = all_events

    # the request's own timestamp closes the window; lookback before it opens the window
    created = row['created']
    window_start = created - lookback

    in_window = (
        (events['metadata.email'] == row['metadata.email'])
        & (events['created'] <= created)
        & (events['created'] >= window_start)
    )

    # assign() builds a new frame, so the id is never written into a slice of `events`
    # (which is what triggered pandas' SettingWithCopyWarning before)
    tagged = events[in_window].assign(request_id=row['_id'])

    # convert the dataframe to a list of json records
    return tagged.to_dict(orient='records')

In [None]:
# turn the interac requests into a list of plain dicts, one record per request
result_dict = result.to_dict('records')

In [None]:
# for every request, gather the list of events that led up to it
subsets = [subset_by_request(request) for request in result_dict]

In [None]:
# number of per-request event lists (one entry per record in result_dict)
len(subsets)

In [None]:
# concatenate the per-request event lists into one flat list of event records
subsets_flat = []
for request_events in subsets:
    subsets_flat.extend(request_events)

In [None]:
# create a dataframe with the results
df_with_id = pd.DataFrame(subsets_flat)

# create the combined category-action and category-action-label fields
df_with_id['ca'] = df_with_id.eventCategory + '_' + df_with_id.eventAction
df_with_id['cal'] = df_with_id['ca'] + '_' + df_with_id.eventLabel

# sort by request id and date
df_with_id = df_with_id.sort_values(by=['request_id','created'])

# calculate the previous event time and the time between events
# group by request_id (the request each event was collected for), not by _id: _id is the
# event's own id, so grouping on it never pairs an event with the one before it
df_with_id['previous_event_time'] = df_with_id.groupby('request_id')['created'].shift(1)

# seconds since the previous event; the first event of each request has no predecessor and
# gets NaN (converting the timedelta with pd.to_numeric turned that NaT into a huge
# negative number instead)
df_with_id['time_since_last_event'] = (df_with_id['created'] - df_with_id['previous_event_time']).dt.total_seconds()

# replace string versions of infinity with proper inf object
df_with_id = df_with_id.replace('Infinity', np.inf)

# convert columns that should be to numeric
for numeric_column in ['metadata.amount', 'metadata.rate', 'metadata.cents', 'value']:
    df_with_id[numeric_column] = pd.to_numeric(df_with_id[numeric_column])

df_with_id.head()

In [None]:
# list every available column, in alphabetical order, together with its dtype
cols = [
    {'column': name, 'data_type': df_with_id[name].dtype}
    for name in sorted(df_with_id.columns)
]

# raise the row limit so the whole listing is displayed
with pd.option_context('display.max_rows', 500):
    display(pd.DataFrame(cols))

In [None]:
# columns carried into the model input; the commented-out entries are deliberately left out
columns_to_keep = [
    'request_id',
    'metadata.email',
    'created',
    'ca',
    'cal',
    'eventAction',
    'eventCategory',
    'eventLabel',
    'metadata.addressCity',
    'metadata.addressCountry',
    'metadata.addressProvince',
    'metadata.amount',
    'metadata.cents',
    'metadata.city',
    'metadata.country',
    'metadata.currency',
    'metadata.instrument',
    # 'metadata.ip',
    'metadata.lastTradedPx',
    'metadata.mongoResponse.amount',
    # 'metadata.mongoResponse.email',
    'metadata.mongoResponse.price',
    'metadata.mongoResponse.product',
    'metadata.price',
    'metadata.product',
    'metadata.prossessorError.billingDetails.city',
    'metadata.prossessorError.billingDetails.country',
    'metadata.prossessorError.billingDetails.state',
    'metadata.prossessorError.card.type',
    'metadata.prossessorResponse.billingDetails.city',
    'metadata.prossessorResponse.billingDetails.country',
    'metadata.prossessorResponse.billingDetails.province',
    'metadata.prossessorResponse.billingDetails.state',
    'metadata.prossessorResponse.card.type',
    'metadata.prossessorResponse.cardType',
    'metadata.prossessorResponse.card_type',
    'metadata.prossessorResponse.charge_amount',
    # 'metadata.prossessorResponse.email',
    'metadata.province',
    'metadata.rate',
    'metadata.requestParams.amount',
    'metadata.requestParams.charge_amount',
    'metadata.requestParams.currency',
    # 'metadata.requestParams.email',
    'metadata.requestParams.price',
    'metadata.requestParams.product',
    'metadata.requestParams.product_amount',
    'metadata.secondAmount',
    'metadata.tradesResponse',
    'metadata.type',
    # 'previous_event_time',
    'time_since_last_event',
    'value',
]

# categorical columns that get expanded into one indicator column per value
columns_to_expand = [
    'ca',
    'cal',
    'eventAction',
    'eventCategory',
    'eventLabel',
    'metadata.addressCity',
    'metadata.addressCountry',
    'metadata.addressProvince',
    'metadata.city',
    'metadata.country',
    'metadata.currency',
    'metadata.instrument',
    'metadata.mongoResponse.product',
    'metadata.product',
    'metadata.prossessorError.billingDetails.city',
    'metadata.prossessorError.billingDetails.country',
    'metadata.prossessorError.billingDetails.state',
    'metadata.prossessorError.card.type',
    'metadata.prossessorResponse.billingDetails.city',
    'metadata.prossessorResponse.billingDetails.country',
    'metadata.prossessorResponse.billingDetails.province',
    'metadata.prossessorResponse.billingDetails.state',
    'metadata.prossessorResponse.card.type',
    'metadata.prossessorResponse.cardType',
    'metadata.prossessorResponse.card_type',
    'metadata.province',
    'metadata.requestParams.currency',
    'metadata.requestParams.product',
    'metadata.tradesResponse',
    'metadata.type',
]

In [None]:
# take just the columns of interest and turn +/- infinity into missing values
subset = df_with_id[columns_to_keep].replace([np.inf, -np.inf], np.nan)

# for every feature column (the identifier columns are skipped) add a boolean
# "<column>_na" column recording where the value is missing
id_columns = ['request_id', 'metadata.email', 'created']
for feature in [name for name in subset.columns if name not in id_columns]:
    subset[feature + "_na"] = subset[feature].isna()

# one-hot encode the categorical columns, then replace the remaining missing values with 0
subset = pd.get_dummies(subset, columns=columns_to_expand).fillna(0)

# keep the event time as integer nanoseconds since 1970
subset['created_utc_ns'] = pd.to_numeric(subset['created'])

# order by request (ascending) and newest event first, drop the time and email columns,
# and renumber the rows
subset = (
    subset
    .sort_values(by=['request_id', 'created'], ascending=[True, False])
    .drop(['metadata.email', 'created'], axis=1)
    .reset_index(drop=True)
)

# index_int (the row position after the sort above) is kept because it is still a column of
# subset and therefore ends up among the melted feature columns
subset['index_int'] = subset.index

# number the events of each interac request in row order: rows are sorted newest first, so 0 is
# the request and 10 is the 10th event prior to the request
# cumcount does this per group directly; the previous list-returning aggregation relied on a
# `type(item) == type(int)` check that can never be true (type(int) is `type`)
subset['timesteps'] = subset.groupby('request_id').cumcount()

# gather the columns together
melt = subset.melt(id_vars=['request_id','timesteps'])

# number of features is the number of columns at each timestep
n_features = len(melt.variable.unique())

# create new variable names that contain the timesteps
# the timestep is zero-padded to a fixed width: pivot_table sorts its columns by name, and
# without padding they sort as 0_, 10_, 11_, ..., 1_, 2_, ... which scrambles the time axis
# once the columns are reshaped into (timesteps, features)
timestep_width = len(str(melt.timesteps.max()))
melt['variable'] = melt.timesteps.astype(str).str.zfill(timestep_width) + "_" + melt['variable']

# drop the timesteps column so it doesn't end up as a feature column
melt = melt.drop('timesteps', axis=1)

# convert the value to a numeric value - not sure why this happened
melt['value'] = pd.to_numeric(melt.value)

# spread the values out so there's one row per request the columns contain the timesteps
one_row_per_request = melt.pivot_table(values='value', index=['request_id'], columns=['variable']).reset_index()

# fill NA values for timesteps with zeros
one_row_per_request = one_row_per_request.fillna(0)

# get the labels and groups for a GroupKFold cross validation.
# one_row_per_request is ordered by request_id, while interac_requests is in its own order,
# so look the labels up by request id to keep row i of the labels matching row i of the data
request_labels = (
    interac_requests
    .drop_duplicates(subset='_id')
    .set_index('_id')
    .loc[one_row_per_request['request_id']]
)
fraud = request_labels['fraud'].reset_index(drop=True)
emails = request_labels['metadata.email'].reset_index(drop=True)

n_examples = one_row_per_request.shape[0]

# every timestep contributes n_features columns (request_id is the one extra column), so the
# division must be exact; keep n_timesteps an integer so it can be used as a shape
n_timesteps, leftover_columns = divmod(one_row_per_request.shape[1] - 1, n_features)
assert leftover_columns == 0, "number of feature columns is not a multiple of n_features"

print("Exampes:",n_examples, "Features:", n_features, "Timesteps:",n_timesteps)

# show a sample (a bare expression in the middle of a cell displays nothing)
display(one_row_per_request.head())

# get the values of the array
data = one_row_per_request.drop('request_id', axis=1).values

# reshape the data from (n_example, n_features*n_timesteps) to (n_examples, n_timesteps, n_features)
data = data.reshape((n_examples, n_timesteps, n_features))

In [None]:
# for each fraudulent user, print the email and how many interac requests they made
is_fraud = fraud.eq(True)
for fraud_email in emails[is_fraud].unique():
    n_fraud_requests = np.sum(is_fraud[emails.eq(fraud_email)])
    print(fraud_email, n_fraud_requests)

## Example Modeling Code - Doesn't Currently Work

In [None]:
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import average_precision_score

%matplotlib inline
from matplotlib import pyplot
from keras.optimizers import adam

In [None]:
# data is the 3d matrix
X = data

# labels are fraud
# plain numpy arrays so the split indices below are applied by position; indexing the pandas
# Series with them would look rows up by index label instead
y = np.asarray(fraud).astype(int)

# groups for splitting are user emails
groups = np.asarray(emails)

# to keep track of average precision score for each split
average_precisions = []

# Split the data grouped by user/email address so there's no cross contamination.
# number of splits is 3 because there's only 3 fraudulent users
group_kfold = GroupKFold(n_splits=3)

print('Producing {} splits of the data'.format(group_kfold.get_n_splits(X=X, y=y, groups=groups)))

# cross validation
for train_index, test_index in group_kfold.split(X=X, y=y, groups=groups):

    X_train = X[train_index,:,:]
    X_test = X[test_index,:,:]
    y_train = y[train_index]
    y_test = y[test_index]

    # NOTE(review): a learning rate of 1 is very large for Adam - confirm this is intended
    opt = adam(lr=1)

    # create and fit the LSTM network
    # the input shape is read from X itself so it is always a pair of integers
    model = Sequential()
    model.add(LSTM(10, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    # summary() prints the table itself; wrapping it in print() only adds a stray "None"
    model.summary()

    history = model.fit(X_train, y_train, epochs=2, validation_data=(X_test, y_test), batch_size=50)
    pyplot.plot(history.history['loss'])
    pyplot.plot(history.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()

    # TODO: scoring is still disabled. When enabling it, do not store the result in a variable
    # named average_precision_score - that would shadow the sklearn function after the first split.
#     probs_test = model.predict(X_test).ravel()
#     print(probs_test)

#     split_average_precision = average_precision_score(y_true=y_test, y_score=probs_test)

#     average_precisions.append(split_average_precision)

# print("Mean Average Precision Score:", np.mean(average_precisions))