In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import time
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import math

from einsteinds import db as edb
from einsteinds import event_processing
from einsteinds import ml
from einsteinds import plots
from einsteinds import utils


from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers


# notebook-level shortcut to event_processing.clean_events
clean_events = event_processing.clean_events

# load the database credentials from file
with open('../creds/local_creds.json') as json_data:
    creds = json.load(json_data)
    
# database handle built from the loaded credentials; later cells query through `db`
db = edb.Database(creds)

In [None]:
# Deposit request sets for January and February 2018.
# NOTE(review): whether end_date is inclusive is decided by
# edb.Database.get_deposit_request_sets — confirm.
request_sets = db.get_deposit_request_sets(
    start_date=datetime.datetime(2018, 1, 1),
    end_date=datetime.datetime(2018, 3, 1),
)

In [None]:
# cleans the request event and the previous hour's events in the request sets
# (the cleaning rules themselves live in event_processing.clean_request_sets)
request_sets_clean = event_processing.clean_request_sets(request_sets)

In [None]:
# create sets of records that contain only numerical features for feeding into a recurrent neural network
# (the next cell iterates each summary, so every summary is treated as a sequence of events)
recurrent_summaries = event_processing.create_recurrent_request_summaries(request_sets_clean)

In [None]:
# Flatten the per-request summaries into one long list of events,
# keeping the summaries (and the events inside each) in their original order.
recurrent_events = []
for summary in recurrent_summaries:
    recurrent_events.extend(summary)

In [None]:
# tabular view of the flattened events: one row per element of recurrent_events
df = pd.DataFrame(recurrent_events)

In [None]:
# preview only the first rows instead of dumping the whole events frame into the output
df.head()

In [None]:
# NOTE(review): `interac_data_dict` is not created by any cell shown in this
# notebook, so this cell fails on a fresh kernel — confirm where it comes from.
X = interac_data_dict['data']
y = interac_data_dict['labels'].values
groups = interac_data_dict['users'].values
feature_names = interac_data_dict['feature_names']

# X is indexed on three axes below: examples, timesteps, features
x_shape = X.shape
n_examples, n_timesteps, n_features = x_shape[0], x_shape[1], x_shape[2]

In [None]:
import math

def _count_requests_per_user(examples):
    """Return one row per user with its request count, busiest user first."""
    return (
        examples.groupby('users', as_index=False)['requests']
        .count()
        # mergesort is stable, so users with equal counts keep a deterministic order
        .sort_values(by='requests', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )


def _pack_users_first_fit(user_counts, n_splits, max_group_size):
    """Greedily place each user into the first split (1..n_splits) with enough room.

    The busiest user (row 0) is always placed in split 1, even if it alone
    exceeds ``max_group_size``. A user that fits in no split is left
    unassigned. Every split key is created up front, so splits that receive
    nobody still exist with an empty user list.
    """
    packed = {
        split: {'users': [], 'requests': 0, 'user_count': 0}
        for split in range(1, n_splits + 1)
    }

    for position, (user, n_requests) in enumerate(zip(user_counts.users, user_counts.requests)):
        for split in range(1, n_splits + 1):
            group = packed[split]
            remaining_space = max_group_size - group['requests']
            has_room = group['requests'] < max_group_size and remaining_space >= n_requests

            if position == 0 or has_room:
                group['users'].append(user)
                group['requests'] += n_requests
                group['user_count'] = len(group['users'])
                break

    return packed


def balanced_class_group_splits(X, y, groups):
    """Build user-disjoint train/test splits with positives spread across the test folds.

    Users are packed into folds separately for the positive (``y == 1``) and
    negative (``y == 0``) examples. The number of folds is
    ``ceil(total positives / positives of the busiest positive user)``.
    For each fold, every example belonging to one of the fold's users forms
    the test set and all remaining examples form the training set, so no user
    appears on both sides of a split. Users that could not be packed into any
    fold only ever appear in training sets.

    Parameters
    ----------
    X : array indexable along its first axis with an integer index array
        One entry per example.
    y : array-like of 0/1 labels, one per example.
    groups : array-like of user identifiers, one per example.

    Returns
    -------
    list of (X_train, X_test, y_train, y_test) tuples, one per fold. Indices
    inside each part are in ascending example order and contain no duplicates.

    Raises
    ------
    ValueError
        If there is no positive or no negative example.
    """
    labels = np.asarray(y)
    users = np.asarray(groups)

    all_examples = pd.DataFrame({'requests': labels, 'users': users})
    frauds = all_examples[all_examples.requests == 1]
    not_frauds = all_examples[all_examples.requests == 0]

    if frauds.empty or not_frauds.empty:
        raise ValueError("balanced_class_group_splits needs at least one example of each class (y == 1 and y == 0)")

    # ---- positive class: this also fixes the number of splits ----
    user_counts = _count_requests_per_user(frauds)

    # the original referenced an undefined name here (`user_fraud_counts`)
    display(user_counts)

    total_requests = user_counts.requests.sum()
    max_splits = math.ceil(total_requests / user_counts.requests[0])

    print("Total Fraudulent requests", total_requests)
    print("Most Fraudulent requests by a single user:", user_counts.requests[0])
    print("Max Number of Splits", max_splits)

    max_group_size = math.ceil(total_requests / max_splits)
    print("Maximum Group Size", max_group_size)

    positive_groups = _pack_users_first_fit(user_counts, max_splits, max_group_size)

    # ---- negative class: packed into the same number of splits ----
    user_counts = _count_requests_per_user(not_frauds)

    display(user_counts.head())

    total_requests = user_counts.requests.sum()

    print("Total Not Fraudulent equests", total_requests)
    print("Most Requests by a single user:", user_counts.requests[0])
    print("Max number of splits", math.floor(total_requests / user_counts.requests[0]))

    max_group_size = math.ceil(total_requests / max_splits)
    print("Maximum Group Size", max_group_size)

    negative_groups = _pack_users_first_fit(user_counts, max_splits, max_group_size)

    # ---- materialise the train/test arrays for every split ----
    datasets = []

    for split in range(1, max_splits + 1):
        held_out_users = positive_groups[split]['users'] + negative_groups[split]['users']

        # a boolean mask keeps each example exactly once, even when a user
        # shows up in both the positive and the negative user list
        test_mask = np.isin(users, held_out_users)
        test_indices = np.flatnonzero(test_mask)
        train_indices = np.flatnonzero(~test_mask)

        datasets.append((X[train_indices], X[test_indices], labels[train_indices], labels[test_indices]))

    return datasets

In [None]:
# user-disjoint, class-balanced splits; each element is (X_train, X_test, y_train, y_test)
data_splits = balanced_class_group_splits(X,y,groups)

In [None]:
# inspect the first split: a (X_train, X_test, y_train, y_test) tuple
data_splits[0]

In [None]:
# user identifiers of every example labelled 1, then de-duplicated for display
# (set order is arbitrary, so the displayed order may vary between runs)
fraudsters = groups[y==1]
list(set(fraudsters))

In [None]:
# quick sanity check: dtype and shape of the features, then of the labels
for array in (X, y):
    print(array.dtype)
    print(array.shape)
print(type(y))

In [None]:
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
from matplotlib import pyplot
from keras.optimizers import adam

from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight


# balanced class weights over the full data set, printed for reference
# (the original computed this value and silently discarded it)
print('Overall class weights:', compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y))

# probability cut-off applied to predictions when building the confusion matrices
DECISION_THRESHOLD = 0.6

# to keep track of average precision score for each split
ys=[]
preds=[]
probs=[]

# Split the data stratified by user/email address so there's no cross contamination.
# number of splits is 3 because there's only 3 fraudulent users
# `group_kfold` is kept because later cells iterate over group_kfold.split(...);
# the training loop below uses the pre-computed `data_splits` instead.
group_kfold = GroupKFold(n_splits=3)

# report the number of splits that are actually trained on below
print('Producing {} splits of the data'.format(len(data_splits)))

# cross validation over the user-disjoint splits
for X_train, X_test, y_train, y_test in data_splits:

    print(X_train.shape, X_train.dtype)
    print(X_test.shape, X_test.dtype)
    print(y_train.shape, y_train.dtype)
    print(y_test.shape, y_test.dtype)
    print('Train Fraud: ', len(y_train[y_train == 1])/len(y_train))
    print('Test Fraud: ', len(y_test[y_test == 1])/len(y_test))
    # The original printed a train/test user-overlap check here using
    # `train_index` / `test_index`, which do not exist in this loop (NameError).
    # `data_splits` only carries arrays, so the check cannot be reproduced here.

    # weight the classes inversely to their frequency in this training split
    class_weight = dict(enumerate(compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)))

    # create and fit the LSTM network (compiled with the default 'adam' optimizer;
    # the unused `adam(lr=1)` instance from the original was dropped)
    model = Sequential()
    model.add(LSTM(20, input_shape=(n_timesteps, n_features), return_sequences=False, go_backwards=True))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print()
    model.summary()

    history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), batch_size=50, class_weight=class_weight)
    pyplot.plot(history.history['loss'])
    pyplot.plot(history.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()

    # the sigmoid output already is the probability, so a single forward pass
    # feeds both lists (the original ran predict() and predict_proba() separately)
    split_predictions = model.predict(X_test)
    ys.append(y_test)
    preds.append(split_predictions)
    probs.append(split_predictions)


# average precision and thresholded confusion matrix for every split
avps = []

for y_true, y_score in zip(ys, preds):
    avps.append(average_precision_score(y_true, y_score))
    print(confusion_matrix(y_true, y_score > DECISION_THRESHOLD))

np.mean(avps)

In [None]:
# LSTM classifier trained on the complete data set (no hold-out)
model = Sequential()
model.add(LSTM(20, input_shape=(n_timesteps, n_features), return_sequences=False))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself and returns None, so it is not wrapped in print()
model.summary()

# compute_class_weight() has required arguments; the original called it with
# none, which raises TypeError. Same balanced weighting as the CV loop.
history = model.fit(
    X,
    y,
    epochs=10,
    batch_size=50,
    class_weight=dict(enumerate(compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y))),
)

In [None]:
# cast the features to float32; X is rebound, so cells run after this one see the float32 copy
X = X.astype('float32')



In [None]:
# Sequence-to-sequence LSTM autoencoder: a 5-unit LSTM followed by an LSTM
# with n_features units, trained to reproduce its own input.
autoencoder = Sequential([
    LSTM(5, input_shape=(n_timesteps, n_features), return_sequences=True),
    LSTM(n_features, return_sequences=True),
])
autoencoder.compile(loss='mean_squared_error', optimizer='RMSprop')
history = autoencoder.fit(X, X, epochs=3)


In [None]:
# reconstruct every sequence and score it by its total squared reconstruction error
preds = autoencoder.predict(X)

squared_error = np.square(X - preds)

# sum over axes 1 and 2 leaves one value per example with shape (n, 1, 1) ...
total_squared_error = np.apply_over_axes(np.sum, a=squared_error, axes=[1,2])
# ... which is flattened to (n,); the original hard-coded the length as 1777
total_squared_error = total_squared_error.reshape(-1)

results = pd.DataFrame({'email': groups, 'anomaly_score': total_squared_error, 'fraud': y})

In [None]:
# examples ordered from the highest to the lowest anomaly score (display only; `results` is not modified)
results.sort_values(by='anomaly_score', ascending=False)

In [None]:
# Per-user aggregates: mean / sum / median anomaly score and the number of fraud labels.
user_results = (
    results
    .groupby('email', as_index=False)[['anomaly_score','fraud']]
    .agg({'anomaly_score': ['mean','sum', 'median'], 'fraud': ['sum']})
)

# Collapse the two-level column labels into single names such as
# "anomaly_score_mean"; labels with an empty second level keep their first level.
flat_columns = []
for top_level, statistic in user_results.columns.ravel():
    if statistic == '':
        flat_columns.append(top_level)
    else:
        flat_columns.append(top_level+'_'+statistic)
user_results.columns = flat_columns

# users with the largest total anomaly score first
user_results = user_results.sort_values(by=['anomaly_score_sum'], ascending=False)

user_results.reset_index(drop=True)

In [None]:
# density curve with rug marks of the per-request anomaly scores
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.distplot(results['anomaly_score'], hist=False, rug=True)

In [None]:
# density curve with rug marks of each user's summed anomaly score
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.distplot(user_results['anomaly_score_sum'], hist=False, rug=True)

In [None]:
# density curve with rug marks of each user's mean anomaly score
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.distplot(user_results['anomaly_score_mean'], hist=False, rug=True)

# ten users with the highest median anomaly score
user_results.sort_values(by='anomaly_score_median', ascending=False).head(10)

In [None]:
from pymongo import MongoClient
from pandas.io.json import json_normalize

# load the database credentials from file
# NOTE: this rebinds `creds`, which the first cell loaded from '../creds/local_creds.json';
# get_user_events below reads creds['connection_string'] from this file's content.
with open('../user_aggregation_pipeline//creds.json') as json_data:
    creds = json.load(json_data)

def get_user_events(user_email):
    """Fetch every stored event for one user as a flattened DataFrame.

    Queries ``production.eventCollection`` for documents whose
    ``metadata.email`` equals ``user_email``, flattens the nested documents
    with ``json_normalize`` and orders them by ``created``, newest first.

    Parameters
    ----------
    user_email : str
        E-mail address matched against ``metadata.email``.

    Returns
    -------
    pandas.DataFrame
        One row per event; an empty DataFrame when the user has no events
        (the original raised KeyError on the missing ``created`` column).
    """
    # set up a database client with credentials
    client = MongoClient(creds['connection_string'])

    try:
        # materialise the cursor while the connection is still open
        events = list(client['production']['eventCollection'].find({
                'metadata.email': user_email}))
    finally:
        # a new client is opened on every call, so always release it
        client.close()

    if not events:
        return pd.DataFrame()

    # json_normalize already returns a DataFrame; no extra wrapping needed
    df = json_normalize(events)
    df = df.sort_values(by='created', ascending=False)

    return df

def display_all_user_events(user_email):
    """Render the main columns of every event stored for ``user_email``.

    The events come from ``get_user_events``; up to 1000 rows are shown
    without truncation.
    """
    shown_columns = ['created','eventCategory','eventAction','eventLabel','metadata.amount','value','metadata.ip']

    user_events = get_user_events(user_email)

    # raise the row limit only for this one display call
    with pd.option_context('display.max_rows', 1000):
        display(user_events[shown_columns])

# NOTE(review): this hard-codes what looks like a real user's e-mail address, and the
# rendered output includes per-event IP addresses — redact or clear before sharing.
display_all_user_events('david9074@gmail.com')

In [None]:
# Report, for every GroupKFold split, how many events and distinct e-mails of
# each class end up in the training set and in the test set.
print('--------------------------------------')
print('Total Events = '+str(len(y)))
print('--------------------------------------')

for i, (train_index, test_index) in enumerate(group_kfold.split(X=X, y=y, groups=groups), start=1):
    print('======================================================================')
    print('SPLIT:',i)
    print('------------')

    # Do training set/test set.
    for is_training, idx_set in ((True, train_index), (False, test_index)):
        if is_training:
            print('======================================================================')
            print('TRAINING SET:')
            print('------------')
        else:
            print('--------------------------------------')
            print('TEST SET:')
            print('--------')

        # `groups` and `y` are numpy arrays here, so the frame is built directly;
        # the original passed the arrays to pd.concat, which only accepts pandas
        # objects. A separate name also keeps the events frame `df` intact.
        split_df = pd.DataFrame({'emails': groups[idx_set], 'fraud': y[idx_set]})

        print('Event Count:')
        print( pd.DataFrame( split_df.groupby(['fraud']).size(), columns=['Count'] ) )
        print(' ')

        q = split_df[split_df.fraud==True].groupby('emails').size()
        print('Number unique FRAUD emails: '+str(len(q)))
        print('--------------------------')
        print(q)

        q = split_df[split_df.fraud==False].groupby('emails').size()
        print('--------------------------------------')
        print('Number unique VALID emails: '+str(len(q)))

In [None]:
# labels and user identifiers as stored in interac_data_dict (no `.values`);
# a later cell indexes both with `.iloc`
fraud = interac_data_dict['labels']
emails = interac_data_dict['users']

# check which container type the users are stored in
type(emails)

In [None]:
# raw numpy views of the labels and user identifiers
fraud_values = interac_data_dict['labels'].values
email_values = interac_data_dict['users'].values
# The original cell ended with an unfinished `y = `, which is a SyntaxError.
# NOTE(review): completed with the label array, i.e. the same expression `y`
# was first assigned from — confirm this was the intent.
y = fraud_values

In [None]:
# For every GroupKFold split, summarise the class balance and the number of
# distinct e-mails in the training set and in the test set.
print('--------------------------------------')
print('Total Events = '+str(len(y)))
print('--------------------------------------')

for train_index, test_index in group_kfold.split(X=X, y=y, groups=groups):
    # training set first, then test set
    for idx_set in (train_index, test_index):
        if idx_set is train_index:
            banner = ('======================================================================',
                      'TRAINING SET:',
                      '------------')
        else:
            banner = ('--------------------------------------',
                      'TEST SET:',
                      '--------')
        for banner_line in banner:
            print(banner_line)

        # e-mail and label of every event in this subset, side by side
        df = pd.concat([emails.iloc[idx_set], fraud.iloc[idx_set]], axis=1)
        df.columns=['emails','fraud']

        print('Event Count:')
        print( pd.DataFrame( df.groupby(['fraud']).size(), columns=['Count'] ) )
        print(' ')

        fraud_events_per_email = df[df.fraud==True].groupby('emails').size()
        print('Number unique FRAUD emails: '+str(len(fraud_events_per_email)))
        print('--------------------------')
        print(fraud_events_per_email)

        valid_events_per_email = df[df.fraud==False].groupby('emails').size()
        print('--------------------------------------')
        print('Number unique VALID emails: '+str(len(valid_events_per_email)))