In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import time
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import math

# read the MongoDB connection settings from the local credentials file
with open('../creds/creds.json') as json_data:
    creds = json.loads(json_data.read())

# open the client with the connection string stored in that file
client = MongoClient(creds['connection_string'])

In [None]:
# load every document from ml.requestEvents60Summaries
summaries = list(client['ml']['requestEvents60Summaries'].find())

# Filter on `level` inside MongoDB and project only the `email` field, so the
# full collections are not transferred and documents without a `level` field
# are skipped instead of raising KeyError.
whitelist = [item['email'] for item in
             client['production']['emailWhitelistCollection'].find({'level': 'ALLOWED'}, {'email': 1, '_id': 0})]
blacklist = [item['email'] for item in
             client['production']['emailBlacklistCollection'].find({'level': 'BLOCKED'}, {'email': 1, '_id': 0})]

In [None]:
def is_whitelist(email):
    """Return True when `email` should be treated as whitelisted.

    An address is whitelisted when it contains one of the hard-coded marker
    substrings (compared case-insensitively) or appears verbatim in the
    module-level `whitelist` list.

    Parameters
    ----------
    email : str
        Address to check. Any non-string value (e.g. NaN or None for a missing
        address) is reported as not whitelisted.

    Returns
    -------
    bool
    """
    # non-strings have no .lower(); without this guard a single missing email
    # aborts the whole DataFrame.apply with AttributeError
    if not isinstance(email, str):
        return False

    # 'test' already matches anything containing 'testing', so one marker is enough
    lowered = email.lower()
    if any(marker in lowered for marker in ('fingerfood', 'einstein.exchange', 'test')):
        return True

    return email in whitelist

def is_blacklist(email):
    """Return 1 when `email` is in the module-level `blacklist`, otherwise 0."""
    return 1 if email in blacklist else 0

def drop_whitelist(df):
    """Return the rows of `df` whose 'request.user_email' is not whitelisted."""
    whitelisted = df['request.user_email'].apply(is_whitelist)
    keep_rows = whitelisted == False
    return df[keep_rows]

def label_fraud(df):
    """Return a labelled copy of `df` with whitelisted rows removed.

    A `fraud` column is added holding the result of `is_blacklist` for each
    row's 'request.user_email' (1 = blacklisted, 0 = not). Rows whose email
    passes `is_whitelist` are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'request.user_email' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra `fraud` column.
    """
    # assign() builds a new frame, so the caller's frame (often a filtered
    # slice) is left untouched and no SettingWithCopyWarning is raised
    labelled = df.assign(fraud=df['request.user_email'].apply(is_blacklist))

    return labelled[labelled['request.user_email'].apply(is_whitelist) == False]

In [None]:
# flatten the summary documents into columns, remove whitelisted accounts,
# then attach the fraud label
sdf = json_normalize(summaries)
sdf = drop_whitelist(sdf)
sdf = label_fraud(sdf)

# missing values become 0
sdf = sdf.fillna(0)

# number of rows per (email, fraud label), largest first
(sdf.groupby(['request.user_email', 'fraud'], as_index=False)['_id']
    .count()
    .sort_values('_id', ascending=False))

In [None]:
# request fields expanded into one indicator column per value
one_hot = [
    'request.category_action', 'request.category_action_label', 'request.category_label',
    'request.event_action', 'request.event_category', 'request.event_label',
    'request.fiat_currency', 'request.cryptocurrency', 'request.transaction_type',
    'request.user_city', 'request.user_country', 'request.user_province_state_territory',
]

# request fields converted with pd.to_numeric
to_numeric = ['request.created']

# request fields kept unchanged
as_is = [
    'request.cryptocurrency_amount', 'request.fiat_currency_value',
    'request.fiat_rate', 'request.user_email',
]

keep_request_columns = sorted(one_hot + to_numeric + as_is)

# drop `_id` plus every other column containing 'request.' that is not listed above
drop_columns = ['_id']
drop_columns += [col for col in sdf.columns if 'request.' in col and col not in keep_request_columns]

sdf = sdf.drop(drop_columns, axis=1)

sdf = pd.get_dummies(sdf, columns=one_hot)

for col in to_numeric:
    sdf[col] = pd.to_numeric(sdf[col])

# list the resulting columns alphabetically
for col in sorted(sdf.columns):
    print(col)

In [None]:
# KDE of `n_events` for rows labelled fraud (fraud == 1).
# sns.distplot is deprecated; with hist=False and rug=False it only drew the
# KDE curve, which sns.kdeplot produces directly.
sns.kdeplot(sdf[sdf.fraud == 1].n_events.fillna(0)).set(
    xlabel='n_events', ylabel='density', title='n_events distribution (fraud == 1)');

In [None]:
# KDE of `n_events` for rows not labelled fraud (fraud == 0).
# sns.distplot is deprecated; with hist=False and rug=False it only drew the
# KDE curve, which sns.kdeplot produces directly.
sns.kdeplot(sdf[sdf.fraud == 0].n_events.fillna(0)).set(
    xlabel='n_events', ylabel='density', title='n_events distribution (fraud == 0)');

In [None]:
def split(group_counts, n_splits, tolerance=0):
    """Randomly deal whole groups into `n_splits` buckets of bounded size.

    Each bucket may hold at most ceil(total_count / n_splits) examples. For
    every group the buckets are tried in a fresh random order and the group
    goes into the first bucket that accepts it.

    Parameters
    ----------
    group_counts : pandas.DataFrame
        Needs a 'group' column (group id) and a 'count' column (examples in
        that group). Rows are processed in the order given.
    n_splits : int
        Number of buckets to create.
    tolerance : float, default 0
        A group larger than the bucket limit is still accepted by an *empty*
        bucket when it exceeds the limit by at most this fraction.

    Returns
    -------
    dict
        {split_index: {'groups': [group ids], 'n_examples': total count}} for
        split_index in range(n_splits).

    Warns
    -----
    UserWarning
        When one or more groups fit into no bucket. Those groups are absent
        from the result, exactly as before; the warning only makes it visible.

    Notes
    -----
    Uses the global numpy random state, so results vary between calls unless
    the caller seeds numpy.
    """
    import warnings

    max_split_size = math.ceil(group_counts['count'].sum()/n_splits)

    # initialize the dict
    splits = {fold: {'groups': [], 'n_examples': 0} for fold in range(n_splits)}

    unplaced = []

    for group, count in zip(group_counts['group'].values, group_counts['count'].values):

        fold_order = np.random.choice(np.arange(n_splits), size=n_splits, replace=False)

        for fold in fold_order:

            bucket = splits[fold]

            # an empty bucket may take a group that is over the limit by at most `tolerance`
            oversized_ok = (count > max_split_size
                            and bucket['n_examples'] == 0
                            and (count-max_split_size)/max_split_size <= tolerance)

            # otherwise the group must fit into the room that is left
            fits = (max_split_size-bucket['n_examples']) >= count

            if oversized_ok or fits:
                bucket['groups'].append(group)
                bucket['n_examples'] += count
                break
        else:
            # no bucket accepted this group
            unplaced.append(group)

    if unplaced:
        warnings.warn(
            "{} group(s) did not fit into any split and were left out, e.g. {!r}".format(
                len(unplaced), unplaced[:5]))

    return splits


def balanced_group_n_fold(labels, groups, tolerance=0.1, max_splits=10):
    """Build group-wise cross-validation folds for a two-class problem.

    The two classes with the fewest rows are used; the smaller one is the
    minority class. Within each class, whole groups are dealt into folds by
    `split`, and fold k of both classes is merged into test fold k.

    Parameters
    ----------
    labels : array-like
        Class label of every row.
    groups : array-like
        Group id of every row, same length as `labels`.
    tolerance : float, default 0.1
        Passed on to `split`. It also decides the rounding of the fold count:
        a fractional part >= 1 - tolerance rounds up, otherwise down.
    max_splits : int, default 10
        Upper bound on the number of folds.

    Returns
    -------
    list of (train_indices, test_indices)
        Positional row indices. The test part holds the rows whose group was
        assigned to that fold; the train part holds every other row.

    Raises
    ------
    ValueError
        If `labels` contains fewer than two classes.

    Notes
    -----
    Rows whose group was not placed by `split`, and rows of any class beyond
    the two smallest, appear in no test fold. Fold assignment is random
    (see `split`).
    """
    # `display` is only defined when running under IPython
    try:
        show = display
    except NameError:
        show = print

    df = pd.DataFrame({'label': labels, 'group': groups})

    # count the rows of each class, smallest class first
    result = df.groupby('label')['group'].agg(['count']).sort_values(by='count', ascending=True).reset_index()

    if len(result) < 2:
        raise ValueError(
            "balanced_group_n_fold needs at least two classes, found {}".format(len(result)))

    show(result)

    # the class with the fewest rows is the minority class
    minority_group_class = result['label'].values[0]
    majority_group_class = result['label'].values[1]

    minority_df = df[df['label'] == minority_group_class]
    majority_df = df[df['label'] == majority_group_class]

    print("Minority Class: ", minority_group_class)

    # rows per group inside each class, largest group first
    minority_group_counts = (minority_df.groupby('group')['label'].count()
                             .reset_index(name='count')
                             .sort_values(by='count', ascending=False))
    majority_group_counts = (majority_df.groupby('group')['label'].count()
                             .reset_index(name='count')
                             .sort_values(by='count', ascending=False))

    # the largest minority group limits how many folds can each hold a similar share
    max_folds = minority_df.shape[0]/minority_group_counts['count'].values[0]
    max_folds = math.ceil(max_folds) if max_folds%1 >= (1-tolerance) else math.floor(max_folds)
    max_folds = min(max_folds, max_splits)
    print("Number of folds is: ",max_folds)

    minority_splits = split(minority_group_counts, max_folds, tolerance=tolerance)
    majority_splits = split(majority_group_counts, max_folds, tolerance=tolerance)

    # combine the minority and majority groups together
    combined = {}
    for key in minority_splits.keys():
        combined[key] = {'groups': minority_splits[key]['groups'] + majority_splits[key]['groups'],
                         'n_examples': minority_splits[key]['n_examples'] + majority_splits[key]['n_examples']}

    for key in combined.keys():
        print("Minority Class Fold:", key, ', groups:', len(minority_splits[key]['groups']), ', n_examples:', minority_splits[key]['n_examples'])
        print("Majority Class Fold:", key, ', groups:', len(majority_splits[key]['groups']), ', n_examples:', majority_splits[key]['n_examples'])
        print("Combine Fold:", key, ', groups:', len(combined[key]['groups']), ', n_examples:', combined[key]['n_examples'])

    # create the split indices
    indices = np.arange(len(labels))
    group_values = np.asarray(groups)

    splits = []
    for key in combined.keys():
        in_fold = np.isin(group_values, combined[key]['groups'])
        splits.append((indices[~in_fold], indices[in_fold]))

    return splits

    
# folds for the full frame: fraud label as the class, user email as the group
folds = balanced_group_n_fold(labels=sdf['fraud'], groups=sdf['request.user_email'])

In [None]:
def _score_partition(model, X_part, y_part, prefix):
    """Score an already fitted `model` on one partition; keys start with `prefix`."""
    # imported here so the function does not depend on a later cell importing sklearn
    from sklearn.metrics import average_precision_score, confusion_matrix

    preds = model.predict(X_part)
    probs = model.predict_proba(X_part)[:, 1]

    # labels=[0, 1] keeps the matrix 2x2 even when the partition holds one class only;
    # without it cm[1, 0] / cm[1, 1] raise IndexError on such a fold
    cm = confusion_matrix(y_part, preds, labels=[0, 1])

    return {
        prefix + '_fraud_percentage': np.sum(y_part)/len(y_part),
        prefix + '_accuracy': np.sum(preds == y_part)/len(y_part),
        prefix + '_average_precision': average_precision_score(y_part, probs),
        prefix + '_true_negatives': cm[0, 0],
        prefix + '_false_negatives': cm[1, 0],
        prefix + '_true_positives': cm[1, 1],
        prefix + '_false_positives': cm[0, 1],
    }


def crossValidationResults(df, labels, groups, classifier_list):
    """Cross-validate several classifiers on group-wise folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label and group columns. Every column other
        than those two is cast to float32, and +/-inf values are set to 0.
    labels : str
        Name of the 0/1 label column.
    groups : str
        Name of the group column, used only to build the folds.
    classifier_list : list of dict
        Each entry has a 'Label' (name reported in the output) and a 'Model'
        offering fit / predict / predict_proba. The same model object is
        re-fitted on every fold.

    Returns
    -------
    pandas.DataFrame
        One row per (classifier, fold) with train_* and test_* metrics:
        fraud share, accuracy, average precision, confusion-matrix counts,
        precision and recall. NaN values are replaced by 0 and the columns
        are sorted alphabetically.
    """
    X = df.drop([labels, groups], axis=1).astype('float32')
    X = X.replace([np.inf, -np.inf], 0).values

    y = df[labels].values
    group_values = df[groups].values

    cv_folds = balanced_group_n_fold(y, group_values)

    metric_names = ['fraud_percentage', 'accuracy', 'average_precision',
                    'true_negatives', 'false_negatives', 'true_positives', 'false_positives']
    result_columns = ['classifier'] + [part + '_' + name
                                       for part in ('train', 'test')
                                       for name in metric_names]

    records = []

    # for each classifier type
    for clf in classifier_list:

        model = clf['Model']

        for train_index, test_index in cv_folds:

            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            model.fit(X_train, y_train)

            record = {'classifier': clf['Label']}
            record.update(_score_partition(model, X_train, y_train, 'train'))
            record.update(_score_partition(model, X_test, y_test, 'test'))
            records.append(record)

    # put results together in a dataframe; explicit columns keep the schema
    # stable even when no fold was evaluated
    model_results_df = pd.DataFrame(records, columns=result_columns)

    # calculate precision and recall
    for part in ('test', 'train'):
        true_pos = model_results_df[part + '_true_positives']
        false_pos = model_results_df[part + '_false_positives']
        false_neg = model_results_df[part + '_false_negatives']
        model_results_df[part + '_precision'] = true_pos/(true_pos+false_pos)
        model_results_df[part + '_recall'] = true_pos/(true_pos+false_neg)

    # 0/0 precision or recall (no positive predictions / no positives) becomes 0
    model_results_df = model_results_df.fillna(0)

    return model_results_df[sorted(model_results_df.columns)]

In [None]:
import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#from xgboost import XGBClassifier

# models compared: default random forest, a shallower class-weighted forest, and a dummy baseline
# max(1, ...) keeps max_features valid: int(n_columns/100) is 0 for frames with
# fewer than 100 columns, which RandomForestClassifier rejects
classifiers = [
    {'Label': 'Random Forest', 'Model': RandomForestClassifier()},
    {'Label': 'Modified Random Forest', 'Model': RandomForestClassifier(n_estimators=100, max_depth=5, max_features=max(1, int(sdf.shape[1]/100)), class_weight='balanced')},
    {'Label': 'Dummy', 'Model': DummyClassifier()}
]

model_results = crossValidationResults(df=sdf, labels='fraud', groups='request.user_email', classifier_list=classifiers)

# mean / median / std of the held-out metrics per classifier
summary = model_results.groupby('classifier')[['test_precision','test_recall','test_average_precision']].agg(['mean','median','std']).reset_index()
# flatten the (metric, statistic) column pairs into names such as 'test_recall_mean';
# iterating the column index already yields those pairs, no .ravel() needed
summary.columns = [col[0] if col[1] == '' else '_'.join(col) for col in summary.columns]
summary.sort_values(by='test_average_precision_mean', ascending=False)

In [None]:
# list every column of the modelling frame, one per line
for col in sdf.columns:
    print(col)

In [None]:
# models compared: default random forest, a shallower class-weighted forest, and a dummy baseline
# max(1, ...) keeps max_features valid: int(n_columns/100) is 0 for frames with
# fewer than 100 columns, which RandomForestClassifier rejects
classifiers = [
    {'Label': 'Random Forest', 'Model': RandomForestClassifier()},
    {'Label': 'Modified Random Forest', 'Model': RandomForestClassifier(n_estimators=100, max_depth=5, max_features=max(1, int(sdf.shape[1]/100)), class_weight='balanced')},
    {'Label': 'Dummy', 'Model': DummyClassifier()}
]

# NOTE(review): this argument used to be the unfinished expression `sdf.`, a
# syntax error. The full frame is passed here; replace it with the intended
# subset of `sdf` if one was meant.
model_results = crossValidationResults(df=sdf, labels='fraud', groups='request.user_email', classifier_list=classifiers)

# mean / median / std of the held-out metrics per classifier
summary = model_results.groupby('classifier')[['test_precision','test_recall','test_average_precision']].agg(['mean','median','std']).reset_index()
# flatten the (metric, statistic) column pairs into names such as 'test_recall_mean';
# iterating the column index already yields those pairs, no .ravel() needed
summary.columns = [col[0] if col[1] == '' else '_'.join(col) for col in summary.columns]
summary.sort_values(by='test_average_precision_mean', ascending=False)

In [None]:
# keep only the rows whose 'request.transaction_type_interac' indicator is set
data = sdf[sdf['request.transaction_type_interac'] == True]

# models compared: default random forest, a shallower class-weighted forest, and a dummy baseline
# max(1, ...) keeps max_features valid: int(n_columns/100) is 0 for frames with
# fewer than 100 columns, which RandomForestClassifier rejects
classifiers = [
    {'Label': 'Random Forest', 'Model': RandomForestClassifier()},
    {'Label': 'Modified Random Forest', 'Model': RandomForestClassifier(n_estimators=100, max_depth=5, max_features=max(1, int(data.shape[1]/100)), class_weight='balanced')},
    {'Label': 'Dummy', 'Model': DummyClassifier()}
]

model_results = crossValidationResults(df=data, labels='fraud', groups='request.user_email', classifier_list=classifiers)

# mean / median / std of the held-out metrics per classifier
summary = model_results.groupby('classifier')[['test_precision','test_recall','test_average_precision']].agg(['mean','median','std']).reset_index()
# flatten the (metric, statistic) column pairs into names such as 'test_recall_mean';
# iterating the column index already yields those pairs, no .ravel() needed
summary.columns = [col[0] if col[1] == '' else '_'.join(col) for col in summary.columns]
summary.sort_values(by='test_average_precision_mean', ascending=False)

In [None]:
# keep only the rows whose 'request.transaction_type_credit card' indicator is set
data = sdf[sdf['request.transaction_type_credit card'] == True]

# models compared: default random forest, a shallower class-weighted forest, and a dummy baseline
# max(1, ...) keeps max_features valid: int(n_columns/100) is 0 for frames with
# fewer than 100 columns, which RandomForestClassifier rejects
classifiers = [
    {'Label': 'Random Forest', 'Model': RandomForestClassifier()},
    {'Label': 'Modified Random Forest', 'Model': RandomForestClassifier(n_estimators=100, max_depth=5, max_features=max(1, int(data.shape[1]/100)), class_weight='balanced')},
    {'Label': 'Dummy', 'Model': DummyClassifier()}
]

model_results = crossValidationResults(df=data, labels='fraud', groups='request.user_email', classifier_list=classifiers)

# mean / median / std of the held-out metrics per classifier
summary = model_results.groupby('classifier')[['test_precision','test_recall','test_average_precision']].agg(['mean','median','std']).reset_index()
# flatten the (metric, statistic) column pairs into names such as 'test_recall_mean';
# iterating the column index already yields those pairs, no .ravel() needed
summary.columns = [col[0] if col[1] == '' else '_'.join(col) for col in summary.columns]
summary.sort_values(by='test_average_precision_mean', ascending=False)