# Load and Extract Data

## Initialize

In [None]:
import datetime
import json

# for numerical work
import numpy as np
import pandas as pd
import pymongo
from pandas.io.json import json_normalize
from pymongo import MongoClient

In [None]:
# Use the full width of the browser window for the notebook.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

## Load database from Mongo

In [None]:
# Load the database credentials from file.

# SET APPROPRIATE PATH FOR THE 'creds.json' FILE.

# JSON text is UTF-8, so name the encoding explicitly instead of relying on
# the platform's locale default.
with open('../../../data-science-poc/user_aggregation_pipeline/creds.json', encoding='utf-8') as json_data:
    creds = json.load(json_data)

In [None]:
# Open the MongoDB connection described by the credentials file.
client = MongoClient(host=creds['connection_string'])

In [None]:
# NOTE: Loading takes about 12 minutes.
# If it seems to be taking too long, make sure you're not on the guest Wi-Fi.
# Resetting the Wi-Fi on the computer seems to fix the problem, too.

In [None]:
# Load using list comprehension.

# request_event_sets = [{'request': rs['request'],'event': event} for rs in client['ml']['requestEvents60'].find() for event in rs['events']]

In [None]:
# Alternative loader: explicit loops with progress reporting.
# Each event of each request document becomes one {'request', 'event'} record.

request_event_sets = []
i = 0  # Running record count. There are about 650,000 entries at present.

for rs in client['ml']['requestEvents60'].find():
    for event in rs['events']:
        request_event_sets.append(dict(request=rs['request'], event=event))
        i += 1
        if not i % 50000:
            print(i)  # Progress marker every 50,000 records.
# To load only the first N entries instead, stop iterating once i reaches N.

In [None]:
# Flatten the nested request/event records into a single data frame.

df = json_normalize(data=request_event_sets)

## Append fraud variable to df and extract interac/buy (credit card) records.

In [None]:
def flagFraudsters(df):
    """Add a binary 'fraud' column flagging requests made with a blacklisted email.

    The flag is 1 when 'request.metadata.email' is on the blacklist and 0
    otherwise. The blacklist is every 'BLOCKED' entry of the production
    'emailBlacklistCollection' (read through the module-level Mongo `client`)
    plus a few hard-coded addresses.

    The data frame is modified in place and is also returned.

    NOTE(review): an earlier version first computed the flag from
    'event.metadata.email' and then immediately overwrote it with the
    request-level flag, so the event email never influenced the result. That
    dead assignment is removed here; the resulting values are unchanged.
    Confirm that a request-level label is what is intended.
    """
    # Extract the blacklisted emails.
    # NOTE: This includes credit card emails that have been blocked. Not just interac.
    blocked = json_normalize(list(client['production']['emailBlacklistCollection'].find({'level': 'BLOCKED'})))

    # An empty collection normalizes to a frame without an 'email' column;
    # fall back to an empty list instead of failing on attribute access.
    blemails = list(blocked.get('email', [])) + ['gaelkevin@hotmail.com', 'royer.8383@gmail.com','adventurous7381@gmail.com']

    df['fraud'] = df['request.metadata.email'].isin(blemails).astype(int)

    return df

def removeWhitelistRecords(df):
    """Return `df` without test, internal and whitelisted records.

    A row is removed when its event email or its request email
      * contains 'test', 'fingerfoodstudios' or 'einstein.exchange', or
      * is an 'ALLOWED' entry of the production 'emailWhitelistCollection'
        (read through the module-level Mongo `client`), or
      * is missing (NaN); the original comparison-based filter dropped these
        rows implicitly, and that behaviour is kept explicitly here.

    The markers are matched as literal substrings. Previously they were
    interpreted as regular expressions, so the '.' in 'einstein.exchange'
    matched any character.
    """
    # Intended for testing for the most part. Generally fingerfood accounts.
    allowed = json_normalize(list(client['production']['emailWhitelistCollection'].find({'level': 'ALLOWED'})))

    # An empty collection normalizes to a frame without an 'email' column.
    wlemails = list(allowed.get('email', []))

    internal_markers = ('test', 'fingerfoodstudios', 'einstein.exchange')

    for column in ('event.metadata.email', 'request.metadata.email'):
        emails = df[column]
        drop = emails.isin(wlemails)
        for marker in internal_markers:
            # regex=False: literal match. na=True: rows with a missing email are dropped.
            drop |= emails.str.contains(marker, regex=False, na=True)
        df = df[~drop]

    return df

# Flag the fraudulent records first, then remove the whitelist and test accounts.
df = flagFraudsters(df)
df = removeWhitelistRecords(df)

## Extract either INTERAC or CREDIT CARD events.

In [None]:
request_category = 'buy' # Set to 'buy' (credit card) or 'interac' for Interac.

# Fail fast on an unsupported value. Previously a typo here skipped both
# branches and left `events_df` undefined (or stale from an earlier run).
if request_category not in ('interac', 'buy'):
    raise ValueError("request_category must be 'interac' or 'buy', got %r" % (request_category,))

# Both categories are selected the same way, so a single filter replaces the
# two duplicated branches.
events_df = df[
    (df['request.eventCategory'] == request_category) &
    (df['request.eventAction'] == 'request') # This is redundant right now. Here for completeness.
].copy()

# Cleaning and Transform Pipelines

## Concatenate Lists

In [None]:
# Some columns are lists of strings.
# For now just concatenate these into a single string and use as categorical variables.
# This seemed a reasonable work around until the columns can be looked at further.

# NOTE: The column cleaning pipeline assumes this operation has taken place in some of the column transformers.
# The concatenated columns are returned in 'list_columns' if desired.

def concatenate_lists_to_string( df, *, return_list_columns=False ):
    """Turn list-valued columns into single-string columns.

    A column is treated as list-valued when `nunique()` raises TypeError on it
    (list entries are unhashable). Each entry of such a column is replaced by
    its `str()` form; entries whose string form is 'nan' become NaN again, so
    missing values stay missing.

    The data frame is modified in place.

    Parameters:
        df: data frame to convert.
        return_list_columns: when True, also return the names of the columns
            that were converted. Defaults to False so existing callers keep
            receiving just the data frame.

    Returns:
        `df`, or the tuple `(df, list_columns)` when `return_list_columns`
        is True.
    """
    list_columns = []

    for c in list( df.columns ):
        try:
            df[c].nunique() # This fails on list entries.
        except TypeError:   # So on fail convert the list of strings to a single string.
            list_columns.append(c)
            # NaN stringifies to 'nan'; map that back to a real missing value.
            df[c] = df[c].apply(str).replace(['nan'], np.nan)

    if return_list_columns:
        return df, list_columns
    return df

In [None]:
# Convert the list-valued columns of the selected events to single strings.
events_df = concatenate_lists_to_string(df=events_df)

## Cleaning Pipe

In [None]:
from define_cleaning_map import map_clean

In [None]:
# Fit the cleaning map imported from define_cleaning_map on the selected
# events and apply it in one step.
clean_events_df = map_clean.fit_transform(events_df)

In [None]:
# The cleaning step accidentally converts the 'event/request.created' columns to ns.
# Rebuild both from the uncleaned events so they are proper datetimes again.
# For some reason this is not needed after the float scaling that follows;
# the datetime columns pass through that step correctly.

for created_col in ('event.created', 'request.created'):
    clean_events_df[created_col] = pd.to_datetime(events_df[created_col])

In [None]:
# 'event._id' is not needed from here on.
# TODO: update the cleaning file types .py file to do this automatically.

clean_events_df.drop(columns=['event._id'], inplace=True)

## Extract Null/Non-Null Columns

In [None]:
# Collect the columns that hold no data at all for the selected category.
# The filter uses `request_category` instead of a hard-coded 'buy': with
# 'interac' selected, the hard-coded filter produced an empty frame, on which
# every column counts as all-null.
tmp_df = clean_events_df[clean_events_df['request.eventCategory'] == request_category]

clean_null_cols = [c for c in tmp_df.columns if tmp_df[c].isnull().all()]

len(clean_null_cols)

In [None]:
# Column index of the cleaned events, taken before any columns are removed.
all_cols = clean_events_df.columns

In [None]:
# Keep the columns that are not all-null, in their original order.
# A plain set difference returns them in arbitrary order, which can change
# from run to run and makes positional feature indices irreproducible.
_null_col_lookup = set(clean_null_cols)
clean_nonnull_cols = [c for c in all_cols if c not in _null_col_lookup]
len( clean_nonnull_cols )

In [None]:
# Restrict the cleaned events to the columns that carry data.
clean_events_df = clean_events_df.loc[:, clean_nonnull_cols]

## Do Scaling Transformation

In [None]:
from define_scaling_map import map_scale

In [None]:
# Fit the scaling map imported from define_scaling_map on the cleaned events
# and apply it in one step.
scale_events_df = map_scale.fit_transform(clean_events_df)

# Run Feature Tools.

### Force data types for Feature Tools to recognize automatically.

In [None]:
from define_column_variable_types import (
    categorical_cols,
    datetime_cols,
    float_cols,
    id_cols,
    target_col,
    request_cols
)

In [None]:
# Cast the categorical columns that are still present to the 'category' dtype.
present_categorical_cols = [c for c in categorical_cols if c in scale_events_df.columns]
for c in present_categorical_cols:
    scale_events_df[c] = scale_events_df[c].astype('category')

In [None]:
# Cast the float columns that are still present to the 'float' dtype.
present_float_cols = [c for c in float_cols if c in scale_events_df.columns]
for c in present_float_cols:
    scale_events_df[c] = scale_events_df[c].astype('float')

In [None]:
# Identifier columns that are still present are cast to 'category' as well.
present_id_cols = [c for c in id_cols if c in scale_events_df.columns]
for c in present_id_cols:
    scale_events_df[c] = scale_events_df[c].astype('category')

In [None]:
# Make sure the 'fraud' column is numeric.
scale_events_df['fraud'] = pd.to_numeric(arg=scale_events_df['fraud'])

In [None]:
# Request-level columns to carry over to the 'requests' entity.
# 'request._id' is left out on purpose: it will be inherited as the index when
# the 'requests' es is normalized from the 'events' es.
request_keep_cols = [
    col for col in request_cols
    if col != 'request._id' and col in scale_events_df.columns
]

# The target column always travels with the request.
request_keep_cols.append('fraud')

### Now run Feature Tools.

In [None]:
import featuretools as ft

In [None]:
# Create the (still empty) entity set that will hold events and requests.
es = ft.EntitySet(id='user_events')

In [None]:
# Register the scaled events as the 'events' entity.
# NOTE: Because of the hour back-window used, 'event._id' is no longer unique
# (an event can be assigned to multiple requests), so a new 'Event_ID' index
# is created instead.
# `variable_types` is not passed; variable types are taken from the data frame.
es.entity_from_dataframe(
    entity_id='events',
    dataframe=scale_events_df,
    index='Event_ID',
    make_index=True,
    time_index='event.created',
)

In [None]:
# Split a 'requests' entity out of 'events', keyed by 'request._id'.
# The carried-over columns keep 'fraud' as the target variable and the
# '...email' column for grouping in CV.
es.normalize_entity(
    base_entity_id='events',
    new_entity_id='requests',
    index='request._id',
    additional_variables=request_keep_cols,
    time_index_reduce='last',
)

In [None]:
# Run deep feature synthesis with 'requests' as the target entity.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity="requests",
    agg_primitives=[
        'sum', 'std', 'max', 'skew', 'min', 'mean', 'count', 'percent_true',
        'num_unique', 'mode', 'time_since_last', 'avg_time_between',
    ],
    # trans_primitives is left at its default.
    # The request email is used for grouping only; don't aggregate it into a feature.
    ignore_variables={'requests': ['request.metadata.email']},
    features_only=False,
    # In this setup of es.events -> es.requests we go no more than three steps in the recursion anyway.
    max_depth=3,
    max_features=-1,
    verbose=2,
)

In [None]:
# Display the feature matrix produced by ft.dfs for inspection.
feature_matrix

In [None]:
# Encode the categorical features, leaving out the email column used for grouping.
# The exclusion compares whole column labels. The previous
# `set(...).difference('request.metadata.email')` iterated over the characters
# of the string, so the email column was never actually excluded.
# Building a list also keeps the column order stable.
columns_to_encode = [c for c in feature_matrix.columns if c != 'request.metadata.email']

feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, features,
                                                     top_n=10,
                                                     include_unknown=True,
                                                     verbose=True,
                                                     to_encode=columns_to_encode
                                                     )

In [None]:
# Save features and feature matrix if required.

# ft.save_features( features_enc, "feature_enc-definitions")

# feature_matrix_enc.to_pickle('feature_matrix_enc.pkl')

# !ls -al *.pkl

In [None]:
# To load use this.

# feature_matrix_enc = pd.read_pickle('feature_matrix_enc.pkl')

# Train Classifiers on Fraud Requests.

## Initialize

### Import

In [None]:
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
import sklearn

from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

%matplotlib inline
from matplotlib import pyplot

### Set up model.

In [None]:
# Replace the NaN's with zeros for now.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# selected column: that is chained assignment and is not guaranteed to update
# the parent data frame (it does not under pandas copy-on-write).

avg_time_col = 'AVG_TIME_BETWEEN(events.event.created)'
feature_matrix_enc[avg_time_col] = feature_matrix_enc[avg_time_col].fillna(0)

In [None]:
# Feature matrix: every encoded column except the 'fraud' label, as an array.
X = feature_matrix_enc.drop(columns=['fraud']).values

In [None]:
# Labels: the 'fraud' column as integers.
y = feature_matrix_enc['fraud'].astype(dtype=int)

In [None]:
# Group label per request: the request email, read from the 'requests' entity.
# NOTE(review): the rows are taken in the entity's own order, while X and y
# come from feature_matrix_enc. Presumably both follow the same 'request._id'
# order; confirm, otherwise the groups will not line up with X and y.
groups = es['requests'].df['request.metadata.email']

In [None]:
# Split the data into folds by group so there's no cross contamination between
# train and test. The groups handed to split() further down are the request emails.
# This models the production set up where fraud is detected on a request level basis.
# NOTE(review): this comment used to say "stratified by request"; GroupKFold
# groups rather than stratifies. Confirm email is the intended grouping key.

n_splits = 5

group_kfold = GroupKFold(n_splits=n_splits)


In [None]:
# Alternative model, kept for reference: L1-penalised logistic regression.
#mdl = LogisticRegression(class_weight='balanced', penalty='l1', tol = 1e-4, C=1e-3, solver='saga')

# Model in use: random forest with class_weight='balanced'; no other
# parameters are set here.
mdl = RandomForestClassifier(class_weight='balanced')


## Run Regression Over the CV Splits

In [None]:
# Fit and score the model on every group-wise CV split, plotting one
# precision-recall curve per fold and reporting the mean average precision.
# After the loop `mdl` holds the model fitted on the last fold.

average_precisions = []
i = 0  # Fold counter (1-based inside the loop).

# NOTE: The shuffle only reorders the rows. GroupKFold builds its folds from
# the group labels rather than from row order, which would explain why the
# shuffle appears to have no effect on the resulting splits.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=10)

# The split indices are positional. `y` is a pandas Series, where
# `series[int_array]` is a label lookup (or an ambiguous positional fallback),
# so convert to a plain array before indexing with them.
y_shuffled = np.asarray(y_shuffled)

for i, (train_index, test_index) in enumerate(
        group_kfold.split(X_shuffled, y_shuffled, groups_shuffled), start=1):

    print("Fold:", i)

    X_train = X_shuffled[train_index,:]
    X_test = X_shuffled[test_index,:]
    y_train = y_shuffled[train_index]
    y_test = y_shuffled[test_index]

    mdl.fit(X_train, y_train)

    # Hard class predictions are only needed by the optional diagnostics below.
    preds_train = mdl.predict(X_train)
    preds_test = mdl.predict(X_test)
    # Probability of the positive (fraud) class.
    probs_train = mdl.predict_proba(X_train)[:,1]
    probs_test = mdl.predict_proba(X_test)[:,1]

#     training_accuracy = np.sum(preds_train == y_train)/len(y_train)
#     testing_accuracy = np.sum(preds_test == y_test)/len(y_test)
#     print("Training Accuracy:", training_accuracy)
#     print("Testing Accuracy:", testing_accuracy)

    training_avp = average_precision_score(y_train, probs_train, average='weighted')
    testing_avp = average_precision_score(y_test, probs_test, average='weighted')
    print("Training Average Precision:", training_avp)
    print("Testing Average Precision:", testing_avp)

#     CM_train = sklearn.metrics.confusion_matrix(y_train, preds_train, sample_weight=None)
#     print(CM_train)
#     CM_test = sklearn.metrics.confusion_matrix(y_test, preds_test, sample_weight=None)
#     print(CM_test)

    # The fold score is the test average precision computed above; reuse it
    # instead of computing the identical value a second time.
    average_precision = testing_avp
    average_precisions.append(average_precision)

    precision, recall, _ = precision_recall_curve(y_test, probs_test)

    pyplot.step(recall, precision, color='b', alpha=0.2,
             where='post')
    pyplot.fill_between(recall, precision, step='post', alpha=0.2,
                     color='b')
    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    pyplot.ylim([0.0, 1.05])
    pyplot.xlim([0.0, 1.0])
    pyplot.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
        average_precision))
    pyplot.show()

print('==============================================')
print( "Mean Average Precision Score: {0:.2f}".format(np.mean(average_precisions)) )
print('==============================================')

# Evaluate Features

## FOR RANDOM FOREST --> Report Random Forest Feature Importance

In [None]:
# Feature importances of the fitted forest, their spread across the
# individual trees, and the feature indices ordered from most to least important.
importances = mdl.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in mdl.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

In [None]:
# Print the feature ranking: rank, feature column index, importance.
print("Feature ranking:")

for f, feature_idx in enumerate(indices[:X.shape[1]]):
    print("%d. feature %d (%f)" % (f + 1, feature_idx, importances[feature_idx]))

In [None]:
# Plot the feature importances of the forest.
# Only one figure is created; the extra bare plt.figure() call left an empty
# figure behind.
plt.figure(figsize=(30,12))
plt.title("Feature importances")

# Show the n most important features with their spread across trees.
# n is capped at the number of features so the bar positions and heights
# always have the same length.
# (To plot every feature instead, set n = X.shape[1].)
n = min(20, len(indices))
top_indices = indices[:n]

plt.bar(range(n), importances[top_indices],
       color="r", yerr=std[top_indices], align="center")
plt.xticks(range(n), top_indices)
plt.xlim([-1, n])

plt.show()

In [None]:
# Column names of the feature matrix, in the same order as the columns of X.
feature_list = list( feature_matrix_enc.drop(['fraud'], axis=1).columns )

# Names of the n most important features. A Python list cannot be indexed
# with a NumPy index array (that raises TypeError), so look the names up one by one.
[feature_list[idx] for idx in indices[0:n]]

## FOR LASSO LOGISTIC REGRESSION --> Perform Feature Stability Selection

NOTE: This runs independently of the above one shot logistic regression code.
This is more computationally intensive.

In [None]:
from stability_selection import StabilitySelection, plot_stability_path

In [None]:
# Import StandardScaler here so this cell does not fail with a NameError;
# it is not part of the sklearn imports at the start of the training section.
from sklearn.preprocessing import StandardScaler

# Base estimator for stability selection: standardize, then fit a lasso
# (L1-penalised) logistic regression.
base_estimator = Pipeline([
        ('scaler', StandardScaler()),  # GENERALLY YOU STANDARDIZE COLUMNS FOR LINEAR REGRESSION. TRY WITHOUT?
        # The L1 penalty needs a solver that supports it. 'liblinear' does, and
        # naming it keeps this working where the default solver is L2-only.
        ('model', LogisticRegression(penalty='l1', solver='liblinear'))
    ])

In [None]:
# NOTE: As is, this does not respect the email groups or the imbalanced fraud/OK ratio (~8%).
# This is not cross-validation, however. It's multiple bootstrap samples of N/2 samples (nominally 100 times).
# The 8% is probably large enough that the N/2 samples will be close enough to the minority/majority class ratio.

# Timing on the credit card data:
#   10 lambda values take ~15-20 minutes; enough for a sketch of the stability
#   curves and for pegging the lambda min/max values.
#   50 lambda values give acceptable curves and take ~1 hour and a bit.

selector = StabilitySelection(
    base_estimator=base_estimator,
    lambda_name='model__C',
    lambda_grid=np.logspace(-7, -2, 50),
    n_jobs=4,  # NOTE: This _does_ take advantage of multi-core and it makes a difference. My Mac has 4 cores.
    verbose=2,
)

In [None]:
# Fit the stability selector on the full feature matrix and labels.
# This is the long-running step.
selector.fit(X, y)

In [None]:
# Stability score a variable must reach to count as selected.
thresh = .95

# Plot the stability paths, highlighting those that reach the threshold.
fig, ax = plot_stability_path(selector, threshold_highlight=thresh)
fig.show()

# Column indices of the selected variables, and every variable's best
# stability score over the lambda grid.
selected_variables = selector.get_support(indices=True, threshold=thresh)
selected_scores = selector.stability_scores_.max(axis=1)

print('Selected variables are:')
print('-----------------------')

for idx, variable in enumerate(selected_variables):
    print('Variable %d: [%d], score %.3f' % (idx + 1, variable, selected_scores[variable]))

In [None]:
# Display the column indices of the selected variables.
selected_variables

In [None]:
# Map each selected column index back to its feature name.
for i in selected_variables:
    print(i, ':   ', feature_list[i])