In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import time
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import math

from einsteinds import plots
from einsteinds import db as edb
from einsteinds import event_processing
from einsteinds import utils

import featuretools as ft
from featuretools.primitives import NUnique

# local alias for event_processing.clean_events
clean_events = event_processing.clean_events

# read the database credentials (JSON) from the local creds file
with open('../creds/local_creds.json') as creds_file:
    creds = json.load(creds_file)

# MongoDB client built from the connection string stored in the credentials
client = MongoClient(creds['connection_string'])

# project database wrapper, initialised with the same credentials
db = edb.Database(creds)

In [None]:
# fetch the deposit request sets for the 2018-01-01 .. 2018-03-01 window
# (how the boundaries are treated is decided by db.get_deposit_request_sets)
rsets = db.get_deposit_request_sets(
    start_date=datetime.datetime(2018, 1, 1),
    end_date=datetime.datetime(2018, 3, 1),
)

In [None]:
# pull the request document out of every request set and clean it
requests = [
    event_processing.clean_event(request_set['request'])
    for request_set in rsets
]

In [None]:
# flatten the request sets into one record per event, tagging each cleaned
# event with the '_id' of the (cleaned) request it belongs to
# NOTE(review): the request is re-cleaned once per event; if clean_event is
# pure this could be computed once per request set - confirm before changing
events = [
    {
        'request_id': event_processing.clean_event(request_set['request'])['_id'],
        'event': event_processing.clean_event(raw_event),
    }
    for request_set in rsets
    for raw_event in request_set['events']
]

In [None]:
# create a dataframe from the JSON events; nested keys become dotted column
# names (e.g. 'event._id')
edf = json_normalize(events)
edf['event._id'] = edf['event._id'].map(str) # convert the bson to string
edf['request_id'] = edf['request_id'].map(str) # convert the bson to string
edf['_id'] = edf['request_id']+'_'+edf['event._id'] # create a unique id for the event (request id + event id)

# candidate columns to one-hot encode; only those actually present are kept
# NOTE(review): every name except 'event.trade_instrument' lacks the 'event.'
# prefix that later cells use for the same fields (e.g.
# 'event.category_action_label'), so they are probably filtered out on the
# next line - confirm whether the prefixed names were intended
one_hot_columns = ['category_action_label', 'category_action', 'category_label', 'event_category', 'event_action', 'event_label', 'event.trade_instrument']
one_hot_columns = [col for col in one_hot_columns if col in edf.columns]

# snapshot of the id plus the original (un-encoded) columns
old_cols = edf[['_id']+one_hot_columns]

# get_dummies drops the columns it encodes, so the originals are put back
# afterwards. They are re-attached by index alignment instead of merging on
# '_id': the result is the same when '_id' is unique, but a merge would
# multiply rows if an '_id' ever appeared more than once.
edf = pd.concat([pd.get_dummies(edf, columns=one_hot_columns), old_cols[one_hot_columns]], axis=1)

for col in ['event.card_expiry_month', 'event.card_expiry_year', 'event.card_last_digits']: # convert these numerical columns to string
    if col in edf.columns:
        # prefix every value with its column name, e.g. '<column>:<value>'
        edf[col] = col+':'+edf[col].map(str)

In [None]:
# flatten the cleaned JSON requests into a dataframe
rdf = json_normalize(requests)
rdf['_id'] = rdf['_id'].map(str)  # bson id -> string
# first day of the month in which each request was created
rdf['month'] = rdf['created'].apply(
    lambda created: datetime.datetime(created.year, created.month, 1)
)

In [None]:
# list every column of the events dataframe, one name per line
for col in edf.columns:
    print(col)

In [None]:
def _sorted_unique_frame(source, column, name):
    """Return a one-column dataframe called `name` holding the sorted,
    non-null distinct values of `source[column]`."""
    return pd.DataFrame({name: sorted(source[column].dropna().unique())})


# distinct user e-mails and distinct request months, taken from the requests
udf = _sorted_unique_frame(rdf, 'user_email', 'user_email')
mdf = _sorted_unique_frame(rdf, 'month', 'month')

# distinct cryptocurrencies seen in the events - this could be extended later
# with more information about each currency
cdf = _sorted_unique_frame(edf, 'event.cryptocurrency', 'cryptocurrency')

# one lookup dataframe per event-type column: the distinct values of each
# category / action / label field found in the events
cal_df = _sorted_unique_frame(edf, 'event.category_action_label', 'category_action_label')
ca_df = _sorted_unique_frame(edf, 'event.category_action', 'category_action')
cl_df = _sorted_unique_frame(edf, 'event.category_label', 'category_label')
ec_df = _sorted_unique_frame(edf, 'event.event_category', 'event_category')
ea_df = _sorted_unique_frame(edf, 'event.event_action', 'event_action')
el_df = _sorted_unique_frame(edf, 'event.event_label', 'event_label')

# preview the first rows of every lookup dataframe
for df in (udf, mdf, cdf, cal_df, ca_df, cl_df, ec_df, ea_df, el_df):
    display(df.head())

In [None]:
# entity id -> (dataframe, index column[, time column]) as passed to ft.EntitySet
entities = dict(
    users=(udf, 'user_email'),
    months=(mdf, 'month'),
    requests=(rdf, "_id", "created"),
    events=(edf, "_id", "event.created"),
    cryptocurrencies=(cdf, "cryptocurrency"),
    category_action_labels=(cal_df, 'category_action_label'),
    category_actions=(ca_df, 'category_action'),
    category_labels=(cl_df, 'category_label'),
    event_categories=(ec_df, 'event_category'),
    event_actions=(ea_df, 'event_action'),
    event_labels=(el_df, 'event_label'),
)

# (parent entity, parent column, child entity, child column) tuples.
# Requests hang off users and months, events hang off requests, and every
# lookup entity is linked to the events column named 'event.<lookup column>'.
relationships = [
    ("users", "user_email", "requests", "user_email"),
    ("months", "month", "requests", "month"),
    ("requests", "_id", "events", "request_id"),
] + [
    (lookup_entity, lookup_column, "events", "event." + lookup_column)
    for lookup_entity, lookup_column in [
        ("cryptocurrencies", "cryptocurrency"),
        ("category_action_labels", "category_action_label"),
        ("category_actions", "category_action"),
        ("category_labels", "category_label"),
        ("event_categories", "event_category"),
        ("event_actions", "event_action"),
        ("event_labels", "event_label"),
    ]
]

request_events_es = ft.EntitySet("request_events", entities, relationships)

# run deep feature synthesis with "users" as the target entity
feature_matrix, feature_defs = ft.dfs(
    entityset=request_events_es,
    target_entity="users",
    agg_primitives=[
        "count", "mean", "median", "max", "min",
        'std', 'skew', "avg_time_between", "sum", NUnique,
    ],
    trans_primitives=["hour", "time_since_previous"],
    max_depth=3,
)

In [None]:
# NOTE: this cell assigns entities, relationships, request_events_es,
# feature_matrix and feature_defs; any values previously bound to those
# names are replaced.

# entity id -> (dataframe, index column[, time column]) as passed to ft.EntitySet
entities = dict(
    requests=(rdf, "_id", "created"),
    events=(edf, "_id", "event.created"),
    cryptocurrencies=(cdf, "cryptocurrency"),
    category_action_labels=(cal_df, 'category_action_label'),
    category_actions=(ca_df, 'category_action'),
    category_labels=(cl_df, 'category_label'),
    event_categories=(ec_df, 'event_category'),
    event_actions=(ea_df, 'event_action'),
    event_labels=(el_df, 'event_label'),
)

# (parent entity, parent column, child entity, child column) tuples.
# Events hang off requests, and every lookup entity is linked to the events
# column named 'event.<lookup column>'.
relationships = [
    ("requests", "_id", "events", "request_id"),
] + [
    (lookup_entity, lookup_column, "events", "event." + lookup_column)
    for lookup_entity, lookup_column in [
        ("cryptocurrencies", "cryptocurrency"),
        ("category_action_labels", "category_action_label"),
        ("category_actions", "category_action"),
        ("category_labels", "category_label"),
        ("event_categories", "event_category"),
        ("event_actions", "event_action"),
        ("event_labels", "event_label"),
    ]
]

request_events_es = ft.EntitySet("request_events", entities, relationships)

# run deep feature synthesis with "requests" as the target entity
feature_matrix, feature_defs = ft.dfs(
    entityset=request_events_es,
    target_entity="requests",
    agg_primitives=[
        "count", "mean", "median", "max", "min",
        'std', 'skew', "avg_time_between", "sum", NUnique,
    ],
    trans_primitives=["hour", "time_since_previous"],
    max_depth=3,
)

In [None]:
# add the fraud label to the requests via db.add_fraud_label, passing
# 'user_email' as the column argument
requests_with_fraud = db.add_fraud_label(rdf, 'user_email')

# calendar day (time of day dropped) on which each request was created
requests_with_fraud['day'] = requests_with_fraud['created'].apply(
    lambda created: datetime.datetime(created.year, created.month, created.day)
)

In [None]:
# NOTE(review): `TSNE`, `tsne_data` and `results` are not imported or defined
# in any visible cell of this notebook, so this cell only runs if they come
# from elsewhere - confirm and add the missing import / preparation cell.
# NOTE(review): no random seed is passed to TSNE, so the layout may differ
# between runs.
tsne = TSNE()

# project the feature data; the first two output columns are used as x / y
tsne_results = tsne.fit_transform(tsne_data)

# Take an explicit copy before adding columns: assigning into a column slice
# of `results` raises SettingWithCopyWarning and may not write where intended.
projection = results[['fraud','user_email']].copy()
projection['x'] = tsne_results[:,0]
projection['y'] = tsne_results[:,1]

# one scatter trace per fraud flag value, hover text = user e-mail
data = [
    go.Scatter(x=subset.x,
               y=subset.y,
               name=trace_name,
               mode='markers',
               text=subset.user_email)
    for trace_name, subset in (
        ("Not Fraud", projection[projection.fraud == False]),
        ("Fraud", projection[projection.fraud == True]),
    )
]

iplot(data)