In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import time
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import math

import event_processing

# Local alias for the cleaning helper exported by event_processing.
# NOTE(review): a later cell rebinds `clean_events` to the parsed contents of
# ../data/clean_events.json, so this alias is only usable before that cell runs.
clean_events = event_processing.clean_events

# load the database credentials from file
# (the JSON document must provide a 'connection_string' entry, read below)
with open('../creds/creds.json') as json_data:
    creds = json.load(json_data)
    
# MongoDB client used by every later cell that queries the 'production' database.
client = MongoClient(creds['connection_string'])

In [None]:
def plot_horizontal_bar(label, value, df):
    """Draw an inline horizontal bar chart of ``df[value]`` against ``df[label]``.

    label -- column whose values name the bars (y axis)
    value -- column whose values set the bar lengths (x axis)
    df    -- DataFrame containing both columns

    The chart is titled '<value> by <label>' and shown with ``iplot``;
    nothing is returned.
    """
    bar_trace = go.Bar(
        x=df[value].values,
        y=df[label].values,
        orientation='h')

    chart_title = '{} by {}'.format(value, label)
    # automargin is passed through to plotly for the y axis (the label axis).
    chart_layout = go.Layout(title=chart_title, yaxis={'automargin': True})

    iplot(go.Figure([bar_trace], chart_layout))

In [None]:
# Emails whose blacklist entry is at level 'BLOCKED'.
# The level filter runs on the server and only the email field is fetched, so
# the whole collection is no longer pulled into the notebook; entries that have
# no 'level' field are skipped instead of raising KeyError.
blacklist = [
    item['email']
    for item in client['production']['emailBlacklistCollection'].find(
        {'level': 'BLOCKED'}, {'email': 1, '_id': 0})
]

In [None]:
# Maps each blacklisted email to the list of events fetched for it (filled by the next cell).
blacklist_events = {}

In [None]:
# Pull every stored event for each blacklisted email (one query per user).
event_collection = client['production']['eventCollection']
for user in blacklist:
    matching_events = event_collection.find({'metadata.email': user})
    blacklist_events[user] = list(matching_events)

In [None]:
# Replace each user's fetched event list with the output of clean_events.
# NOTE(review): re-running this cell feeds already-cleaned events back into
# clean_events — confirm the helper tolerates that.
for user in blacklist:
    blacklist_events[user] = clean_events(blacklist_events[user])

In [None]:
# Flatten the per-user event lists into a single list, in dictionary order.
events_list = [
    event
    for user_events in blacklist_events.values()
    for event in user_events
]

In [None]:
# Build one DataFrame row per cleaned blacklist event.
edf = json_normalize(events_list)

In [None]:
# List the available columns, one per line.
for col in edf.columns:
    print(col)

In [None]:
def horizontal_categorical_barplot(group, value, df, agg_func):
    """Aggregate ``value`` per ``group`` and plot the result as horizontal bars.

    group    -- column to group by (one bar per distinct value)
    value    -- column that is aggregated
    df       -- source DataFrame
    agg_func -- aggregation applied to ``value`` within each group

    Bars are ordered by the aggregated value, ascending, and drawn with
    plot_horizontal_bar.
    """
    aggregated = df.groupby(group, as_index=False).agg({value: agg_func})
    ordered = aggregated.sort_values(value, ascending=True).reset_index(drop=True)

    plot_horizontal_bar(group, value, ordered)
    

def expand_datetime(time_column, df):
    """Add calendar-part columns derived from a datetime column.

    For ``time_column`` named X this adds X_year, X_month, X_day, X_hour and
    X_weekday (the value of ``.weekday()``) to ``df``.

    The frame is modified in place and also returned.
    """
    stamps = df[time_column]

    for part in ('year', 'month', 'day', 'hour'):
        # bind `part` as a default so each lambda reads its own attribute
        df[time_column + '_' + part] = stamps.apply(
            lambda stamp, part=part: getattr(stamp, part))

    df[time_column + '_weekday'] = stamps.apply(lambda stamp: stamp.weekday())

    return df

In [None]:
# Number of distinct card_last_digits values seen per user_email.
horizontal_categorical_barplot(group='user_email', value='card_last_digits', df=edf, agg_func=pd.Series.nunique)

In [None]:
# Number of distinct billing_street values seen per user_email.
horizontal_categorical_barplot(group='user_email', value='billing_street', df=edf, agg_func=pd.Series.nunique)

In [None]:
# Add created_year/_month/_day/_hour/_weekday columns derived from 'created'.
edf = expand_datetime('created', edf)

In [None]:
# Show only the columns whose name contains 'created'.
edf.loc[:, ['created' in col for col in edf.columns]]

In [None]:
import featuretools

In [None]:
import os, json
import shelve
from bson import json_util
import time

# Files maintained by the incremental export in this cell.
# Raw events, written by append_events as one JSON array.
events_filepath  = '../data/events.json'
# The same events after event_processing.clean_event, also one JSON array.
clean_events_filepath = '../data/clean_events.json'
# Checkpoint holding the newest event id / creation time exported so far.
latest_event = '../data/latest_event.json'


def append_events(events, filepath):
    """Append ``events`` to the JSON array stored in ``filepath``.

    Each event is serialised with ``json_util.dumps``. When the file does not
    exist (or is empty) it is created as a new JSON array; otherwise the new
    items are spliced in just before the array's closing bracket, so the
    existing content never has to be re-read in full.

    Parameters:
        events: iterable of events accepted by ``json_util.dumps``; may be empty.
        filepath: path of the JSON array file to create or extend.

    Raises:
        ValueError: if an existing, non-empty file does not end with ``]``
            (ignoring trailing whitespace), i.e. it is not a JSON array.
    """
    # Materialise first: callers may pass a generator, and an empty batch must
    # be detectable before anything is written.
    serialized = [json_util.dumps(event) for event in events]
    events_string = ', '.join(serialized)

    if not os.path.isfile(filepath) or os.path.getsize(filepath) == 0:
        with open(filepath, mode="wb") as file:
            file.write("[{}]".format(events_string).encode("utf-8"))
        return

    if not serialized:
        # Splicing an empty batch would leave ",]" behind, which is invalid JSON.
        return

    # Binary mode so that seek positions are plain byte offsets.
    with open(filepath, mode="rb+") as file:
        file.seek(0, os.SEEK_END)
        tail_start = max(0, file.tell() - 64)
        file.seek(tail_start)
        # Ignore a trailing newline / whitespace when locating the bracket.
        tail = file.read().rstrip()

        if not tail.endswith(b"]"):
            raise ValueError(
                "{} does not end with ']' and cannot be appended to".format(filepath))

        # An array that holds no items yet must not receive a leading comma.
        array_is_empty = tail[:-1].rstrip().endswith(b"[")
        separator = "" if array_is_empty else ","

        # Overwrite from the closing bracket onwards, then drop leftover bytes.
        file.seek(tail_start + len(tail) - 1)
        file.write("{}{}]".format(separator, events_string).encode("utf-8"))
        file.truncate()

            
def update_latest_record(event, filepath):
    """Write the newest event id / creation time seen so far to ``filepath``.

    event    -- mapping with at least 'created' and '_id'
    filepath -- checkpoint file, stored as a json_util document with the keys
                'latest_event_time' and 'latest_event_id'

    If the checkpoint already exists, each stored value is combined with the
    event's value through none_max; otherwise the event's values are written
    as they are.
    """
    def _round_trip(value):
        # Pass the value through json_util so it has the same form as values
        # loaded back from the checkpoint file.
        return json_util.loads(json_util.dumps(value))

    record = _round_trip({
        'latest_event_time': event['created'],
        'latest_event_id': event['_id'],
    })

    if not os.path.isfile(filepath):
        with open(filepath, mode="w+") as handle:
            handle.write(json_util.dumps(record))
        return

    with open(filepath, mode="r+") as handle:
        stored = json_util.loads(handle.read())

        record = {
            'latest_event_time': none_max(_round_trip(event['created']), stored['latest_event_time']),
            'latest_event_id': none_max(event['_id'], stored['latest_event_id']),
        }

        # Rewrite from the start and cut off whatever remains of the old content.
        handle.seek(0)
        handle.write(json_util.dumps(record))
        handle.truncate()

            
def get_latest_event_info(filepath):
    """Read the export checkpoint stored at ``filepath``.

    Returns:
        ``(latest_event_id, latest_event_time)`` as stored in the checkpoint
        document, or ``(None, None)`` when the file does not exist.
    """
    if not os.path.isfile(filepath):
        return None, None

    # Read-only: nothing is written here, so write permission on the
    # checkpoint file must not be required.
    with open(filepath, mode="r") as file:
        previous_json = json_util.loads(file.read())

    return previous_json['latest_event_id'], previous_json['latest_event_time']
        

def none_max(a,b):
    """Return the larger of ``a`` and ``b``, treating ``None`` as "no value".

    Returns ``None`` only when both arguments are ``None``; when exactly one
    is ``None`` the other is returned unchanged.
    """
    # Identity checks: `== None` would call a custom __eq__ and can report a
    # real value as missing.
    if a is None:
        return b

    if b is None:
        return a

    return max(a,b)



# Wall-clock reference for the throughput figures printed by process_events.
start = time.time()

# Running total of exported events; process_events adds each batch size to it.
processed = 0
# Resume point read from the checkpoint file; both are None when it does not exist.
latest_event_id, latest_event_time = get_latest_event_info(latest_event)

                       
def process_events(events):
    """Persist one batch of events and advance the module-level export state.

    The raw batch is appended to events_filepath and its cleaned form to
    clean_events_filepath. The last event of the batch is then used to update
    the checkpoint file and the globals latest_event_id / latest_event_time.
    Finally the global counter ``processed`` grows by the batch size and a
    throughput line is printed.
    """
    global latest_event_id, latest_event_time, processed

    append_events(events, events_filepath)
    cleaned_batch = [event_processing.clean_event(event) for event in events]
    append_events(cleaned_batch, clean_events_filepath)

    # The last element of the batch is taken as the newest event.
    newest = events[-1]
    update_latest_record(newest, latest_event)

    latest_event_id = none_max(latest_event_id, newest['_id'])

    # Pass 'created' through json_util so it has the same form as the value
    # loaded from the checkpoint file.
    newest_created = json_util.loads(json_util.dumps(newest['created']))
    latest_event_time = none_max(latest_event_time, newest_created)

    processed += len(events)

    elapsed = time.time() - start
    rate = processed/elapsed

    print('{} events processed in {} seconds or {} events per second'.format(processed, elapsed, rate))

                       
# Export the event collection in batches until no newer events remain.
has_events = True
batch_size = 1000

while has_events:

    if latest_event_id is None:
        print("Starting from beginning of collection.")
        batch_filter = {}
    else:
        print("Starting from event id {}".format(latest_event_id))
        batch_filter = {'_id': {'$gt': latest_event_id}}

    # Sort by _id ascending: process_events treats events[-1] as the newest
    # event and the next batch resumes with `_id > latest_event_id`, so a batch
    # returned in any other order could skip or re-export events.
    events = list(
        client['production']['eventCollection']
        .find(batch_filter)
        .sort('_id', pymongo.ASCENDING)
        .limit(batch_size))

    if len(events) == 0:
        has_events = False
        print('Done')
    else:
        process_events(events)
    

In [None]:
# Load the cleaned-events file back into memory.
# Before parsing, '}{' is rewritten to '},{' and ',,' to ',' — presumably to
# repair separators left behind by the append-based export (verify).
# NOTE(review): the ',,' replacement also touches any string value that
# contains two consecutive commas.
with open('../data/clean_events.json') as json_data:
    clean_events = json_util.loads(
        json_data.read().replace('}{', '},{').replace(',,', ',')
    )

In [None]:
# Turn the parsed cleaned events into a DataFrame, rebinding the same name.
clean_events = pd.DataFrame(clean_events)

In [None]:
# Add created_year/_month/_day/_hour/_weekday columns derived from 'created'.
clean_events = expand_datetime('created', clean_events)

In [None]:
# List the available columns alphabetically, one per line.
for col in sorted(clean_events.columns):
    print(col)

In [None]:
# Midnight timestamp of each event's creation date, assembled from the
# created_year / created_month / created_day columns in one vectorised call
# instead of one Python-level datetime construction per row.
clean_events['day'] = pd.to_datetime({
    'year': clean_events['created_year'],
    'month': clean_events['created_month'],
    'day': clean_events['created_day'],
})

In [None]:
def agg_by_day(day_col, group, value, agg_func, df):
    """Aggregate ``value`` for every (group, day) combination.

    day_col  -- column holding the day each row belongs to
    group    -- column holding the category to split by
    value    -- column that is aggregated
    agg_func -- aggregation name or callable applied to ``value``
    df       -- source DataFrame

    Returns a DataFrame with the columns ``group``, ``day_col`` and ``value``,
    one row per combination present in ``df``.
    """
    # Passing a {name: func} dict to SeriesGroupBy.aggregate was deprecated and
    # then removed in pandas 1.0; the selected column already carries the name
    # `value`, so a plain agg gives the same column layout on every version.
    grouped = df.groupby([group, day_col])[value].agg(agg_func).reset_index()

    return grouped

# Daily '_id' counts per category_action_label, category_action and category_label.
cal = agg_by_day('day', 'category_action_label','_id', 'count', df= clean_events)
ca = agg_by_day('day', 'category_action','_id', 'count', df= clean_events)
cl = agg_by_day('day', 'category_label','_id', 'count', df= clean_events)

In [None]:
def plot_time_series(df, time, category, value):
    """Plot ``value`` over ``time`` with one scatter trace per category.

    df       -- source DataFrame
    time     -- column used for the x axis
    category -- column whose distinct values each become a named trace
    value    -- column used for the y axis

    Traces are created in sorted category order and passed to ``plot``.
    """
    traces = []

    for cat in sorted(df[category].unique()):
        rows = df[df[category] == cat]
        traces.append(go.Scatter(x=rows[time], y=rows[value], name=cat))

    plot(traces)
    
# Daily event counts, one trace per category_action_label.
plot_time_series(df=cal, time='day', category='category_action_label', value='_id')

In [None]:
# NOTE(review): no earlier cell assigns a notebook-level `data` (the list built
# inside plot_time_series is local), so on a fresh kernel this raises NameError;
# it only works after a later cell that defines `data` has been run.
plot(data)

In [None]:
# Trade events with an accepted/rejected result, without all-empty columns,
# plus the fiat value paid per unit of cryptocurrency.
trades = (
    clean_events[
        (clean_events.event_category == 'trade')
        & (clean_events.trade_result.isin(['accepted','rejected']))
    ]
    .dropna(axis=1, how='all')
    .assign(fiat_currency_rate=lambda frame: frame.fiat_currency_value / frame.cryptocurrency_amount)
)

In [None]:
# Narrow the trades frame to the columns needed for the rate analysis.
zec = trades.loc[:, [
    'created',
    'fiat_currency_rate',
    'fiat_currency_value',
    'cryptocurrency',
    'cryptocurrency_amount',
    'trade_latest_price',
    'category_action_label',
    'trade_result',
    'user_email',
]]

# Raises the row display limit for the rest of the session, then shows one user's trades.
pd.set_option('display.max_rows', 1000)
display(zec.loc[zec.user_email == 'mike@4am.ca'])

In [None]:
# One scatter trace per category_action_label: fiat rate over time, with the
# user email attached as hover text.
data = []
for category in sorted(zec.category_action_label.unique()):
    category_rows = zec[zec.category_action_label == category]
    data.append(go.Scatter(x=category_rows.created,
                           y=category_rows.fiat_currency_rate,
                           name=category,
                           text=category_rows.user_email))

iplot(data)

In [None]:
# Accepted market buy/sell orders, without all-empty columns, plus the fiat
# value paid per unit of cryptocurrency.
all_trades = (
    clean_events[
        (clean_events.event_category == 'trade')
        & (clean_events.trade_result.isin(['accepted']))
        & (clean_events.category_action_label.isin(['trade_place-market-order_sell','trade_place-market-order_buy']))
    ]
    .dropna(axis=1, how='all')
    .assign(fiat_currency_rate=lambda frame: frame.fiat_currency_value / frame.cryptocurrency_amount)
)

In [None]:
# List the columns that survived the all-empty filter, one per line.
for col in all_trades.columns:
    print(col)

In [None]:
# Per-user, per-cryptocurrency trade history in chronological order.
user_trades = (
    all_trades[[
        'created',
        'category_action_label',
        'user_email',
        'cryptocurrency_amount',
        'cryptocurrency',
        'trade_latest_price',
    ]]
    .sort_values(['user_email','cryptocurrency','created'])
    .reset_index(drop=True)
)

In [None]:
# For each trade, carry over the action, price and time of the preceding row
# within the same (user_email, cryptocurrency) group.
for previous_col, source_col in [('previous_action', 'category_action_label'),
                                 ('previous_price', 'trade_latest_price'),
                                 ('previous_time', 'created')]:
    user_trades[previous_col] = (
        user_trades.groupby(['user_email', 'cryptocurrency'])[source_col].shift(1))

In [None]:
# Back to global chronological order, with a fresh 0..n-1 index.
user_trades = user_trades.sort_values('created').reset_index(drop=True)

In [None]:
# Buys whose preceding trade (same user, same cryptocurrency) was a sell at a
# higher last-traded price.  .copy() makes the selection an independent frame,
# so the column assignments below no longer raise SettingWithCopyWarning.
sell_high_buy_low = user_trades[(user_trades.category_action_label == 'trade_place-market-order_buy') & 
            (user_trades.previous_action == 'trade_place-market-order_sell') &
            (user_trades.trade_latest_price < user_trades.previous_price)].copy()

# Relative price change from the sell to the buy (negative = bought back cheaper).
sell_high_buy_low['reduction'] = (sell_high_buy_low['trade_latest_price'] - sell_high_buy_low['previous_price'])/sell_high_buy_low['previous_price']

# Rebuild both timestamp columns from their raw numpy values so they can be subtracted.
sell_high_buy_low['created'] = pd.to_datetime([x.astype(datetime.datetime) for x in sell_high_buy_low.created.values])
sell_high_buy_low['previous_time'] = pd.to_datetime([x.astype(datetime.datetime) for x in sell_high_buy_low.previous_time.values])

# Seconds elapsed between the sell and the following buy.
sell_high_buy_low['time_difference'] = (sell_high_buy_low.created - sell_high_buy_low.previous_time).dt.total_seconds()

# Keep price drops of more than 10% that were bought back within one hour.
sell_high_buy_low = sell_high_buy_low[(sell_high_buy_low.reduction < -0.10) & (sell_high_buy_low.time_difference < 60*60)]

# Restrict the view to a single account.
sell_high_buy_low = sell_high_buy_low[sell_high_buy_low.user_email == 'mike@4am.ca']

sell_high_buy_low

In [None]:
# Trade events that carry a last-traded price, without all-empty columns.
trades = clean_events[
    (clean_events.event_category == 'trade')
    & clean_events.trade_latest_price.notnull()
].dropna(axis=1, how='all')

In [None]:
# Keep the plotting columns, label each row by its cryptocurrency ('type'),
# and order the rows chronologically.
trades = (
    trades[['created','category_action_label','cryptocurrency','trade_latest_price']]
    .assign(type=lambda frame: frame['cryptocurrency'])
    .sort_values('created')
)

In [None]:
# One price-over-time trace per cryptocurrency.
data = []
for tradetype in sorted(trades.type.unique()):
    type_rows = trades[trades.type == tradetype]
    data.append(go.Scatter(
        x=type_rows['created'],
        y=type_rows['trade_latest_price'],
        name=tradetype))

layout = go.Layout(
    title='Last Traded Price by Cryptocurrency',
    yaxis={'title': 'Last Traded Price'},
    xaxis={'title': 'Time'})

figure = go.Figure(data=data, layout=layout)
plot(figure)

In [None]:
# Fetch every raw event whose eventCategory is 'trade' straight from the database.
events = list(client['production']['eventCollection'].find({'eventCategory': 'trade'}))

In [None]:
# Flatten the raw trade events and keep only the fields used below.
raw_trades = json_normalize(events)

trade_fields = [
    '_id',
    'created',
    'metadata.email',
    'eventAction',
    'eventLabel',
    'metadata.instrument',
    'metadata.tradesResponse',
    'metadata.lastTradedPx',
]
raw_trades = raw_trades[trade_fields]

# Accepted trades only, in chronological order.
raw_trades = raw_trades[raw_trades['metadata.tradesResponse'] == "Accepted"].sort_values(by='created')

# Attach the preceding trade's price / action / label for the same instrument.
for field in ('metadata.lastTradedPx', 'eventAction', 'eventLabel'):
    raw_trades['previous_' + field] = raw_trades.groupby('metadata.instrument')[field].shift(1)

# Relative change in last-traded price against the preceding trade.
price_change = raw_trades['metadata.lastTradedPx'] - raw_trades['previous_metadata.lastTradedPx']
raw_trades['percentage_price_difference'] = (price_change/raw_trades['previous_metadata.lastTradedPx']).astype(float)

# Keep only moves larger than 50% in either direction.
raw_trades = raw_trades[np.abs(raw_trades.percentage_price_difference) > 0.5]

pd.set_option('display.float_format', lambda x: '%.5f' % x)

raw_trades.to_csv('large_price_fluctuations.csv', index=False)

In [None]:
# One trace per instrument showing the remaining large price moves over time.
data = []
for tradetype in sorted(raw_trades['metadata.instrument'].unique()):
    instrument_rows = raw_trades[raw_trades['metadata.instrument'] == tradetype]
    data.append(go.Scatter(
        x=instrument_rows['created'],
        y=instrument_rows['metadata.lastTradedPx'],
        name=tradetype))

layout = go.Layout(
    title='Last Traded Price by Cryptocurrency',
    yaxis={'title': 'Last Traded Price'},
    xaxis={'title': 'Time'})

figure = go.Figure(data=data, layout=layout)
plot(figure)