# Login Activity Stats

In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import bson
from bson import json_util

import re

# load the database credentials from file
# (the connection string is read from a JSON file next to the notebook tree
# rather than being written into the notebook itself)
with open('../creds/creds.json') as json_data:
    creds = json.load(json_data)

# initialize the client
# only the 'connection_string' entry of the credentials file is used here
client = MongoClient(creds['connection_string'])

In [None]:
# e-mail addresses stored in the production whitelist collection
whitelist_collection = client['production']['emailWhitelistCollection']
whitelist = [entry['email'] for entry in whitelist_collection.find()]

In [None]:
def is_test_email(email, known_test_emails=None,
                  markers=('test', 'fingerfood', 'einstein.exchange', 'alican')):
    """Return True when `email` should be treated as an internal/test account.

    An address counts as a test address when it is listed in
    `known_test_emails` or contains any of the `markers` substrings.

    Parameters
    ----------
    email : str
        Address to classify. Non-string values (e.g. None or NaN coming from
        a missing field) are classified as False instead of raising.
    known_test_emails : container of str, optional
        Addresses that are always test accounts. Defaults to the
        notebook-level `whitelist`.
    markers : iterable of str, optional
        Substrings that mark an address as a test account.

    Returns
    -------
    bool
    """
    if not isinstance(email, str):
        # substring checks below are only defined for strings
        return False

    if known_test_emails is None:
        # fall back to the whitelist defined at notebook level
        known_test_emails = whitelist

    return email in known_test_emails or any(marker in email for marker in markers)

In [None]:
def get_email(string):
    """Extract the first e-mail address written as ``email='...'`` in `string`.

    Parameters
    ----------
    string : str
        Text to search (here: the logged request representation).

    Returns
    -------
    str or None
        The text between the quotes of the first ``email='...'`` occurrence,
        or None when `string` is not a string (e.g. NaN for a missing field)
        or contains no such occurrence. Returning None keeps a column-wide
        ``.apply(get_email)`` from aborting on a single malformed row.
    """
    if not isinstance(string, str):
        return None

    # non-greedy group: stop at the first closing quote
    match = re.search("email='(.*?)'", string)
    return match.group(1) if match else None

In [None]:
# handle to the production event collection; the queries in later cells read from it
ec = client['production']['eventCollection']

In [None]:
# fetch every event with label 'login' in the 'authentication' category and
# materialize the cursor into a list of documents
login_events = list(ec.find({'eventLabel': 'login', 'eventCategory': 'authentication'}))

In [None]:
# flatten the event documents into a DataFrame; nested fields show up as
# dotted column names (e.g. 'metadata.request', used below)
ledf = json_normalize(login_events)

In [None]:
# split the event timestamp into the calendar parts used for grouping later on
ledf['day'] = ledf['created'].map(lambda stamp: stamp.day)
ledf['month'] = ledf['created'].map(lambda stamp: stamp.month)
ledf['year'] = ledf['created'].map(lambda stamp: stamp.year)
ledf['hour'] = ledf['created'].map(lambda stamp: stamp.hour)

In [None]:
# pull the address out of each logged request, then flag internal/test accounts
ledf['user_email'] = ledf['metadata.request'].map(get_email)
ledf['testing'] = ledf['user_email'].map(is_test_email)

In [None]:
# keep only the columns the login analysis below works with
logins = ledf[['eventAction', 'created','day','month','year','hour','user_email','testing']]

In [None]:
# distinct non-test users seen in each (year, month, day, hour) bucket
real_logins = logins[logins['testing'] == False]
(
    real_logins
    .groupby(['year', 'month', 'day', 'hour'])['user_email']
    .agg(lambda emails: len(emails.unique()))
    .reset_index()
)

## Number of login actions by email

In [None]:
# events per (action, e-mail), sorted descending by action and then by count
events_per_user = logins.groupby(['eventAction', 'user_email'])['created'].count().reset_index()
events_per_user = events_per_user.sort_values(['eventAction', 'created'], ascending=False)
events_per_user.rename(columns={'created': 'n_events'})

## Login events grouped by hour

In [None]:
# Number of login events per (year, month, day, hour, action).
grouped_by_hour = (
    logins
    .groupby(['year', 'month', 'day', 'hour', 'eventAction'])['user_email']
    .count()
    .reset_index()
    .rename(columns={'user_email': 'n_events'})
)

# Rebuild the hourly timestamp from its parts. pd.to_datetime assembles the
# year/month/day/hour columns in one vectorized call, which (unlike a row-wise
# apply) also works when the frame is empty.
# NOTE(review): the fixed 7-hour shift looks like a UTC -> UTC-7 conversion that
# ignores daylight saving time -- confirm the intended time zone.
grouped_by_hour['time'] = (
    pd.to_datetime(grouped_by_hour[['year', 'month', 'day', 'hour']])
    - datetime.timedelta(hours=7)
)

grouped_by_hour = grouped_by_hour[['time', 'eventAction', 'n_events']]
grouped_by_hour

## Plot of Login Activity by Hour

In [None]:
# one line trace per login action: x = hour bucket, y = number of events
data = []
for action in grouped_by_hour['eventAction'].unique():
    action_rows = grouped_by_hour[grouped_by_hour['eventAction'] == action]
    data.append(go.Scatter(
        x=action_rows['time'],
        y=action_rows['n_events'],
        name=action,
    ))

In [None]:
# render the per-action traces inline in the notebook
iplot(data)

## Reviewing Old Login Events

In [None]:
# events filed under the 'login' category (as opposed to the
# 'authentication' category queried earlier), flattened into a DataFrame
old_login_events = json_normalize(list(ec.find({'eventCategory': 'login'})))

In [None]:
# Columns of interest from the old login events. `.copy()` makes `subset` an
# independent frame, so adding columns below does not write into a slice of
# `old_login_events` (which triggers pandas' SettingWithCopyWarning).
subset = old_login_events[['created','eventCategory','eventAction','eventLabel','metadata.email','metadata.ip']].copy()

# Truncate each timestamp to the hour, then shift it by a fixed 7 hours.
# NOTE(review): the shift looks like a UTC -> UTC-7 conversion that ignores
# daylight saving time -- confirm the intended time zone.
subset['hour'] = subset['created'].apply(
    lambda t: datetime.datetime(year=t.year, month=t.month, day=t.day, hour=t.hour)
    - datetime.timedelta(hours=7)
)

# Single label of the form category_action_label.
subset['event_type'] = subset['eventCategory'] + "_" + subset['eventAction'] + "_" + subset['eventLabel']
subset = subset[['created','hour','event_type','metadata.email','metadata.ip']]

In [None]:
def _busiest_user(group):
    """Return (email, n_events) for the row of `group` with the largest n_events."""
    top_label = group['n_events'].idxmax()
    return (group['metadata.email'][top_label], group['n_events'][top_label])


# events per (event type, hour)
by_hour = (
    subset
    .groupby(['event_type', 'hour'])['created']
    .count()
    .reset_index()
    .rename(columns={'created': 'n_events'})
)

# events per (event type, user), restricted to events created on or after 2018-06-10
recent_subset = subset[subset['created'] >= datetime.datetime(2018, 6, 10)]
by_user = (
    recent_subset
    .groupby(['event_type', 'metadata.email'])['created']
    .count()
    .reset_index()
    .rename(columns={'created': 'n_events'})
    .sort_values('n_events', ascending=False)
)

# events per (event type, hour, user), largest counts first
by_user_hour = (
    subset
    .groupby(['event_type', 'hour', 'metadata.email'])['created']
    .count()
    .reset_index()
    .rename(columns={'created': 'n_events'})
    .sort_values('n_events', ascending=False)
)

# for every (event type, hour): the user with the most events, newest hour first
most_active_user_by_hour = (
    by_user_hour
    .groupby(['event_type', 'hour'])
    .apply(_busiest_user)
    .reset_index()
    .sort_values('hour', ascending=False)
)

In [None]:
# show the per-(event type, hour) counts
by_hour

In [None]:
# one line trace per event type: x = hour bucket, y = number of events
data = []
for action in by_hour['event_type'].unique():
    type_rows = by_hour[by_hour['event_type'] == action]
    data.append(go.Scatter(
        x=type_rows['hour'],
        y=type_rows['n_events'],
        name=action,
    ))

iplot(data)

In [None]:
# every event (any category) created on or after 2018-06-01, loaded into a list
june_events = list(ec.find({'created': {'$gte': datetime.datetime(2018,6,1)}}))

In [None]:
# flatten the event documents into a DataFrame
jedf = json_normalize(june_events)

In [None]:
# NOTE: plain assignment, not a copy -- `clean` and `jedf` are the same frame
# here, so every column written below also lands on `jedf`.
clean = jedf

# blank out the label on session events before it is folded into event_type
session_rows = clean['eventCategory'] == 'session'
clean.loc[session_rows, 'eventLabel'] = ''

# two granularities of event type: category_action_label and category_action
clean['event_type'] = clean['eventCategory'] + "_" + clean['eventAction'] + "_" + clean['eventLabel']
clean['event_type_higher'] = clean['eventCategory'] + "_" + clean['eventAction']

# timestamp truncated to the hour, then shifted back by 7 hours
clean['hour'] = clean['created'].apply(
    lambda stamp: datetime.datetime(year=stamp.year, month=stamp.month, day=stamp.day, hour=stamp.hour)
    - datetime.timedelta(hours=7)
)

In [None]:
# keep only the columns needed for the hourly counts below
clean = clean[['created','hour','event_type','event_type_higher','metadata.email']]

In [None]:
# events per hour at the coarse (category_action) level
event_type_by_hour = (
    clean
    .groupby(['hour', 'event_type_higher'])['created']
    .count()
    .reset_index()
    .rename(columns={'created': 'n_events'})
)

# events per hour at the fine (category_action_label) level
event_type_by_hour_low = (
    clean
    .groupby(['hour', 'event_type'])['created']
    .count()
    .reset_index()
    .rename(columns={'created': 'n_events'})
)

In [None]:
def z_score(series):
    """Standardize `series` to zero mean and unit standard deviation.

    Defined ahead of the plotting code that calls it so the notebook runs
    top to bottom on a fresh kernel.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to standardize.

    Returns
    -------
    Same type as `series`
        ``(series - mean) / std`` using numpy's default (population) standard
        deviation. When the standard deviation is 0 (constant or single-value
        input) the centered values -- all zeros -- are returned instead of
        dividing by zero, so such a series still shows up in a plot.
    """
    mean = np.mean(series)
    std = np.std(series)

    centered = series - mean
    if std == 0:
        return centered

    return centered / std


def text(series, name):
    """Build one hover label per value in `series`.

    Parameters
    ----------
    series : iterable
        Event counts, one per plotted point.
    name : str
        Event type shown in every label.

    Returns
    -------
    list of str
        Labels of the form ``Events: <n><br>Event Type: <name>``; ``<br>`` is
        the line break Plotly renders in hover text.
    """
    return ['Events: {}<br>Event Type: {}'.format(n, name) for n in series]


def _hourly_traces(counts, type_column):
    """Return one z-scored Scatter trace per distinct value of `type_column`, sorted by name."""
    traces = []
    for event_type in sorted(counts[type_column].unique()):
        rows = counts[counts[type_column] == event_type]
        traces.append(go.Scatter(
            x=rows['hour'],
            # z-scores put event types with very different volumes on one scale
            y=z_score(rows['n_events']),
            name=event_type,
            # raw counts stay available in the hover label
            text=text(rows['n_events'], event_type),
        ))
    return traces


# coarse (category_action) traces followed by fine (category_action_label) traces
data = _hourly_traces(event_type_by_hour, 'event_type_higher')
data_low = _hourly_traces(event_type_by_hour_low, 'event_type')

data = data + data_low

fig = go.Figure(data=data, layout=go.Layout(title='Event Counts per Hour'))

# writes a standalone HTML file next to the notebook
plot(fig, filename='event_types_by_hour.html')