In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import avro

# Read the connection settings from the local credentials file
with open('../creds/creds.json') as creds_file:
    creds = json.load(creds_file)

# Open the MongoDB client using the connection string stored in that file
mongo_uri = creds['connection_string']
client = MongoClient(mongo_uri)

In [None]:
def get_sessions_starts(client):
    """Collect session id, creation value and IP for every 'start' event.

    Queries ``client['production']['eventCollection']`` for documents whose
    ``eventAction`` is ``'start'``.

    Returns a list of dicts with keys ``sessionId``, ``created`` and ``ip``;
    ``ip`` is None when the event metadata carries no ``ip`` entry.
    """
    events = client['production']['eventCollection']

    starts = []
    for doc in events.find({'eventAction': 'start'}):
        meta = doc['metadata']
        starts.append({
            'sessionId': meta['sessionId'],
            'created': doc['created'],
            'ip': meta.get('ip'),
        })

    return starts

# Fetch all 'start' events once; this list is turned into the `sessions` DataFrame further down.
sessions_with_time = get_sessions_starts(client)

In [None]:
def get_sessions_with_user(client):
    """Return the (sessionId, email) pairs of events that carry a user email.

    Queries ``client['production']['eventCollection']`` for documents whose
    ``metadata.sessionId`` is not null and whose ``metadata.email`` is
    neither null nor the empty string.

    Returns a DataFrame with columns ``sessionId`` and ``email``, one row per
    matching event (duplicates are not removed here). The columns are present
    even when no event matches, so callers can always access ``.email``.
    """
    ec = client['production']['eventCollection']

    # A dict literal cannot hold two '$ne' keys: the second silently replaces
    # the first, which left only the empty-string check active. '$nin'
    # expresses "not null and not empty" in a single operator.
    query = {
        'metadata.sessionId': {'$ne': None},
        'metadata.email': {'$nin': [None, '']},
    }

    records = [
        {'sessionId': event['metadata']['sessionId'], 'email': event['metadata'].get('email')}
        for event in ec.find(query)
    ]

    # Pin the columns so an empty result still has the expected schema.
    return pd.DataFrame(records, columns=['sessionId', 'email'])

# Unique (sessionId, email) pairs, keeping only rows that actually have an email
user_sessions = (
    get_sessions_with_user(client)
    .loc[lambda frame: frame.email.notna()]
    .drop_duplicates()
)

In [None]:
def get_whitelist(client):
    """Return the ``email`` field of every document in the whitelist collection.

    Reads all documents from
    ``client['production']['emailWhitelistCollection']`` and returns their
    ``email`` values as a list, in the order the collection yields them.
    """
    whitelist_collection = client['production']['emailWhitelistCollection']

    emails = []
    for entry in whitelist_collection.find():
        emails.append(entry['email'])

    return emails

# Whitelisted emails; is_test_email() treats any address in this list as a test user
whitelist = get_whitelist(client)

# Sessions started from any of these addresses are flagged as test traffic (test_ip)
ip_filter = [
    '172.255.50.90',
    '97.107.179.133',
    '97.107.179.134',
    '207.216.30.192',
    '172.22.0.138',
    '172.22.0.185',
    '97.107.183.77',
    '174.119.233.32',
    '::1',
    'localhost',
    '127.0.0.1',
]

In [None]:
def is_test_email(email):
    """Tell whether ``email`` should be counted as a test account.

    An address is a test address when it appears in the module-level
    ``whitelist`` or contains any of the substrings 'test', 'fingerfood' or
    'einstein' (case-sensitive).
    """
    if email in whitelist:
        return True

    return any(marker in email for marker in ('test', 'fingerfood', 'einstein'))

In [None]:
# Build one row per session start, joined with the user email when one is
# known for that sessionId, and annotate each row with test-traffic flags.
sessions = pd.DataFrame(sessions_with_time)
# Day-level key (non-zero-padded "Y-M-D"); parsed back with pd.to_datetime in the summary cells.
sessions['date'] = sessions.created.apply(lambda date: "-".join([str(date.year),str(date.month),str(date.day)]))
sessions = sessions.set_index('sessionId').join(user_sessions.set_index('sessionId')).reset_index()

# Replace missing / placeholder values with explicit sentinels. The result is
# assigned back to the column: `sessions.ip.replace(..., inplace=True)` mutates
# a temporary under pandas copy-on-write and would leave `sessions` unchanged.
missing_markers = ['', None, 'None', np.nan]
sessions['ip'] = sessions['ip'].replace(to_replace=missing_markers, value='noip')
sessions['email'] = sessions['email'].replace(to_replace=missing_markers, value='noemail')

# A session is test traffic when it comes from a filtered IP or a test email.
sessions['test_ip'] = sessions['ip'].isin(ip_filter)
sessions['test_user'] = sessions['email'].apply(is_test_email)
sessions['test'] = sessions.test_ip | sessions.test_user
sessions

In [None]:
# Daily count of unique logged-in emails, with whitelisted and test-looking
# addresses removed. Sessions from filtered IPs are NOT excluded here.
# NOTE(review): the plot labels this series "Including Tests" although test
# emails are filtered out below - confirm which definition is intended.
email_sessions = sessions[sessions.email != 'noemail'][['date','email']].drop_duplicates()
email_sessions = email_sessions[~email_sessions.email.isin(whitelist)]
# One regex pass instead of three; na=True drops non-string values, matching
# what the previous `== False` comparisons did.
email_sessions = email_sessions[~email_sessions.email.str.contains('test|fingerfood|einstein', na=True)]
summary_emails_by_date = email_sessions.groupby('date')['email'].count().reset_index()
# Previously read `summary_by_date`, which is never defined (NameError on a fresh kernel).
summary_emails_by_date['date'] = pd.to_datetime(summary_emails_by_date.date)
summary_emails_by_date = summary_emails_by_date.sort_values('date')

In [None]:
# Daily count of unique logged-in emails, restricted to sessions that are not
# flagged as test traffic; the whitelist / substring filters are applied as well.
email_sessions = (
    sessions.loc[(sessions.email != 'noemail') & ~sessions.test, ['date', 'email']]
    .drop_duplicates()
    .loc[lambda frame: ~frame.email.isin(whitelist)]
    .loc[lambda frame: ~frame.email.str.contains('test', na=True)]
    .loc[lambda frame: ~frame.email.str.contains('fingerfood', na=True)]
    .loc[lambda frame: ~frame.email.str.contains('einstein', na=True)]
)

# One row per date, parsed to real timestamps and ordered chronologically
summary_emails_by_date_test = (
    email_sessions.groupby('date')['email']
    .count()
    .reset_index()
    .assign(date=lambda counts: pd.to_datetime(counts.date))
    .sort_values('date')
)

In [None]:
# Daily count of unique sessions without a user email, test traffic included
noemail_sessions = sessions.loc[sessions.email == 'noemail', ['date', 'sessionId']].drop_duplicates()
summary_noemails_by_date = (
    noemail_sessions.groupby('date')['sessionId']
    .count()
    .reset_index()
    .assign(date=lambda counts: pd.to_datetime(counts.date))
    .sort_values('date')
)

# Same count, restricted to sessions not flagged as test traffic
noemail_sessions = sessions.loc[
    (sessions.email == 'noemail') & ~sessions.test, ['date', 'sessionId']
].drop_duplicates()
summary_noemails_by_date_test = (
    noemail_sessions.groupby('date')['sessionId']
    .count()
    .reset_index()
    .assign(date=lambda counts: pd.to_datetime(counts.date))
    .sort_values('date')
)

In [None]:
# Put the two test-excluded daily series side by side, indexed by date, and export them.
# NOTE(review): this is a left join on the logged-in series, so dates that only
# have anonymous sessions are absent from the CSV - confirm that is intended.
combined = (
    summary_emails_by_date_test.set_index('date')
    .join(summary_noemails_by_date_test.set_index('date'))
    .rename(columns={'email': 'logged_in_users', 'sessionId': 'sessions_no_user'})
)
combined.to_csv('session_activity.csv')

In [None]:
# One line trace per daily series: logged-in users first, then anonymous sessions
users = go.Scatter(
    name='Logged In Users Including Tests',
    x=summary_emails_by_date.date,
    y=summary_emails_by_date.email,
)
userst = go.Scatter(
    name='Logged In Users Excluding Tests',
    x=summary_emails_by_date_test.date,
    y=summary_emails_by_date_test.email,
)
anon = go.Scatter(
    name='Not Logged In Sessions Including Tests',
    x=summary_noemails_by_date.date,
    y=summary_noemails_by_date.sessionId,
)
anont = go.Scatter(
    name='Not Logged In Sessions Excluding Tests',
    x=summary_noemails_by_date_test.date,
    y=summary_noemails_by_date_test.sessionId,
)

# Each chart pairs the "including tests" and "excluding tests" variants
user_data = [users, userst]
anon_data = [anon, anont]

users_layout = go.Layout(title='Unique Daily Users')
anon_layout = go.Layout(title='Unique Sessions With No User Email')

# Write each figure to its own standalone HTML file
users_figure = go.Figure(data=user_data, layout=users_layout)
anon_figure = go.Figure(data=anon_data, layout=anon_layout)

plot(users_figure, filename='logged_in_users.html')
plot(anon_figure, filename='no_user_sessions.html')