In [None]:
# for auto-reloading extensions - helpful if you're writing and testing a package
%reload_ext autoreload
%autoreload 2

# for inline plotting in python using matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# for easier plots - also makes matplotlib plots look nicer by default
import seaborn as sns

# set up for using plotly offline without an API key - great for interactive plots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# for numerical work
import pandas as pd
import numpy as np

import pymongo

import datetime
import json

from pandas.io.json import json_normalize
from pymongo import MongoClient

import pickle

from confluent_kafka import Producer

import bson
from bson import json_util

import avro

# load the database credentials from file
# (kept in a JSON file one directory above the notebook so the connection string never appears in a cell)
with open('../creds/creds.json') as json_data:
    creds = json.load(json_data)

# initialize the client
# the credentials file must provide a 'connection_string' entry; every query below goes through this client
client = MongoClient(creds['connection_string'])

In [None]:
def get_sessions_with_request(client):
    """Collect the session id of every purchase-request click event.

    Args:
        client: mapping-style database client; the events are read from
            client['production']['eventCollection'].

    Returns:
        A list with one session id per matching event, in the order the
        query yields them. Ids repeat when a session has several matching
        events; de-duplication is left to the caller.
    """
    events = client['production']['eventCollection']

    # only click events on the purchase-request submit button that carry a session id
    purchase_request_filter = {
        'metadata.sessionId': {'$ne': None},
        'eventAction': 'click',
        'eventLabel': 'submit-purchase-request',
    }

    found_ids = []
    for doc in events.find(purchase_request_filter):
        found_ids.append(doc['metadata']['sessionId'])

    return found_ids

# one session id per purchase-request click event, so the same id can appear more than once here
request_sessions = get_sessions_with_request(client)

In [None]:
# de-duplicate the session ids while keeping first-seen order; list(set(...)) would order the ids
# by hash, which is not stable across kernel restarts, so request_sessions[0] would not be reproducible
request_sessions = list(dict.fromkeys(request_sessions))

In [None]:
def get_session_events(session, client):
    """Return all events attributed to a session, ordered by creation time.

    The events carrying the session id are fetched first. If any of them has
    a non-empty metadata.email, the most frequent email is taken as the
    session's user, and events by that email that have no session id and
    fall inside the session's time window are merged in. Every returned
    event that has a metadata dict then gets its email set to that user.

    Args:
        session: session id to look up (matched against metadata.sessionId).
        client: mapping-style database client; events are read from
            client['production']['eventCollection'].

    Returns:
        A list of event dicts sorted by their 'created' value, or an empty
        list when no event carries the session id.

    Raises:
        KeyError: if a fetched event has no 'created' field.
    """
    # function-scope import keeps this cell runnable on its own
    from collections import Counter

    ec = client['production']['eventCollection']

    # get the events by session id, oldest first
    session_events = list(ec.find({'metadata.sessionId': session}).sort([('created', 1)]))

    # an unknown session has no time window to search in, so there is nothing to return
    if not session_events:
        return []

    # get the time of the first and last event
    start_time = session_events[0]['created']
    end_time = session_events[-1]['created']

    # get the non-empty emails from the session events if they exist
    user_emails = [
        event['metadata']['email']
        for event in session_events
        if (event.get('metadata') or {}).get('email') not in ('', None)
    ]

    if user_emails:
        # most common email; ties go to the email seen first, so the result is deterministic
        email = Counter(user_emails).most_common(1)[0][0]

        # get the events by the user during the session time period but where there's no sessionId
        # (not perfect - could break down with concurrent sessions by same user)
        events_by_email = list(ec.find({'metadata.email': email,
                                        'created': {'$gte': start_time, '$lte': end_time},
                                        'metadata.sessionId': None}).sort([('created', 1)]))
        session_events.extend(events_by_email)

        # attribute every event that has metadata to the session's user
        for event in session_events:
            if event.get('metadata') is not None:
                event['metadata']['email'] = email

    # the merged-in events were appended at the end, so re-sort the combined list
    return sorted(session_events, key=lambda event: event['created'])

In [None]:
# spot-check: show the merged, time-ordered events for the first purchase-request session
# NOTE(review): the displayed events can include user emails - clear this cell's output before sharing the notebook
get_session_events(request_sessions[0],client)