In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

from mc import MarkovChain

# Read raw click data and list of users ids (developers etc removed)

In [2]:
raw_data = pd.read_csv('../data/raw_data.csv')
raw_data['timestamp'] = pd.to_datetime(raw_data['timestamp'].astype(int)*1e6)
user_ids = pd.read_csv('../data/filtered_user_ids.csv')

In [3]:
print(len(raw_data))
raw_data.sample(5)

4853


Unnamed: 0,userId,event_type,timestamp,event_target
2121,2BIBN8PbtIRzILF5y973CpwRjCj1,pageview,2017-09-14 14:55:45.113999872,bugreport
4239,tyV1UFX0VWZoIEU3cxUFrTUswQf1,pageview,2017-09-22 19:34:48.012999936,routes
76,6z1kJ4cJ92ff5Tl7wEr0wMFkvoH3,pageview,2017-09-01 18:14:09.513999872,routes
54,cBUljslhysPrydSekqU8pWTx69o2,pageview,2017-09-01 16:41:22.104999936,coach
2033,f61SusxeqNamahMyofCaytWgMsi2,pageview,2017-09-14 12:04:14.212000000,route


# Filter out all user with id in the provided list

In [4]:
raw_data = raw_data[raw_data['userId'].isin(user_ids['user id'])]

In [5]:
print(len(raw_data))

4395


# Create the markov chain by segmenting different sessions from the raw data

In [6]:
markov_chain = MarkovChain()
sessions_per_user = defaultdict(list)
SESSION_SECONDS = 20*60
for user in np.unique(raw_data['userId']):
    # Filtered all logs from a specific user from the raw data
    user_logs = raw_data[raw_data['userId'] == user]
    # Sort it by timestamp
    user_logs = user_logs.sort_values(by='timestamp').reset_index(drop=True)
    current_session = []
    for i in range(len(user_logs) - 1):
        current_session.append((user_logs.iloc[i, :]['event_target'], user_logs.iloc[i, :]['timestamp']))
        time_diff = user_logs.iloc[i+1, :]['timestamp'] - user_logs.iloc[i, :]['timestamp']
        # If it takes longer than SESSION SECONDS between two actions (i and i+1), they are considered from two 
        # sessions, IF the the target of (i+1) is equal to coach (the start page) or when action (i) is not equal
        # to track (in which case, they are tracking a route while cycling, which can take a lot of time)
        if (time_diff.total_seconds() >= SESSION_SECONDS 
            and user_logs.iloc[i+1, :]['event_target'] == 'coach' and user_logs.iloc[i, :]['event_target'] != 'track'):
            # Append the two surrogate states
            current_session[0] = ('start', current_session[0][1])
            current_session.append(('exit', current_session[-1][1]))
            markov_chain.add_sequence([x[0] for x in current_session])
            sessions_per_user[user].append(current_session)
            current_session = []

# Generate different metrics (such as #unique page views, time on page and pagerank)

In [7]:
time_on_pages = defaultdict(list)
unique_page_views = defaultdict(int)
page_views = defaultdict(int)
for user in sessions_per_user:
    for session in sessions_per_user[user]:
        unique_views = defaultdict(lambda: False)
        for i in range(len(session) - 2):
            unique_views[session[i][0]] = True
            time_on_pages[session[i][0]].append((session[i+1][1] - session[i][1]).total_seconds())
            page_views[session[i][0]] += 1
        unique_views[session[-2][0]] = True
        unique_views[session[-1][0]] = True
        page_views[session[-2][0]] += 1
        page_views[session[-1][0]] += 1
        for page in unique_views:
            if unique_views[page]:
                unique_page_views[page] += 1
                
for page in time_on_pages:
    time_on_pages[page] = np.sqrt(np.mean(time_on_pages[page]))
time_on_pages['exit'] = 0

In [8]:
# Use networkx for PageRank calculation 
import networkx as nx

g = nx.DiGraph()
transition_matrix = markov_chain.get_transition_matrix()
for from_page in transition_matrix:
    for to_page in transition_matrix:
        g.add_edge(from_page, to_page, weight=transition_matrix[from_page][to_page])
        
pageranks = nx.pagerank(g)
pageranks['exit'] = 0

In [9]:
vectors = []
for page in list(markov_chain.get_transition_matrix().keys()) + ['exit']:
    vectors.append([page, page_views[page], unique_page_views[page], pageranks[page]])
metrics_df = pd.DataFrame(vectors, columns=['page', 'pageviews', 'unique_pageviews', 'pagerank'])
metrics_df = metrics_df.set_index('page')
metrics_df['pageviews'] /= metrics_df['pageviews'].sum()
metrics_df['unique_pageviews'] /= metrics_df['unique_pageviews'].sum()
metrics_df.sample(5)

Unnamed: 0_level_0,pageviews,unique_pageviews,pagerank
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
team,0.111923,0.117689,0.124534
bugreport,0.012111,0.012407,0.031835
routes,0.110461,0.058844,0.143367
start,0.106912,0.181496,0.015
exit,0.106912,0.181496,0.0


# Serialize the Markov Chain in JSON, such that it can be used by D3.js

In [10]:
json_data = markov_chain.to_json(time_on_pages, '../visualization/visualization_data.json', metrics_df)