In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

from mc import MarkovChain

# Read raw click data and list of users ids (developers etc removed)

In [2]:
raw_data = pd.read_csv('../data/raw_data.csv')
raw_data['timestamp'] = pd.to_datetime(raw_data['timestamp'].astype(int)*1e6)
user_ids = pd.read_csv('../data/filtered_user_ids.csv')

In [3]:
print(len(raw_data))
raw_data.sample(5)

4853


Unnamed: 0,userId,event_type,timestamp,event_target
768,MgR266qmCCRe5u26SHUpdMkSSvE3,pageview,2017-09-07 18:11:52.511000064,personal
1339,undefined,pageview,2017-09-11 15:59:57.521999872,team
1132,zwDvjrxbwXR6wQrcgsbDuxkjpsc2,pageview,2017-09-10 16:37:23.993999872,route
2678,fTxD7yHvsGgHyJzpC2oexLpWsmh1,pageview,2017-09-15 21:07:38.584999936,track
4335,tyV1UFX0VWZoIEU3cxUFrTUswQf1,pageview,2017-09-23 10:03:05.776000000,route


In [20]:
"""
'track': defaultdict(<class 'int'>, {'coach': 0.3033932135728543, 'route': 0.36926147704590817, 
'track': 0.027944111776447105, 'team': 0.1317365269461078, 'competition': 0.033932135728542916, 
'personal': 0.059880239520958084, 'badges': 0.007984031936127744, 'routes': 0.0658682634730539})
"""
sorted_data = raw_data.sort_values(by=['userId', 'timestamp']).reset_index(drop=True)
for i, row in sorted_data.iterrows():
    if row['event_target'] == 'track':
        print((sorted_data.iloc[i+1, :]['timestamp'] - row['timestamp']).total_seconds() / 60)

9.501466666666666
54.58278333013333
0.06185000106666667
54.601800000000004
0.06031666346666666
0.052083332266666664
81.74600000000001
0.031916667733333334
0.04253333333333333
54.8141833344
0.03429999786666667
85.33303333120001
0.022433335466666666
0.042883332266666664
46.889699997866664
7.667233335466666
54.7762
0.02546666666666667
6.357499997866666
139.6491500032
54.8064166656
0.06313333333333333
54.9240166656
0.0426499968
11.469883332266667
0.036883332266666666
0.2764166656
0.13691666773333333
0.0701833344
22.6753833344
34.67101666986667
219.1389666688
0.0555833344
28.873850001066668
0.04150000213333334
47.821683332266666
0.03210000213333333
31.8304
3.1017666688000003
0.34203333119999996
0.7832499968000001
0.13454999893333333
19.7309500032
0.12426666666666668
0.07400000000000001
2.2833500032
0.4965333333333334
0.3118666666666667
0.15886666666666666
0.32274999893333334
1110.6449333333333
0.1125833344
57.29626666666666
0.12094999893333333
0.0640499968
54.05598333013334
0.13288333653333

IndexError: single positional indexer is out-of-bounds

# Filter out all user with id in the provided list

In [4]:
raw_data = raw_data[raw_data['userId'].isin(user_ids['user id'])]

In [5]:
print(len(raw_data))

4395


# Create the markov chain by segmenting different sessions from the raw data

In [17]:
markov_chain = MarkovChain()
sessions_per_user = defaultdict(list)
SESSION_SECONDS = 20*60
sessions_vectors = []
sessions_cntr = 0
for user in np.unique(raw_data['userId']):
    # Filtered all logs from a specific user from the raw data
    user_logs = raw_data[raw_data['userId'] == user]
    # Sort it by timestamp
    user_logs = user_logs.sort_values(by='timestamp').reset_index(drop=True)
    current_session = []
    for i in range(len(user_logs) - 1):
        current_session.append((user_logs.iloc[i, :]['event_target'], user_logs.iloc[i, :]['timestamp']))
        time_diff = user_logs.iloc[i+1, :]['timestamp'] - user_logs.iloc[i, :]['timestamp']
        # If it takes longer than SESSION SECONDS between two actions (i and i+1), they are considered from two 
        # sessions, IF the the target of (i+1) is equal to coach (the start page) or when action (i) is not equal
        # to track (in which case, they are tracking a route while cycling, which can take a lot of time)
        if (time_diff.total_seconds() >= SESSION_SECONDS 
            and user_logs.iloc[i+1, :]['event_target'] == 'coach' and user_logs.iloc[i, :]['event_target'] != 'track'):
            if len(current_session) == 1:
                sessions_vectors.append([current_session[0][0], np.NaN, 
                                         sessions_cntr, current_session[0][1]])
            else:
                for k in range(len(current_session) - 1):
                    sessions_vectors.append([current_session[k][0], current_session[k + 1][0], 
                                             sessions_cntr, current_session[k][1]])
            sessions_cntr += 1
            # Append the two surrogate states
            current_session[0] = ('start', current_session[0][1])
            current_session.append(('exit', current_session[-1][1]))
            markov_chain.add_sequence([x[0] for x in current_session])
            sessions_per_user[user].append(current_session)
            current_session = []
pickle.dump(sessions_per_user, open('../data/sessions.p', 'wb'))
sessions_df = pd.DataFrame(sessions_vectors)
sessions_df.columns = ['from', 'to', 'session_id', 'timestamp']
sessions_df.to_csv('../data/actions.csv', index=False)
print(markov_chain.get_transition_matrix())

defaultdict(<function MarkovChain.get_transition_matrix.<locals>.<lambda> at 0x7f28d3fe4510>, {'start': defaultdict(<class 'int'>, {'personal': 0.080078125, 'team': 0.23046875, 'competition': 0.251953125, 'exit': 0.15234375, 'track': 0.2109375, 'coach': 0.037109375, 'bugreport': 0.001953125, 'badges': 0.013671875, 'routes': 0.01953125, 'route': 0.001953125}), 'personal': defaultdict(<class 'int'>, {'competition': 0.03745318352059925, 'bugreport': 0.20973782771535582, 'team': 0.13108614232209737, 'coach': 0.27340823970037453, 'track': 0.08614232209737828, 'exit': 0.1760299625468165, 'routes': 0.08239700374531835, 'route': 0.003745318352059925}), 'competition': defaultdict(<class 'int'>, {'coach': 0.30594405594405594, 'personal': 0.05244755244755245, 'exit': 0.1590909090909091, 'track': 0.05419580419580419, 'team': 0.24125874125874125, 'badges': 0.045454545454545456, 'competition': 0.04195804195804196, 'routes': 0.09265734265734266, 'route': 0.006993006993006993}), 'coach': defaultdict(<

In [26]:
total = 0
exit_cnt = 0
for user in sessions_per_user:
    for session in sessions_per_user[user]:
        total += 1
        exit_cnt += session[1][0] == 'exit'
exit_cnt / total
print([x[0] for x in sessions_per_user['0GJ0vzgFeIWedi3bTlmNXdWw0Qd2'][0] if x[0] not in ['exit', 'start']] )

['personal', 'competition', 'coach', 'team', 'coach', 'team', 'coach', 'team', 'routes', 'coach', 'track', 'coach', 'badges', 'competition', 'personal', 'bugreport', 'personal', 'bugreport', 'team', 'competition']


# Generate different metrics (such as #unique page views, time on page and pagerank)

In [7]:
time_on_pages = defaultdict(list)
unique_page_views = defaultdict(int)
page_views = defaultdict(int)
for user in sessions_per_user:
    for session in sessions_per_user[user]:
        unique_views = defaultdict(lambda: False)
        for i in range(len(session) - 2):
            unique_views[session[i][0]] = True
            time_on_pages[session[i][0]].append((session[i+1][1] - session[i][1]).total_seconds())
            page_views[session[i][0]] += 1
        unique_views[session[-2][0]] = True
        unique_views[session[-1][0]] = True
        page_views[session[-2][0]] += 1
        page_views[session[-1][0]] += 1
        for page in unique_views:
            if unique_views[page]:
                unique_page_views[page] += 1
                
for page in time_on_pages:
    time_on_pages[page] = np.sqrt(np.mean(time_on_pages[page]))
time_on_pages['exit'] = 0

In [8]:
# Use networkx for PageRank calculation 
import networkx as nx

g = nx.DiGraph()
transition_matrix = markov_chain.get_transition_matrix()
for from_page in transition_matrix:
    for to_page in transition_matrix:
        g.add_edge(from_page, to_page, weight=transition_matrix[from_page][to_page])
        
pageranks = nx.pagerank(g)
pageranks['exit'] = 0

In [9]:
vectors = []
for page in list(markov_chain.get_transition_matrix().keys()) + ['exit']:
    vectors.append([page, page_views[page], unique_page_views[page], pageranks[page]])
metrics_df = pd.DataFrame(vectors, columns=['page', 'pageviews', 'unique_pageviews', 'pagerank'])
metrics_df = metrics_df.set_index('page')
metrics_df['pageviews'] /= metrics_df['pageviews'].sum()
metrics_df['unique_pageviews'] /= metrics_df['unique_pageviews'].sum()
metrics_df.sample(5)

Unnamed: 0_level_0,pageviews,unique_pageviews,pagerank
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
track,0.104615,0.070897,0.121387
routes,0.110461,0.058844,0.143367
route,0.11422,0.072315,0.141524
badges,0.024849,0.037575,0.043418
personal,0.055753,0.0553,0.076483


# Serialize the Markov Chain in JSON, such that it can be used by D3.js

In [10]:
json_data = markov_chain.to_json(time_on_pages, '../visualization/visualization_data.json', metrics_df)