In [22]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

from mc import MarkovChain

In [23]:
def _calculate_markov_chain(sequences, assignment, n_clusters):
    """Build one MarkovChain per cluster from the sequences assigned to it.

    Parameters
    ----------
    sequences : iterable of sequences
        Sequences to distribute over the chains.
    assignment : mapping
        Maps ``tuple(sequence)`` to the index of the cluster it belongs to.
    n_clusters : int
        Number of chains to create.

    Returns
    -------
    list
        ``n_clusters`` MarkovChain objects; chain ``i`` has received, via
        ``add_sequence``, every sequence whose assignment is ``i``.
    """
    chains = []
    for _ in range(n_clusters):
        chains.append(MarkovChain())

    for seq in sequences:
        # The assignment is keyed by the tuple form of the sequence.
        cluster_idx = assignment[tuple(seq)]
        chains[cluster_idx].add_sequence(seq)

    return chains
                
def sequence_clustering(sequences, n_clusters=2, max_iter=None, random_state=None):
    """Cluster sequences with a hard-assignment EM loop over Markov chains.

    Reference:
    http://web.ist.utl.pt/diogo.ferreira/papers/ferreira07approaching.pdf

    Every sequence is first put into a randomly chosen cluster and one
    MarkovChain is built per cluster. Then, until the assignment no longer
    changes between two consecutive iterations:

    * E step: each sequence is assigned to the chain for which
      ``get_probability(sequence)`` is highest;
    * M step: the chains are rebuilt from the sequences assigned to them.

    Parameters
    ----------
    sequences : sequence of sequences
        Sequences to cluster. Each one is converted with ``tuple()`` to serve
        as a dictionary key, so equal sequences share one assignment entry.
    n_clusters : int, default 2
        Number of Markov chains (clusters).
    max_iter : int or None, default None
        Upper bound on the number of E/M iterations. ``None`` iterates until
        the assignment is stable, however long that takes.
    random_state : int or None, default None
        Seed for the random initial assignment. ``None`` draws from NumPy's
        global random state.

    Returns
    -------
    assignment : dict
        Maps ``tuple(sequence)`` to the index of its cluster.
    markov_chains : list
        The ``n_clusters`` chains built from the final assignment.

    Raises
    ------
    ValueError
        If ``max_iter`` is given and smaller than 1.
    """
    if max_iter is not None and max_iter < 1:
        raise ValueError("max_iter must be a positive integer or None")

    # Initialization: every sequence starts in a randomly chosen cluster.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    cluster_initialization = rng.choice(range(n_clusters), size=len(sequences))
    initial_assignment = {
        tuple(sequence): cluster_initialization[i]
        for i, sequence in enumerate(sequences)
    }

    markov_chains = _calculate_markov_chain(sequences, initial_assignment, n_clusters)

    # Now until there is no change:
    #     Iterate over all sequences and assign sequence to markov chain with highest likelihood (E)
    #     Recalculate the markov chains by using the assigned sequences (M)
    prev_assignment = None
    assignment = None
    n_iter = 0
    while assignment is None or assignment != prev_assignment:
        # Optional safety net against assignments that keep changing.
        if max_iter is not None and n_iter >= max_iter:
            break

        prev_assignment = assignment
        assignment = {}
        for sequence in sequences:
            assignment[tuple(sequence)] = np.argmax(
                [mc.get_probability(sequence) for mc in markov_chains]
            )

        markov_chains = _calculate_markov_chain(sequences, assignment, n_clusters)
        n_iter += 1

    return assignment, markov_chains

In [29]:
# Load the pickled sessions; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('../data/sessions.p', 'rb') as sessions_file:
    sessions = pickle.load(sessions_file)
sequences = list(sessions.values()) # we do not care about the user ids

# Flatten the per-user lists into one list of sequences and keep only the
# first element of every entry (presumably the page identifier -- TODO confirm).
clean_sequences = [
    [x[0] for x in seq]
    for user_sequences in sequences
    for seq in user_sequences
]
clustering_assignment, markov_chains = sequence_clustering(clean_sequences)

In [36]:
# Values handed to to_json for every page.
DEFAULT_TIME_ON_PAGE = 0
DEFAULT_UNIT_SIZE = 0.05

# Pages are the keys of cluster 0's transition matrix plus an 'exit' entry.
# NOTE(review): pages that occur only in other clusters are not listed here --
# confirm that to_json copes with that.
pages = list(markov_chains[0].get_transition_matrix().keys()) + ['exit']
time_on_pages = {page: DEFAULT_TIME_ON_PAGE for page in pages}
vectors = [[page, DEFAULT_UNIT_SIZE] for page in pages]
metrics_df = pd.DataFrame(vectors, columns=['page', 'unit_size']).set_index('page')

# Write one JSON file per cluster instead of hard-coding clusters 0 and 1, so
# the export keeps working when sequence_clustering is run with another n_clusters.
for cluster_idx, cluster_chain in enumerate(markov_chains):
    cluster_chain.to_json(
        time_on_pages,
        '../visualization/mc_cluster{}.json'.format(cluster_idx),
        metrics_df,
    )