In [1]:
import pandas as pd
import datetime

In [8]:
def load_for_matrix(path='data/total.csv'):
    """Load customer movements and forward-fill them onto a one-minute grid.

    The CSV's unnamed first column is renamed to ``date``, parsed as
    datetimes and used as the index. Rows are then grouped by ``day_id`` and
    each group is resampled to one-minute steps, forward-filling the gaps,
    so every group has one row per minute between its first and last record.

    INPUT ARGUMENTS:
        -path -> location of the CSV file to read. Defaults to
                 'data/total.csv', the file this notebook has always used.

    OUTPUT:
        -df -> pandas DataFrame indexed by (day_id, date) holding the
               forward-filled rows. The same frame is also stored in the
               module-level ``df``.
    """
    # Kept for backward compatibility: the notebook calls this function
    # without assigning the result and later cells read the global `df`.
    global df

    # Build the time-indexed frame without in-place mutation so that
    # re-running the cell always starts from the file contents.
    movements = pd.read_csv(path).rename(columns={'Unnamed: 0': 'date'})
    movements['date'] = pd.to_datetime(movements['date'])
    movements = movements.set_index('date')

    resampled = pd.DataFrame(
        movements.groupby('day_id').resample('1Min').ffill()
    )

    # Later code reads `day_id` as a column. pandas has deprecated including
    # the grouping column in groupby-resample output, so restore it from the
    # index level when it is missing (no-op when it is already present).
    if 'day_id' not in resampled.columns:
        resampled['day_id'] = resampled.index.get_level_values('day_id')

    df = resampled
    return df
# Populate the module-level `df` and show the resampled frame as this cell's output.
load_for_matrix()

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,location,day,day_id
day_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fri-1,2019-09-06 07:00:00,1,dairy,friday,fri-1
fri-1,2019-09-06 07:01:00,1,dairy,friday,fri-1
fri-1,2019-09-06 07:02:00,1,dairy,friday,fri-1
fri-1,2019-09-06 07:03:00,1,dairy,friday,fri-1
fri-1,2019-09-06 07:04:00,1,spices,friday,fri-1
...,...,...,...,...,...
wed-998,2019-09-04 16:57:00,998,fruit,wednesday,wed-998
wed-998,2019-09-04 16:58:00,998,fruit,wednesday,wed-998
wed-998,2019-09-04 16:59:00,998,checkout,wednesday,wed-998
wed-999,2019-09-04 16:53:00,999,fruit,wednesday,wed-999


In [9]:
def _concat_unnamed(parts):
    """Join per-customer Series into a single unnamed Series (empty-safe)."""
    if not parts:
        return pd.Series(dtype=object)
    combined = pd.concat(parts)
    # Left unnamed on purpose so pd.crosstab labels its axes row_0 / col_0.
    combined.name = None
    return combined


def generate_prob_matrix(df):
    """Pair every location with the customer's previous one and print the
    resulting transition probability matrix.

    For each distinct ``day_id`` (one customer on one day) the ``location``
    column is shifted by one row to obtain the location at the previous
    timestep. A customer's first row has no predecessor; it is given the
    pseudo-location 'entrance'. The pairs are cross-tabulated with rows =
    previous location and columns = current location, and every row is
    normalised to sum to 1. The matrix is meant to drive a Markov chain
    Monte Carlo simulation of customer movements.

    INPUT ARGUMENTS:
        -df -> pandas DataFrame with a ``day_id`` column identifying the
               customer and a ``location`` column, one row per timestep, in
               chronological order within each customer.

    OUTPUT:
        -[loc_before, loc_now] -> list of two pandas Series of equal length
               holding the previous and the current location for every row,
               ordered customer by customer (customers in order of first
               appearance). The probability matrix itself is printed, not
               returned.
    """

    locations = df['location']

    # Split once by customer instead of re-filtering the whole frame for
    # every id. Grouping on the raw values (rather than the label 'day_id')
    # also works when `day_id` is both an index level and a column.
    # sort=False keeps customers in order of first appearance.
    per_customer = locations.groupby(df['day_id'].to_numpy(), sort=False)

    now_parts = []
    before_parts = []
    for _, visits in per_customer:
        now_parts.append(visits)
        # shifting creates a NaN for the first row; it becomes 'entrance'
        before_parts.append(visits.shift(1).fillna('entrance'))

    # concatenate once at the end rather than growing the series in the loop
    loc_now = _concat_unnamed(now_parts)
    loc_before = _concat_unnamed(before_parts)

    # generate transition probability matrix (each row sums to 1)
    prob_matrix = pd.crosstab(loc_before,
                              loc_now,
                              normalize='index')
    print(prob_matrix)
    return [loc_before, loc_now]

In [10]:
# Prints the transition matrix; the cell's displayed value is the returned [loc_before, loc_now] list.
generate_prob_matrix(df)

col_0     checkout     dairy    drinks     fruit    spices
row_0                                                     
dairy     0.102697  0.738706  0.058134  0.049478  0.050986
drinks    0.215334  0.010880  0.599199  0.087755  0.086832
entrance  0.000000  0.287576  0.153526  0.377435  0.181464
fruit     0.200564  0.095428  0.054564  0.599029  0.050415
spices    0.149613  0.191839  0.161948  0.090305  0.406294


[(fri-1, 2019-09-06 07:00:00)      entrance
 (fri-1, 2019-09-06 07:01:00)         dairy
 (fri-1, 2019-09-06 07:02:00)         dairy
 (fri-1, 2019-09-06 07:03:00)         dairy
 (fri-1, 2019-09-06 07:04:00)         dairy
                                     ...   
 (wed-998, 2019-09-04 16:57:00)       dairy
 (wed-998, 2019-09-04 16:58:00)       fruit
 (wed-998, 2019-09-04 16:59:00)       fruit
 (wed-999, 2019-09-04 16:53:00)    entrance
 (wed-999, 2019-09-04 16:54:00)       fruit
 Length: 54192, dtype: object,
 (fri-1, 2019-09-06 07:00:00)         dairy
 (fri-1, 2019-09-06 07:01:00)         dairy
 (fri-1, 2019-09-06 07:02:00)         dairy
 (fri-1, 2019-09-06 07:03:00)         dairy
 (fri-1, 2019-09-06 07:04:00)        spices
                                     ...   
 (wed-998, 2019-09-04 16:57:00)       fruit
 (wed-998, 2019-09-04 16:58:00)       fruit
 (wed-998, 2019-09-04 16:59:00)    checkout
 (wed-999, 2019-09-04 16:53:00)       fruit
 (wed-999, 2019-09-04 16:54:00)    checkout
 