# Parse Experimental Results & Generate Latex Tables

<a id='toc'/>

In [1]:
import os, pickle, types
import numpy as np
import pandas as pd
import cython

In [2]:
# Directory (relative to the notebook) that holds the pickled result files.
data_dir = 'data'

In [3]:
# Parallel lists: dat_suffix[i] is the file-name suffix of a dataset and
# dat_names[i] is the display name used for its column in the result tables.
dat_suffix = ['Edin', 'Glas', 'Melb', 'Osak', 'Toro']
dat_names = ['Edinburgh', 'Glasgow', 'Melbourne', 'Osaka', 'Toronto']

In [4]:
# LaTeX row labels of the compared methods; the position of a label matches the
# position of its scores in the lists returned by calc_metrics.
methods_all = [
    r'\textsc{Random}',
    r'\textsc{PersTour}',
    r'\textsc{PersTour-L}',
    r'\textsc{PoiPopularity}',
    r'\textsc{PoiRank}',
    r'\textsc{Markov}',
    r'\textsc{MarkovPath}',
    r'\textsc{Rank+Markov}',
    r'\textsc{Rank+MarkovPath}',
]

## Latex Table for Recommendation Results

Generate results filenames.

In [5]:
def gen_fname(dat_ix):
    '''Build the paths of the five pickled result files of one dataset.

    dat_ix is a position in dat_suffix. The returned tuple is
    (frank, ftran, fcomb, frand, fijcai), each located under data_dir.
    '''
    assert(0 <= dat_ix < len(dat_suffix))

    tail = dat_suffix[dat_ix] + '.pkl'
    prefixes = ('rank-', 'tran-', 'comb-', 'rand-', 'ijcai-')
    return tuple(os.path.join(data_dir, prefix + tail) for prefix in prefixes)

Compute the F1 score for recommended trajectory.

In [6]:
def calc_F1(traj_act, traj_rec, noloop=False):
    '''Compute the F1 score of a recommended trajectory against the actual one.

    traj_act -- actual trajectory, a non-empty sequence of POIs
    traj_rec -- recommended trajectory, a non-empty sequence of POIs
    noloop   -- if True, count the overlap as the number of distinct POIs the
                two trajectories share; if False, every occurrence in traj_act
                can be matched by at most one occurrence in traj_rec, so
                repeated POIs are counted with multiplicity.

    Returns the F1 score as a float; 0.0 when the trajectories share no POI.
    '''
    assert(isinstance(noloop, bool))
    assert(len(traj_act) > 0)
    assert(len(traj_rec) > 0)

    if noloop:
        intersize = len(set(traj_act) & set(traj_rec))
    else:
        # Builtin bool: the np.bool alias no longer exists in current NumPy.
        matched = np.zeros(len(traj_act), dtype=bool)
        for poi in traj_rec:
            for j in range(len(traj_act)):
                # Greedily pair this recommended POI with the first still
                # unmatched occurrence of the same POI in the actual trajectory.
                if not matched[j] and poi == traj_act[j]:
                    matched[j] = True
                    break
        intersize = int(np.count_nonzero(matched))

    # Without any overlap precision + recall is 0 and the F1 formula would
    # divide by zero; score it as 0, like calc_pairsF1 does.
    if intersize == 0:
        return 0.0

    recall = intersize / len(traj_act)
    precision = intersize / len(traj_rec)
    F1 = 2 * precision * recall / (precision + recall)
    return F1

Compute the pairs-F1 score for recommended trajectory.

In [7]:
%load_ext Cython

In [8]:
%%cython
import numpy as np
cimport numpy as np

cpdef float calc_pairsF1(y, y_hat):
    """Compute the pairs-F1 score of a recommended trajectory.

    y     -- actual trajectory: non-empty, without repeated POIs; it defines
             the correct visiting order
    y_hat -- recommended trajectory

    An ordered pair of POIs in y_hat counts as correct when both POIs occur in
    y, they are different, and they appear in y in the same relative order.
    Precision divides the number of correct pairs by the number of pairs in
    y_hat, recall divides it by the number of pairs in y.

    Returns the harmonic mean of the two, or 0 when no pair is correct (which
    includes trajectories too short to contain any pair).
    """
    assert(len(y) > 0)
    assert(len(y) == len(set(y))) # no loops in y
    cdef int n, nr, n0, n0r, nc, poi1, poi2, i, j
    cdef float precision, recall, F1
    n = len(y)
    nr = len(y_hat)
    n0 = n*(n-1) // 2      # number of POI pairs in y
    n0r = nr*(nr-1) // 2   # number of POI pairs in y_hat

    # y determines the correct visiting order
    order_dict = dict()
    for i in range(n):
        order_dict[y[i]] = i

    nc = 0
    for i in range(nr):
        poi1 = y_hat[i]
        for j in range(i+1, nr):
            poi2 = y_hat[j]
            if poi1 in order_dict and poi2 in order_dict and poi1 != poi2:
                if order_dict[poi1] < order_dict[poi2]: nc += 1

    # Check nc before dividing: with a single-POI y or y_hat the pair counts
    # n0 / n0r are 0. nc > 0 implies both trajectories hold at least one pair,
    # so the divisions below are safe.
    if nc == 0:
        return 0

    precision = (1.0 * nc) / (1.0 * n0r)
    recall = (1.0 * nc) / (1.0 * n0)
    F1 = 2. * precision * recall / (precision + recall)
    return F1

Load results data.

In [9]:
def load_results(dat_ix):
    '''Load the five pickled result dictionaries of one dataset.

    dat_ix is a position in dat_suffix. The file names come from gen_fname.
    Returns (recdict_rank, recdict_tran, recdict_comb, recdict_rand,
    recdict_ijcai).

    Raises AssertionError if dat_ix is out of range or a result file is missing.
    '''
    assert(0 <= dat_ix < len(dat_suffix))

    fnames = gen_fname(dat_ix)

    # Verify that every file is present before loading any of them.
    for fname in fnames:
        assert os.path.exists(fname), 'missing results file: ' + fname

    # load results data
    # NOTE: unpickling can execute arbitrary code; only use files you produced
    # yourself or otherwise trust.
    recdicts = []
    for fname in fnames:
        # The with-block closes the file handle as soon as the data is read.
        with open(fname, 'rb') as fin:
            recdicts.append(pickle.load(fin))

    recdict_rank, recdict_tran, recdict_comb, recdict_rand, recdict_ijcai = recdicts
    return recdict_rank, recdict_tran, recdict_comb, recdict_rand, recdict_ijcai

Calculate F1-scores from loaded results.

In [10]:
def calc_metrics(recdict_rank, recdict_tran, recdict_comb, recdict_rand, recdict_ijcai, func):
    '''Score every method with func and summarise the scores.

    Each recdict_* maps a query key to a dict that holds the actual trajectory
    under 'REAL' and one recommendation per method under a 'REC...' key.
    func(actual, recommended) must be a Python or builtin function returning
    a score.

    Only keys present in both recdict_ijcai and recdict_rank are scored (the
    original note attributes the missing ones to ILP timeouts).

    Returns (means, stds, lens): three lists with one entry per method, in the
    order Random, PersTour, PersTour-L, PoiPopularity, PoiRank, Markov,
    MarkovPath, Rank+Markov, Rank+MarkovPath (the same order as methods_all).
    '''
    assert(isinstance(func, (types.FunctionType, types.BuiltinFunctionType)))

    # rank, tran and comb results must cover exactly the same queries
    rank_keys = sorted(recdict_rank.keys())
    assert(np.all(rank_keys == sorted(recdict_tran.keys())))
    assert(np.all(rank_keys == sorted(recdict_comb.keys())))

    # drop the queries for which not every method produced a recommendation
    keys_all = sorted(recdict_ijcai.keys() & recdict_rank.keys())

    def score_columns(recdict, fields):
        # One score list per field; queries form the outer loop so that func
        # is invoked in a fixed query-by-query order.
        columns = [[] for _ in fields]
        for key in keys_all:
            for column, field in zip(columns, fields):
                column.append(func(recdict[key]['REAL'], recdict[key][field]))
        return columns

    rank1, rank2 = score_columns(recdict_rank, ('REC_POP', 'REC_FEATURE'))
    tran1, tran2 = score_columns(recdict_tran, ('REC_DP', 'REC_ILP'))
    comb1, comb2 = score_columns(recdict_comb, ('REC_DP', 'REC_ILP'))
    rand, = score_columns(recdict_rand, ('REC_RAND',))
    ijcai05T, ijcai05L = score_columns(recdict_ijcai, ('REC05T', 'REC05L'))

    metrics = [rand, ijcai05T, ijcai05L, rank1, rank2, tran1, tran2, comb1, comb2]
    means = list(map(np.mean, metrics))
    stds = list(map(np.std, metrics))
    # number of scored queries per method (used as sample size by the t-tests)
    lens = list(map(len, metrics))

    return means, stds, lens

Generate Latex tables from calculated metrics.

In [11]:
def gen_latex_table(mean_df, std_df, ismax_df, ismax2nd_df, title, label):
    '''Render a mean/std table as the source of a LaTeX table* environment.

    mean_df, std_df       -- values shown as "mean \\pm std" with 3 decimals;
                             rows and columns are taken from mean_df
    ismax_df, ismax2nd_df -- boolean frames with the same labels; a True entry
                             wraps the cell in \\mathbf{} / \\mathit{}
    title, label          -- text of \\caption{} and \\label{}

    Returns the LaTeX source as one string.
    '''
    def render_cell(row, col):
        # One "& $...$ " cell, optionally wrapped in a highlighting command.
        best = ismax_df.loc[row, col] == True
        second = ismax2nd_df.loc[row, col] == True
        parts = ['& $']
        if best:
            parts.append('\\mathbf{')
        if second:
            parts.append('\\mathit{')
        parts.append('%.3f' % mean_df.loc[row, col] + '\\pm' + '%.3f' % std_df.loc[row, col])
        if best or second:
            parts.append('}')
        parts.append('$ ')
        return ''.join(parts)

    columns = list(mean_df.columns)

    out = [
        '\\begin{table*}[t]\n',
        '\\caption{' + title + '}\n',
        '\\label{' + label + '}\n',
        '\\centering\n',
        '\\begin{tabular}{l|' + 'c' * mean_df.shape[1] + '} \\hline\n',
    ]

    # header row: one entry per column
    out.extend(' & ' + col for col in columns)
    out.append(' \\\\ \\hline\n')

    # body: row label followed by the formatted cells
    for row in mean_df.index:
        cells = [render_cell(row, col) for col in columns]
        if cells:
            out.append(row + ' ')
        out.extend(cells)
        out.append('\\\\\n')

    out.extend(['\\hline\n', '\\end{tabular}\n', '\\end{table*}\n'])
    return ''.join(out)

Generate evaluation data tables.

In [12]:
# Metric used to build the table below; swap the two lines to report
# pairs-F1 instead of F1.
func = calc_F1
# func = calc_pairsF1

In [13]:
# Build one (method x dataset) table each for the mean score, its standard
# deviation and the number of scored queries, then render the LaTeX table.
methods = methods_all.copy()

table_shape = (len(methods), len(dat_names))

# Builtin float/bool are used as dtypes: the np.float / np.bool aliases no
# longer exist in current NumPy releases.
mean_df = pd.DataFrame(data=np.zeros(table_shape, dtype=float),
                       columns=dat_names, index=methods)
std_df  = pd.DataFrame(data=np.zeros(table_shape, dtype=float),
                       columns=dat_names, index=methods)

# Number of scored queries per method and dataset (sample sizes for the t-tests).
lens_df = pd.DataFrame(data=np.zeros(table_shape, dtype=float),
                       columns=dat_names, index=methods)

for dat_ix in range(len(dat_suffix)):
    recdict_rank, recdict_tran, recdict_comb, recdict_rand, recdict_ijcai = load_results(dat_ix)
    means, stds, lens = calc_metrics(recdict_rank, recdict_tran, recdict_comb, recdict_rand, recdict_ijcai, func)
    assert(len(means) == len(stds) == len(lens) == len(methods))
    mean_df[dat_names[dat_ix]] = means
    std_df[dat_names[dat_ix]]  = stds
    lens_df[dat_names[dat_ix]] = lens

# Flag the best and the second-best method of every dataset (column).
ismax_df = pd.DataFrame(data=np.zeros(mean_df.shape, dtype=bool), columns=mean_df.columns, index=mean_df.index)
ismax2nd_df = ismax_df.copy()
for col_pos, col in enumerate(ismax_df.columns):
    # Row positions of the two largest means in this column.
    indices = np.argsort(-mean_df[col].to_numpy())[:2]
    # Single positional write: the chained form `.iloc[row][col] = True` may
    # assign to a temporary copy and leave the frame unchanged.
    ismax_df.iloc[indices[0], col_pos] = True
    ismax2nd_df.iloc[indices[1], col_pos] = True

if func == calc_F1:
    title = '''Performance comparison on five datasets in terms of F$_1$ scores. 
    The best method for each dataset (i.e., a column) is shown in bold, the second best is shown in italic.'''
    label = 'tab:f1'
else:
    title = '''Performance comparison on five datasets in terms of pairs-F$_1$ scores.
    The best method for each dataset (i.e., a column) is shown in bold, the second best is shown in italic.'''
    label = 'tab:pairf1'
strs = gen_latex_table(mean_df, std_df, ismax_df, ismax2nd_df, title, label)

print(strs)

\begin{table*}[t]
\caption{Performance comparison on five datasets in terms of F$_1$ scores. 
    The best method for each dataset (i.e., a column) is shown in bold, the second best is shown in italic.}
\label{tab:f1}
\centering
\begin{tabular}{l|ccccc} \hline
 & Edinburgh & Glasgow & Melbourne & Osaka & Toronto \\ \hline
\textsc{Random} & $0.570\pm0.139$ & $0.632\pm0.108$ & $0.558\pm0.149$ & $0.618\pm0.129$ & $0.605\pm0.118$ \\
\textsc{PersTour} & $0.656\pm0.223$ & $\mathbf{0.801\pm0.213}$ & $0.483\pm0.208$ & $0.686\pm0.231$ & $\mathbf{0.720\pm0.215}$ \\
\textsc{PersTour-L} & $0.651\pm0.143$ & $0.660\pm0.102$ & $0.576\pm0.141$ & $0.686\pm0.137$ & $0.643\pm0.113$ \\
\textsc{PoiPopularity} & $\mathbf{0.701\pm0.160}$ & $\mathit{0.745\pm0.166}$ & $0.620\pm0.136$ & $0.663\pm0.125$ & $0.678\pm0.121$ \\
\textsc{PoiRank} & $\mathit{0.700\pm0.155}$ & $0.679\pm0.123$ & $\mathit{0.637\pm0.142}$ & $0.640\pm0.135$ & $0.600\pm0.106$ \\
\textsc{Markov} & $0.645\pm0.169$ & $0.722\pm0.165$ & $0.577\pm

## Student's t-Test for Independent samples

We omitted Edinburgh and Melbourne because, even after 16 hours, the runs had not come close to producing a result. To calculate and evaluate significance tests for the results above, we used the following structure.

There are 9 methods in total. Each method is represented by its corresponding index:
* __0__ - Random baseline
* __1__ - PersTour
* __2__ - PersTour-L
* __3__ - PoiPopularity
* __4__ - PoiRank
* __5__ - Markov
* __6__ - MarkovPath
* __7__ - Rank+Markov
* __8__ - Rank+MarkovPath

Also, we have to omit the Edinburgh and Melbourne datasets because, even after 16 hours, the runs had not finished. This leaves a total of 3 cities, each with its corresponding index:
* __0__ - Glasgow
* __1__ - Osaka
* __2__ - Toronto

In the paper, the authors made the following claims about performance:

* all algorithms outperformed random baseline on all datasets
* PoiPopularity outperformed PoiRank
* Rank+Markov outperformed Markov 
* Rank+MarkovPath outperformed MarkovPath
* PersTour outperformed PersTour-L

In the cells below, we run significance tests to confirm or reject these hypotheses.

In [14]:
from scipy.stats import ttest_ind_from_stats

# Cities (columns of mean_df) that take part in the significance tests.
cities_list = ['Glasgow', 'Osaka', 'Toronto']

# Every method except the Random baseline, in the row order of mean_df.
methods_list = ['PersTour', 'PersTour-L', 'PoiPopularity',
                'PoiRank', 'Markov', 'MarkovPath',
                'Rank+Markov', 'Rank+MarkovPath']
assert len(methods_list) == len(mean_df.index) - 1

all_results_df = pd.DataFrame(data=np.zeros((len(cities_list), len(methods_list)), dtype=float),
                              columns=methods_list, index=cities_list)

# city arrays containing p-value for random vs all other methods
glasgow_pvalue = []
osaka_pvalue = []
toronto_pvalue = []
results_list = [glasgow_pvalue, osaka_pvalue, toronto_pvalue]

random_row = 0  # row of the Random baseline in mean_df / std_df / lens_df

# NOTE(review): std_df is filled from np.std (ddof=0) in calc_metrics, while
# SciPy documents std1/std2 as corrected (ddof=1) sample standard deviations;
# confirm whether the values should be rescaled before testing.
for city, city_pvalues in zip(cities_list, results_list):
    # Look the column up by name instead of relying on fixed positions.
    city_col = mean_df.columns.get_loc(city)
    # Random baseline vs. every other method (rows 1 .. last).
    for other_row in range(1, len(mean_df.index)):
        tt = ttest_ind_from_stats(
            mean1=mean_df.values[random_row, city_col],
            std1=std_df.values[random_row, city_col],
            nobs1=lens_df.values[random_row, city_col],
            mean2=mean_df.values[other_row, city_col],
            std2=std_df.values[other_row, city_col],
            nobs2=lens_df.values[other_row, city_col],
            equal_var=False)
        city_pvalues.append(round(tt.pvalue, 4))

# Create Resulting Dataframe from the city result arrays
for city, city_pvalues in zip(cities_list, results_list):
    all_results_df.loc[city] = city_pvalues

# os.path.join keeps the output path valid on every OS; a literal backslash
# path only denotes sub-directories on Windows.
significance_dir = os.path.join(data_dir, 'Significance-Testing')
os.makedirs(significance_dir, exist_ok=True)
all_results_df.transpose().to_excel(os.path.join(significance_dir, 'random_vs_others.xlsx'), index=True)

In [15]:
# p-values of Random vs. each other method (rows: cities, columns: methods).
all_results_df

Unnamed: 0,PersTour,PersTour-L,PoiPopularity,PoiRank,Markov,MarkovPath,Rank+Markov,Rank+MarkovPath
Glasgow,0.0,0.044,0.0,0.0025,0.0,0.0,0.0,0.0
Osaka,0.0797,0.0143,0.0861,0.4219,0.007,0.003,0.007,0.003
Toronto,0.0,0.0,0.0,0.5404,0.0,0.0,0.0,0.0


In [16]:
# POI popularity > poi_rank
poi_rank_list = ['PoiRank']

poipop_vs_poirank_df = pd.DataFrame(data=np.zeros((len(cities_list), len(poi_rank_list)), dtype=float),
                                    columns=poi_rank_list, index=cities_list)

poi_rank_res = []

# Row positions in mean_df / std_df / lens_df: PoiPopularity and PoiRank.
poipop_row, poirank_row = 3, 4

# One test per city; columns are looked up by name instead of fixed positions.
for city in cities_list:
    city_col = mean_df.columns.get_loc(city)
    tt = ttest_ind_from_stats(
        mean1=mean_df.values[poipop_row, city_col],
        std1=std_df.values[poipop_row, city_col],
        nobs1=lens_df.values[poipop_row, city_col],
        mean2=mean_df.values[poirank_row, city_col],
        std2=std_df.values[poirank_row, city_col],
        nobs2=lens_df.values[poirank_row, city_col],
        equal_var=False)
    poi_rank_res.append(round(tt.pvalue, 4))

poipop_vs_poirank_df[poi_rank_list[0]] = poi_rank_res

# os.path.join keeps the output path valid on every OS; a literal backslash
# path only denotes sub-directories on Windows.
significance_dir = os.path.join(data_dir, 'Significance-Testing')
os.makedirs(significance_dir, exist_ok=True)
poipop_vs_poirank_df.transpose().to_excel(os.path.join(significance_dir, 'poipop_vs_poirank.xlsx'), index=True)

In [17]:
# p-values of PoiPopularity vs. PoiRank, one row per city.
poipop_vs_poirank_df

Unnamed: 0,PoiRank
Glasgow,0.001
Osaka,0.3858
Toronto,0.0


In [18]:
# Rank+Markov > Markov

markov_list = ['Markov']

rankmarkov_vs_markov_df = pd.DataFrame(data=np.zeros((len(cities_list), len(markov_list)), dtype=float),
                                       columns=markov_list, index=cities_list)

markov_res = []

# Row positions in mean_df / std_df / lens_df: Rank+Markov and Markov.
rankmarkov_row, markov_row = 7, 5

# One test per city; columns are looked up by name instead of fixed positions.
for city in cities_list:
    city_col = mean_df.columns.get_loc(city)
    tt = ttest_ind_from_stats(
        mean1=mean_df.values[rankmarkov_row, city_col],
        std1=std_df.values[rankmarkov_row, city_col],
        nobs1=lens_df.values[rankmarkov_row, city_col],
        mean2=mean_df.values[markov_row, city_col],
        std2=std_df.values[markov_row, city_col],
        nobs2=lens_df.values[markov_row, city_col],
        equal_var=False)
    markov_res.append(round(tt.pvalue, 4))

rankmarkov_vs_markov_df[markov_list[0]] = markov_res

# os.path.join keeps the output path valid on every OS; a literal backslash
# path only denotes sub-directories on Windows.
significance_dir = os.path.join(data_dir, 'Significance-Testing')
os.makedirs(significance_dir, exist_ok=True)
rankmarkov_vs_markov_df.transpose().to_excel(os.path.join(significance_dir, 'rankmarkov_vs_markov.xlsx'), index=True)


In [19]:
# p-values of Rank+Markov vs. Markov, one row per city.
rankmarkov_vs_markov_df

Unnamed: 0,Markov
Glasgow,1.0
Osaka,1.0
Toronto,1.0


In [20]:
# Rank+MarkovPath > MarkovPath

markovpath_list = ['MarkovPath']

rankmarkovpath_markovpath_df = pd.DataFrame(data=np.zeros((len(cities_list), len(markovpath_list)), dtype=float),
                                            columns=markovpath_list, index=cities_list)

markovpath_res = []

# Row positions in mean_df / std_df / lens_df: Rank+MarkovPath and MarkovPath.
rankmarkovpath_row, markovpath_row = 8, 6

# One test per city; columns are looked up by name instead of fixed positions.
for city in cities_list:
    city_col = mean_df.columns.get_loc(city)
    tt = ttest_ind_from_stats(
        mean1=mean_df.values[rankmarkovpath_row, city_col],
        std1=std_df.values[rankmarkovpath_row, city_col],
        nobs1=lens_df.values[rankmarkovpath_row, city_col],
        mean2=mean_df.values[markovpath_row, city_col],
        std2=std_df.values[markovpath_row, city_col],
        nobs2=lens_df.values[markovpath_row, city_col],
        equal_var=False)
    markovpath_res.append(round(tt.pvalue, 4))

rankmarkovpath_markovpath_df[markovpath_list[0]] = markovpath_res

# os.path.join keeps the output path valid on every OS; a literal backslash
# path only denotes sub-directories on Windows.
significance_dir = os.path.join(data_dir, 'Significance-Testing')
os.makedirs(significance_dir, exist_ok=True)
rankmarkovpath_markovpath_df.transpose().to_excel(os.path.join(significance_dir, 'rankmarkovpath_markovpath.xlsx'), index=True)

In [21]:
# p-values of Rank+MarkovPath vs. MarkovPath, one row per city.
rankmarkovpath_markovpath_df

Unnamed: 0,MarkovPath
Glasgow,1.0
Osaka,1.0
Toronto,0.9944


In [22]:
# PersTour > PersTour-L

perstour_list = ['PersTour-L']

perstour_vs_perstourl_df = pd.DataFrame(data=np.zeros((len(cities_list), len(perstour_list)), dtype=float),
                                        columns=perstour_list, index=cities_list)

# (name kept as in the original notebook, including its spelling)
prestour_res = []

# Row positions in mean_df / std_df / lens_df: PersTour and PersTour-L.
perstour_row, perstourl_row = 1, 2

# One test per city; columns are looked up by name instead of fixed positions.
for city in cities_list:
    city_col = mean_df.columns.get_loc(city)
    tt = ttest_ind_from_stats(
        mean1=mean_df.values[perstour_row, city_col],
        std1=std_df.values[perstour_row, city_col],
        nobs1=lens_df.values[perstour_row, city_col],
        mean2=mean_df.values[perstourl_row, city_col],
        std2=std_df.values[perstourl_row, city_col],
        nobs2=lens_df.values[perstourl_row, city_col],
        equal_var=False)
    prestour_res.append(round(tt.pvalue, 4))

perstour_vs_perstourl_df[perstour_list[0]] = prestour_res

# os.path.join keeps the output path valid on every OS; a literal backslash
# path only denotes sub-directories on Windows.
significance_dir = os.path.join(data_dir, 'Significance-Testing')
os.makedirs(significance_dir, exist_ok=True)
perstour_vs_perstourl_df.transpose().to_excel(os.path.join(significance_dir, 'perstour_vs_perstourl.xlsx'), index=True)

In [23]:
# p-values of PersTour vs. PersTour-L, one row per city.
display(perstour_vs_perstourl_df)

Unnamed: 0,PersTour-L
Glasgow,0.0
Osaka,0.9991
Toronto,0.0
