In [1]:
%cd -q data/actr_reco

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.dpi'] = 200

import glob
import tqdm

In [3]:
fileglob = "preds/LFM-2b_2019_*_topn_preds_*.csv"
all_files = glob.glob(fileglob)

In [4]:
limit = 150
with open("sampled.txt", "r") as f:
    users = f.readlines()
    
users = [int(user.strip()) for user in users]
# limit users
users = users[:limit]
all_files = [file for file in all_files if any(f"preds_{user}" in file for user in users)]
len(users), len(all_files)

(150, 2335)

In [5]:
def next_item_hr(row):
    assert len(row["exp"])
    if not len(row["pred"]):
        return 0
    return 1 if row["exp"][0] == row["pred"][0] else 0

In [6]:
def rprecision(row):
    """Only works for remaining session predictions correctly.
    Not the original implementation due to duplicates."""
    R = len(set(row["exp"]))
    r = len(set(row["pred"]).intersection(set(row["exp"])))
    return r/R

In [7]:
def aggregate(pred_df):
    pred_df["r-precision"] = pred_df.apply(rprecision, axis=1)
    pred_df["next_item_hr"] = pred_df.apply(next_item_hr, axis=1)
    
    agg_df = pred_df.groupby("algo").agg({"r-precision": "sum", "next_item_hr": "sum", "pos": "count", "user": "nunique", "algo": "nunique"})
    return agg_df

In [8]:
# Read first file
def read_file_and_aggregate(filename):
    pred_df = pd.read_csv(filename, names=[
        "algo",
        "split_f",
        "user",
        "pos",
        "pred",
        "pred_len",
        "exp",
        "unique_exp",
        "mu_s"],
        index_col=0)
    pred_df["pred"] = pred_df["pred"].map(eval)
    pred_df["exp"] = pred_df["exp"].map(eval)
    return aggregate(pred_df)

agg_df = read_file_and_aggregate(all_files[0])
agg_df

Unnamed: 0_level_0,r-precision,next_item_hr,pos,user,algo
algo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MostRecent,130.733717,25,6657,1,1


In [None]:
# Read reamining files
for filename in tqdm.tqdm(all_files[1:], bar_format="{l_bar}{bar}{r_bar} %i"):
    try:
        calc_df = read_file_and_aggregate(filename)
    except Exception as e:
        print(e)
        print(filename)
        continue
        
    assert len(calc_df.index) == 1
    # Element-wise aggregation
    if calc_df.index[0] not in agg_df.index:
        agg_df = pd.concat([agg_df, calc_df])
    else:
        agg_df = calc_df.combine(agg_df, lambda s1, s2: s1 + s2, fill_value=0, overwrite=False)
agg_df

In [10]:
agg_df.to_csv("aggregate_results.csv")

In [None]:
agg_df["r-precision"] = agg_df["r-precision"] / agg_df["pos"]
agg_df["next_item_hr"] = agg_df["next_item_hr"] / agg_df["pos"]
agg_df = agg_df.sort_values("r-precision")
agg_df.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))

In [12]:
agg_df.index = agg_df.index.map({
    "Word2Vec100": "Partial Matching",
    "NoiseComponent": "Noise",
    "AssociativeComponent": "Spreading",
    
    "AllPos": "Valuation(MP)",
    "PosNeuNeg": "Valuation(Discrete)",
    "ValueRatio": "Valuation(Ratio)",
    
    "BaseLevelComponent1.737": "Base-level(full)",
    "BaseLevelComponent": "Base-level(default)",
    "BaseLevelComponent0.86": "Base-level(week)",
    
    "MostRecent": "MostRecent",
    "UserBasedTransitionProbability": "TransProb",
    
    "ActrRecommender(AssociativeComponent,AllPos)": "ACT-R(S,V)",
    "ActrRecommender(BaseLevelComponent,AllPos)": "ACT-R(B,V)",
    "ActrRecommender(BaseLevelComponent,AssociativeComponent)": "ACT-R(B,S)",
    "ActrRecommender(BaseLevelComponent,AssociativeComponent,AllPos)": "ACT-R(B,S,V)",
})

In [14]:
print(agg_df[["r-precision", "next_item_hr"]].to_latex(float_format="{:.5f}".format))

\begin{tabular}{lrr}
\toprule
{} &  r-precision &  next\_item\_hr \\
algo                &              &               \\
\midrule
NaN                 &      0.00000 &       0.00000 \\
NaN                 &      0.00083 &       0.00007 \\
NaN                 &      0.00121 &       0.00003 \\
NaN                 &      0.01563 &       0.00093 \\
TransProb           &      0.03839 &       0.15907 \\
Partial Matching    &      0.03895 &       0.01320 \\
Noise               &      0.03996 &       0.00289 \\
Valuation(Discrete) &      0.04751 &       0.00533 \\
Valuation(Ratio)    &      0.05987 &       0.01042 \\
Valuation(MP)       &      0.08436 &       0.01477 \\
NaN                 &      0.08951 &       0.02983 \\
NaN                 &      0.08959 &       0.03046 \\
NaN                 &      0.09128 &       0.03041 \\
Spreading           &      0.09235 &       0.02117 \\
Base-level(full)    &      0.09903 &       0.03200 \\
ACT-R(B,V)          &      0.10069 &       0.02416 \\
Most