In [1]:
# Switch the kernel's working directory so the relative paths used below
# (sampled.txt, user_split/, preds/, ../emo_mem_reco/) resolve. `-q` silences the echo.
# NOTE(review): the path is relative, so re-running this cell in the same kernel
# will try to cd a second time — restart the kernel before re-running.
%cd -q data/actr_reco

In [2]:
import os
from collections import namedtuple
from datetime import datetime
from functools import partial

import numpy as np
import pandas as pd
import tqdm

tqdm.tqdm.pandas()

In [3]:
# Configure parameters
# CALC_REWARD: when True, the track-duration table is loaded and preprocess()
#              derives play ratios and a -1/0/1 reward per listening event.
# USE_CONTENT: when True, the word2vec feature table is loaded and preprocess()
#              joins it onto the events.
CALC_REWARD = False
USE_CONTENT = False

In [4]:
if USE_CONTENT:
    # Content features, keyed by the (track, artist) tuple that preprocess() calls "item".
    word2vec_file = "../emo_mem_reco/word2vec_100.csv"
    try:
        # pandas >= 1.3: skip malformed rows without raising or warning.
        word2vec_df = pd.read_csv(word2vec_file, sep="\t", on_bad_lines="skip")
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy flags
        # (these were deprecated in 1.3 and removed in 2.0).
        word2vec_df = pd.read_csv(word2vec_file, sep="\t", error_bad_lines=False, warn_bad_lines=False)
    word2vec_df["item"] = list(zip(word2vec_df["track"], word2vec_df["artist"]))
    # Keep only the feature columns; identifiers are dropped once the index is set.
    word2vec_df = word2vec_df.set_index("item").drop(columns=["unique_id", "track_id", "track", "artist_id", "artist"])
    w2v_cols = word2vec_df.columns.tolist()

In [5]:
if CALC_REWARD:
    # Track durations, keyed by the (track, artist) tuple that preprocess() calls "item".
    # The file has no header row, so the column names are supplied here.
    dur_cols = ["track", "artist", "playcount", "track_listeners", "duration"]
    durations = (
        pd.read_csv("../emo_mem_reco/LFM-2b_track_artist_pc_ls_dur.txt", sep="\t", names=dur_cols)
        .assign(
            item=lambda frame: list(zip(frame["track"], frame["artist"])),
            # "duration" is interpreted as milliseconds.
            duration_td=lambda frame: pd.to_timedelta(frame["duration"], unit="ms"),
        )
        .set_index("item")
        .drop(columns=["track", "artist", "playcount", "track_listeners"])
    )
    durations.info()

In [6]:
# sampled.txt holds one integer user id per line.
with open("sampled.txt", "r") as f:
    # Skip blank lines (e.g. a trailing empty line) instead of failing on int("").
    # int() itself tolerates the surrounding whitespace/newline.
    test_users = [int(line) for line in f if line.strip()]

len(test_users)

300

In [7]:
# Pick a single sampled user to walk through the pipeline interactively.
# NOTE(review): index 1 (the second entry) looks arbitrary — confirm.
test_user = test_users[1]
test_user

73151

In [8]:
# Path of the per-user listening-events file for the selected user.
filename = f"user_split/listening_events_2019_{test_user}.tsv"

In [9]:
# Peek at the first raw lines of the file via the shell's `head`
# (IPython substitutes {filename} with the Python variable).
!head {filename}

13432	73151	TRAP DEL TERRAPLANISMO	Jaime Altozano	Trap Del Terraplanismo	f	ES	111	2019-01-01 00:50:42
290649	73151	Bienvenido Al Desastre	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:06:11
291548	73151	Almas	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:09:53
292549	73151	Enredados	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:14:16
293272	73151	Involución	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:17:40
293947	73151	Más Que una Leyenda	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:21:02
294675	73151	Arrástrame al Infierno	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:24:30
295383	73151	Vivo De Rodillas	Megara	Siete	f	ES	111	2019-01-01 23:28:01
296016	73151	Cuenta Atrás	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:30:54
296778	73151	El Hombre de Arena	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:34:32


In [10]:
def load_file(filename):
    """Read a headerless, tab-separated listening-events file into a DataFrame.

    Eight column names are supplied. When a file carries one extra leading
    field (as the sample shown above does), pandas uses that field as the row
    index. Quote characters are treated as ordinary text.

    Parameters
    ----------
    filename : str
        Path to the TSV file.

    Returns
    -------
    pd.DataFrame
        The events, with ``timestamp`` converted to datetime.
    """
    columns = ['user', 'track', 'artist', 'album', 'gender', 'country', 'age', 'timestamp']
    raw = pd.read_csv(
        filename,
        sep="\t",
        header=None,
        names=columns,
        quoting=3,  # 3 == csv.QUOTE_NONE
        encoding='utf-8',
    )
    return raw.assign(timestamp=pd.to_datetime(raw["timestamp"]))

# Load the selected user's raw events and display them.
events = load_file(filename)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40
...,...,...,...,...,...,...,...,...
133876543,73151,Ticking Bombs,Go Betty Go,Nothing Is More,f,ES,111,2019-12-28 12:36:35
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50


In [11]:
def preprocess(events, use_content=USE_CONTENT, calc_reward=CALC_REWARD):
    """Add session, item and (optionally) content / reward columns to listening events.

    Parameters
    ----------
    events : pd.DataFrame
        Listening events with at least the columns ``user``, ``timestamp``,
        ``track`` and ``artist``. Previous/next events are found by shifting
        within each user, so rows are assumed to be in chronological order
        per user (TODO confirm for all input files).
    use_content : bool
        When True, left-join the module-level ``word2vec_df`` on ``item``.
        That table only exists if the USE_CONTENT cell loaded it.
    calc_reward : bool
        When True, left-join the module-level ``durations`` on ``item`` and
        derive ``play_duration`` (ms), ``gap`` (seconds between the nominal end
        of a track and the start of the next one; negative when the track was
        cut short), ``play_ratio`` and ``reward``. That table only exists if the
        CALC_REWARD cell loaded it.

    Returns
    -------
    pd.DataFrame
        A new frame with the derived columns; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    events = events.copy()

    events["prev_timestamp"] = events.groupby("user")["timestamp"].shift()
    events["gap"] = events["timestamp"] - events["prev_timestamp"]
    # A pause of more than 30 minutes starts a new session. The first event of a
    # user has a NaT gap, which compares False and therefore lands in session 0.
    events["new_session"] = events["gap"] > pd.Timedelta("30min")
    events["new_session_int"] = events["new_session"].astype(int)
    events["session"] = events.groupby("user")["new_session_int"].cumsum()
    # Time between the first and the last event of each session.
    events["session_duration"] = events.groupby(["user", "session"])["timestamp"].transform(lambda x: x.iloc[-1] - x.iloc[0])
    events["item"] = list(zip(events["track"], events["artist"])) #, events["album"]))
    # Constant positive signal, available as a reward column.
    events["all_pos"] = 1

    if use_content:
        events = events.join(word2vec_df, on="item", how="left")

    if calc_reward:
        events = events.join(durations, on="item", how="left")
        events["timestamp_end"] = events["timestamp"] + events["duration_td"]
        # Next event of the *same* user, consistent with prev_timestamp above.
        events["timestamp_start_next"] = events.groupby("user")["timestamp"].shift(-1)
        # total_seconds() keeps the day part and the sign. `.dt.seconds` is only the
        # seconds component, so pauses longer than a day and negative gaps came out wrong.
        events["play_duration"] = (events["timestamp_start_next"] - events["timestamp"]).dt.total_seconds() * 1000
        events["gap"] = (events["timestamp_start_next"] - events["timestamp_end"]).dt.total_seconds()
        # A track cannot have been played for longer than its own duration.
        events["min_duration"] = events[["play_duration", "duration"]].min(axis=1)
        events["play_ratio"] = events["min_duration"] / events["duration"]

        def reward_function(play_ratio):
            """Map a play ratio to 1 (>= 0.66), 0 (between 0.33 and 0.66) or -1."""
            if play_ratio >= 0.66:
                return 1
            if play_ratio > 0.33:
                return 0
            # play_ratio <= 0.33. NaN ratios (no known duration) also end up here,
            # because every comparison with NaN is False.
            return -1

        events["reward"] = events["play_ratio"].apply(reward_function)

    return events

# Add the derived session/item columns (flags take their configured defaults) and display.
events = preprocess(events)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp,prev_timestamp,gap,new_session,new_session_int,session,session_duration,item,all_pos
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42,NaT,NaT,False,0,0,0 days 00:00:00,"(TRAP DEL TERRAPLANISMO, Jaime Altozano)",1
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11,2019-01-01 00:50:42,0 days 22:15:29,True,1,1,0 days 02:32:44,"(Bienvenido Al Desastre, Megara)",1
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53,2019-01-01 23:06:11,0 days 00:03:42,False,0,1,0 days 02:32:44,"(Almas, Megara)",1
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16,2019-01-01 23:09:53,0 days 00:04:23,False,0,1,0 days 02:32:44,"(Enredados, Megara)",1
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40,2019-01-01 23:14:16,0 days 00:03:24,False,0,1,0 days 02:32:44,"(Involución, Megara)",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133876543,73151,Ticking Bombs,Go Betty Go,Nothing Is More,f,ES,111,2019-12-28 12:36:35,2019-12-28 12:33:20,0 days 00:03:15,False,0,691,0 days 01:22:08,"(Ticking Bombs, Go Betty Go)",1
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52,2019-12-28 12:36:35,0 days 00:04:17,False,0,691,0 days 01:22:08,"(Sweet Curse, ReVamp)",1
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26,2019-12-28 12:40:52,0 days 00:03:34,False,0,691,0 days 01:22:08,"(Killers Like Candy, I Am Ghost)",1
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50,2019-12-28 12:44:26,0 days 00:07:24,False,0,691,0 days 01:22:08,"(Lucifer's Angel, The Rasmus)",1


In [12]:
# Execute baseline_models.py in the notebook namespace; MostRecent is presumably
# defined there. NOTE(review): `mr` is not part of algo_list below.
%run baseline_models.py
mr = MostRecent()

In [13]:
# Execute transition_models.py in the notebook namespace; UserBasedTransitionProbability
# is presumably defined there. NOTE(review): `ubtp` is not part of algo_list below.
%run transition_models.py
ubtp = UserBasedTransitionProbability()

In [14]:
# Execute emomem_model.py in the notebook namespace; it presumably provides the
# component classes and ActrRecommender used in the following cells.
%run emomem_model.py
assoc = AssociativeComponent()
# Valuation component configured with reward_col="all_pos", the column that
# preprocess() fills with the constant 1.
valu_allpos = ValuationComponent("AllPos", reward_col="all_pos")

In [15]:
# Base-level component with a decay of 0.860
# (BLL presumably stands for base-level learning — confirm).
bll_short = BaseLevelComponent(decay=0.860)

In [16]:
# Four recommenders over the same component list, in the order
# [AllPos valuation, AssociativeComponent, BaseLevelComponent(decay=0.86)].
# Each `weights` list has three entries, presumably one per component in that
# order — confirm against ActrRecommender.
# NOTE(review): the weight values are hard-coded and their origin is not recorded
# here; the names suggest log/linear fits with and without intercept — confirm.
actr_w1 = ActrRecommender([valu_allpos, assoc, bll_short], weights=[0.597353  , 4.02501687, 3.62860345], softmax=True, name="LogWithIntercept")
actr_w2 = ActrRecommender([valu_allpos, assoc, bll_short], weights=[-57.39723739,  -6.58720228, -24.80148528], softmax=True, name="LogNoIntercept")
actr_w3 = ActrRecommender([valu_allpos, assoc, bll_short], weights=[-0.04674839,  0.25708863,  0.34852751], softmax=True, name="LinWithIntercept")
actr_w4 = ActrRecommender([valu_allpos, assoc, bll_short], weights=[0.18126342, 0.2597506 , 0.34184948], softmax=True, name="LinNoIntercept")

In [17]:
# Execute data_splitter.py in the notebook namespace; it presumably defines
# DataSplitter, Slider, TrainTimeDelta, TestRemainingSession and
# ValidSessionDuration, which the batch-run cell uses.
%run data_splitter.py

In [18]:
# Algorithms evaluated in the batch run: the four weighted ACT-R recommenders.
# The baselines instantiated earlier (`mr`, `ubtp`) are not included.
algo_list = [
    actr_w1,
    actr_w2,
    actr_w3,
    actr_w4,
]

algo_list

[<__main__.ActrRecommender at 0x7fb531a141f0>,
 <__main__.ActrRecommender at 0x7fb531a14190>,
 <__main__.ActrRecommender at 0x7fb531a14220>,
 <__main__.ActrRecommender at 0x7fb531a141c0>]

In [19]:
def generate_single_reco(algo, train, test, user, split_f):
    """Run one recommendation for a train/test split and return a result record.

    Parameters
    ----------
    algo : object
        Recommender exposing ``recommend(train, n)``; ``str(algo)`` is stored
        as its label.
    train, test : pd.DataFrame
        Split of a user's events; ``test`` must have an ``item`` column.
    user : hashable
        Identifier stored in the record.
    split_f : object
        Splitter that produced the split; only ``str(split_f)`` is stored.

    Returns
    -------
    dict
        Keys: ``algo``, ``split_f``, ``user``, ``pos`` (number of training rows),
        ``pred``, ``pred_len``, ``exp`` (test items in order), ``unique_exp``
        (number of distinct test items, also the requested list length) and
        ``mu_s`` (elapsed time of the recommend call in microseconds).
    """
    expected = test["item"].values.tolist()
    unique_test = test["item"].nunique()

    start_time = datetime.now()
    res = algo.recommend(train, unique_test)
    elapsed = datetime.now() - start_time

    pred = {
        "algo": str(algo),
        "split_f": str(split_f),
        "user": user,
        "pos": len(train),
        "pred": res,
        "pred_len": len(res),
        "exp": expected,
        "unique_exp": unique_test,
        # Full elapsed time in microseconds. `timedelta.microseconds` is only the
        # sub-second component and wraps around for calls of one second or longer.
        "mu_s": round(elapsed.total_seconds() * 1_000_000),
    }
    return pred
        
def generate_recomms_user_df(user_df, algo, split_f):
    """Generate one prediction record per train/test split of a single user's events.

    Parameters
    ----------
    user_df : pd.DataFrame
        Events of exactly one user.
    algo : object
        Recommender passed through to ``generate_single_reco``.
    split_f : callable
        Called with ``user_df``; must yield ``(train, test)`` pairs.

    Returns
    -------
    list of dict
        One record per split, as produced by ``generate_single_reco``.
    """
    # Take the user id from the "user" column: the row index of the loaded event
    # files holds event ids, so index[0] would record the first event's id.
    # Frames without a "user" column keep the previous index-based behaviour.
    if "user" in user_df.columns:
        user = user_df["user"].iloc[0]  # Assume single user only
    else:
        user = user_df.index[0]
    preds = [generate_single_reco(algo, train, test, user, split_f) for train, test in split_f(user_df)]
    return preds

def generate_recomms(algo, events, split_f):
    """Collect prediction records for every user in ``events`` into one DataFrame.

    ``events`` is grouped by ``user``; each group goes through
    ``generate_recomms_user_df`` with the given ``algo`` and ``split_f``, and
    the per-user record lists are stacked (each keeps its own 0-based index).
    """
    per_user = partial(generate_recomms_user_df, algo=algo, split_f=split_f)
    records_by_user = events.groupby("user").apply(per_user)

    frames = [pd.DataFrame.from_records(user_records) for user_records in records_by_user]
    return pd.concat(frames)

In [None]:
# Resumable batch run: one predictions CSV per (user, algorithm). Existing
# output files are skipped, so an interrupted run can simply be restarted.
# NOTE(review): the file name embeds str(algo); this only works across kernel
# sessions if the recommender classes define a stable __str__ — confirm.
os.makedirs("preds", exist_ok=True)  # to_csv does not create missing directories

for test_user in tqdm.tqdm(test_users):
    print(test_user)
    filename = f"user_split/listening_events_2019_{test_user}.tsv"
    # Loaded lazily on the first algorithm that still needs predictions, so users
    # whose outputs all exist cost no file I/O or preprocessing.
    user_events = None
    for algo in algo_list:
        pred_path = f"preds/LFM-2b_2019_{algo}_topn_preds_{test_user}.csv"
        if os.path.exists(pred_path):
            print(f"Skipping {test_user} - {algo}")
            continue
        if user_events is None:
            user_events = preprocess(load_file(filename))
        print(f"Predictions for {algo}")
        # A fresh splitter per algorithm, as before (it may carry state).
        data_splitter = DataSplitter(Slider(step=1), TrainTimeDelta(pd.Timedelta("7days")), TestRemainingSession(), ValidSessionDuration())
        pred_df = generate_recomms(algo, user_events, data_splitter)
        pred_df.to_csv(pred_path, header=False)