In [None]:
%cd -q data/actr_reco

In [2]:
import pandas as pd
import numpy as np
from collections import namedtuple
from functools import partial
from datetime import datetime

import tqdm
tqdm.tqdm.pandas()

In [3]:
# Configure parameters
CALC_REWARD = True
USE_CONTENT = True

In [4]:
if USE_CONTENT:
    word2vec_file = "../emo_mem_reco/word2vec_100.csv"
    word2vec_df = pd.read_csv(word2vec_file, sep="\t", error_bad_lines=False, warn_bad_lines=False)
    word2vec_df["item"] = list(zip(word2vec_df["track"], word2vec_df["artist"]))
    word2vec_df = word2vec_df.set_index("item").drop(columns=["unique_id", "track_id", "track", "artist_id", "artist"])
    w2v_cols = word2vec_df.columns.tolist()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
if CALC_REWARD:
    dur_cols = ["track", "artist", "playcount", "track_listeners", "duration"]
    durations = pd.read_csv("../emo_mem_reco/LFM-2b_track_artist_pc_ls_dur.txt", sep="\t", names=dur_cols)
    durations["item"] = list(zip(durations["track"], durations["artist"]))
    durations["duration_td"] = pd.to_timedelta(durations["duration"], unit="ms")
    durations = durations.set_index("item").drop(columns=["track", "artist", "playcount", "track_listeners"])
    durations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2495639 entries, ('Somebody That I Used to Know', 'Gotye') to ('Venus (12" Hellfire Mix)', 'Bananarama')
Data columns (total 2 columns):
 #   Column       Dtype          
---  ------       -----          
 0   duration     int64          
 1   duration_td  timedelta64[ns]
dtypes: int64(1), timedelta64[ns](1)
memory usage: 57.1+ MB


In [6]:
with open("sampled.txt", "r") as f:
    test_users = f.readlines()
    
test_users = [int(user.strip()) for user in test_users]
len(test_users)

300

In [7]:
test_user = test_users[1]
test_user

73151

In [8]:
filename = f"user_split/listening_events_2019_{test_user}.tsv"

In [9]:
!head {filename}

13432	73151	TRAP DEL TERRAPLANISMO	Jaime Altozano	Trap Del Terraplanismo	f	ES	111	2019-01-01 00:50:42
290649	73151	Bienvenido Al Desastre	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:06:11
291548	73151	Almas	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:09:53
292549	73151	Enredados	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:14:16
293272	73151	Involución	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:17:40
293947	73151	Más Que una Leyenda	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:21:02
294675	73151	Arrástrame al Infierno	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:24:30
295383	73151	Vivo De Rodillas	Megara	Siete	f	ES	111	2019-01-01 23:28:01
296016	73151	Cuenta Atrás	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:30:54
296778	73151	El Hombre de Arena	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:34:32


In [10]:
def load_file(filename):
    col_names = ['user', 'track', 'artist', 'album', 'gender', 'country', 'age', 'timestamp']
    events = pd.read_csv(filename, names=col_names,
                        quoting=3, sep="\t", header=None, encoding='utf-8'
                        )
    events["timestamp"] = pd.to_datetime(events["timestamp"])
    return events

events = load_file(filename)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40
...,...,...,...,...,...,...,...,...
133876543,73151,Ticking Bombs,Go Betty Go,Nothing Is More,f,ES,111,2019-12-28 12:36:35
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50


In [11]:
def preprocess(events, use_content=USE_CONTENT, calc_reward=CALC_REWARD):
    events["prev_timestamp"] = events.groupby("user")["timestamp"].shift()
    events["gap"] = events["timestamp"] - events["prev_timestamp"]
    events["new_session"] = events["gap"] > pd.Timedelta("30min")
    events["new_session_int"] = events["new_session"].astype(int)
    events["session"] = events.groupby("user")["new_session_int"].cumsum()
    events["session_duration"] = events.groupby(["user", "session"])["timestamp"].transform(lambda x: x.iloc[-1] - x.iloc[0])
    events["item"] = list(zip(events["track"], events["artist"])) #, events["album"]))
    events["all_pos"] = 1
    
    if use_content:
        events = events.join(word2vec_df, on="item", how="left")
    
    if calc_reward:
        events = events.join(durations, on="item", how="left")
        events["timestamp_end"] = events["timestamp"] + events["duration_td"]
        events["timestamp_start_next"] = events["timestamp"].shift(-1)
        events["play_duration"] = (events["timestamp_start_next"] - events["timestamp"]).dt.seconds * 1000
        events["gap"] = (events["timestamp_start_next"] - events["timestamp_end"]).dt.seconds
        events["min_duration"] = events[["play_duration", "duration"]].min(axis=1)
        events["play_ratio"] = events["min_duration"] / events["duration"]
        
        def reward_function(play_ratio):
            if play_ratio >= 0.66:
                return 1
            elif play_ratio > 0.33 and play_ratio < 0.66:
                return 0
            else:  # play_ratio <= 0.33:
                return -1

        events["reward"] = events["play_ratio"].apply(reward_function)
    
    return events

events = preprocess(events)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp,prev_timestamp,gap,...,(99),(100),duration,duration_td,timestamp_end,timestamp_start_next,play_duration,min_duration,play_ratio,reward
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42,NaT,79943.0,...,-0.681730,-0.239689,186000.0,0 days 00:03:06,2019-01-01 00:53:48,2019-01-01 23:06:11,80129000.0,186000.0,1.000000,1
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11,2019-01-01 00:50:42,,...,,,,NaT,NaT,2019-01-01 23:09:53,222000.0,222000.0,,-1
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53,2019-01-01 23:06:11,,...,,,,NaT,NaT,2019-01-01 23:14:16,263000.0,263000.0,,-1
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16,2019-01-01 23:09:53,,...,,,,NaT,NaT,2019-01-01 23:17:40,204000.0,204000.0,,-1
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40,2019-01-01 23:14:16,,...,,,,NaT,NaT,2019-01-01 23:21:02,202000.0,202000.0,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52,2019-12-28 12:36:35,86358.0,...,-0.129389,0.255491,256000.0,0 days 00:04:16,2019-12-28 12:45:08,2019-12-28 12:44:26,214000.0,214000.0,0.835938,1
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26,2019-12-28 12:40:52,231.0,...,0.366338,-0.252615,213000.0,0 days 00:03:33,2019-12-28 12:47:59,2019-12-28 12:51:50,444000.0,213000.0,1.000000,1
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50,2019-12-28 12:44:26,86159.0,...,0.236593,0.005050,241000.0,0 days 00:04:01,2019-12-28 12:55:51,2019-12-28 12:51:50,0.0,0.0,0.000000,-1
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50,2019-12-28 12:44:26,428.0,...,0.236593,0.005050,241000.0,0 days 00:04:01,2019-12-28 12:55:51,2019-12-28 13:02:59,669000.0,241000.0,1.000000,1


In [12]:
%run baseline_models.py
mr = MostRecent()

In [13]:
%run transition_models.py
ubtp = UserBasedTransitionProbability()

In [14]:
%run emomem_model.py
bll = BaseLevelComponent(decay=0.5)
assoc = AssociativeComponent()
word2vec = PartialMatchingComponent("Word2Vec" + str(len(w2v_cols)), feature_cols=w2v_cols)
valu_allpos = ValuationComponent("AllPos", reward_col="all_pos")
noise = NoiseComponent()

In [15]:
# BLL configured
bll_short = BaseLevelComponent(decay=0.860)
bll_long = BaseLevelComponent(decay=1.737)

In [16]:
# Other models

valu_posneg = ValuationComponent("PosNeuNeg", reward_col="reward")
valu_ratio = ValuationComponent("ValueRatio", reward_col="play_ratio")

actr_bav = ActrRecommender([bll, assoc, valu_allpos], weights=[1, 1, 1], softmax=True)
actr_bv = ActrRecommender([bll, valu_allpos], weights=[1, 1, 1], softmax=True)
actr_ba = ActrRecommender([bll, assoc], weights=[1, 1], softmax=True)
actr_av = ActrRecommender([assoc, valu_allpos], weights=[1, 1], softmax=True)

In [17]:
%run data_splitter.py

In [18]:
algo_list = [
    mr,
    ubtp,
    bll,
    assoc,
    word2vec,
    valu_allpos,
    noise,
    
    valu_posneg,
    valu_ratio,
    
    bll_short,
    bll_long,
    
    actr_bav,
    actr_bv,
    actr_ba,
    actr_av,
]

algo_list

[<__main__.MostRecent at 0x7f846c361cd0>,
 <__main__.UserBasedTransitionProbability at 0x7f846c361dc0>,
 <__main__.BaseLevelComponent at 0x7f846c237610>,
 <__main__.AssociativeComponent at 0x7f846c237790>,
 <__main__.PartialMatchingComponent at 0x7f846c361f10>,
 <__main__.ValuationComponent at 0x7f846c2377f0>,
 <__main__.NoiseComponent at 0x7f846c2376a0>,
 <__main__.ValuationComponent at 0x7f8464ba2c40>,
 <__main__.ValuationComponent at 0x7f8464ba2ee0>,
 <__main__.BaseLevelComponent at 0x7f846c237460>,
 <__main__.BaseLevelComponent at 0x7f846c2373d0>,
 <__main__.ActrRecommender at 0x7f8464ba2cd0>,
 <__main__.ActrRecommender at 0x7f8464ba2fa0>,
 <__main__.ActrRecommender at 0x7f8464ba2e80>,
 <__main__.ActrRecommender at 0x7f846c22eca0>]

In [19]:
def generate_single_reco(algo, train, test, user, split_f):
    expected = test["item"].values.tolist()
    unique_test = test["item"].nunique()
   
    start_time = datetime.now()
    res = algo.recommend(train, unique_test)
    end_time = datetime.now()

    pred = {
        "algo": str(algo),
        "split_f": str(split_f),
        "user": user,
        "pos": len(train),
        "pred": res,
        "pred_len": len(res),
        "exp": expected,
        "unique_exp": unique_test,
        "mu_s": (end_time - start_time).microseconds
    }
    return pred
        
def generate_recomms_user_df(user_df, algo, split_f):
    user = user_df.index[0]  # Assume single user only
    preds = [generate_single_reco(algo, train, test, user, split_f) for train, test in split_f(user_df)]
    return preds

def generate_recomms(algo, events, split_f):
    generate_recomms_user_df_with_splitter = partial(generate_recomms_user_df, algo=algo, split_f=split_f)
    
    preds = events.groupby("user").apply(generate_recomms_user_df_with_splitter)
    pred_df = pd.concat([pd.DataFrame.from_records(x) for x in preds])
    return pred_df

In [None]:
import os

for test_user in tqdm.tqdm(test_users):
    filename = f"user_split/listening_events_2019_{test_user}.tsv"
    
    # Shortcute if all path exists
    existing_paths = [os.path.exists(f"preds/LFM-2b_2019_{algo}_topn_preds_{test_user}.csv") for algo in algo_list]
    if all(existing_paths):
        print(f"Skipping {test_user}")
        continue
    
    print(test_user)
    events = load_file(filename)
    events = preprocess(events)
    for algo in algo_list:
        if os.path.exists(f"preds/LFM-2b_2019_{algo}_topn_preds_{test_user}.csv"):
            print(f"Skipping {test_user} - {algo}")
            continue
        print(f"Predictions for {algo}")
        data_splitter = DataSplitter(Slider(step=1), TrainTimeDelta(pd.Timedelta("7days")), TestRemainingSession(), ValidSessionDuration())
        pred_df = generate_recomms(algo, events, data_splitter)
        pred_df.to_csv(f"preds/LFM-2b_2019_{algo}_topn_preds_{test_user}.csv", header=False)