In [1]:
%cd -q data/actr_reco

In [2]:
import pandas as pd
import numpy as np
from collections import namedtuple
from functools import partial
from datetime import datetime

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import MinMaxScaler

import tqdm
tqdm.tqdm.pandas()

In [3]:
limit_users_for_estimation = 15

In [4]:
word2vec_file = "../emo_mem_reco/word2vec_100.csv"
word2vec_df = pd.read_csv(word2vec_file, sep="\t", error_bad_lines=False, warn_bad_lines=False)
word2vec_df["item"] = list(zip(word2vec_df["track"], word2vec_df["artist"]))
word2vec_df = word2vec_df.set_index("item").drop(columns=["unique_id", "track_id", "track", "artist_id", "artist"])
w2v_cols = word2vec_df.columns.tolist()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
with open("sampled.txt", "r") as f:
    train_users = f.readlines()
    
train_users = train_users[:150]
train_users = [int(user.strip()) for user in train_users]
len(train_users)

150

In [6]:
train_user = train_users[1]
train_user

73151

In [7]:
filename = f"user_split/listening_events_2019_{train_user}.tsv"

In [8]:
!head {filename}

13432	73151	TRAP DEL TERRAPLANISMO	Jaime Altozano	Trap Del Terraplanismo	f	ES	111	2019-01-01 00:50:42
290649	73151	Bienvenido Al Desastre	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:06:11
291548	73151	Almas	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:09:53
292549	73151	Enredados	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:14:16
293272	73151	Involución	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:17:40
293947	73151	Más Que una Leyenda	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:21:02
294675	73151	Arrástrame al Infierno	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:24:30
295383	73151	Vivo De Rodillas	Megara	Siete	f	ES	111	2019-01-01 23:28:01
296016	73151	Cuenta Atrás	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:30:54
296778	73151	El Hombre de Arena	Megara	Aquí Todos Estamos Locos	f	ES	111	2019-01-01 23:34:32


In [9]:
def load_file(filename):
    col_names = ['user', 'track', 'artist', 'album', 'gender', 'country', 'age', 'timestamp']
    events = pd.read_csv(filename, names=col_names,
                        quoting=3, sep="\t", header=None, encoding='utf-8'
                        )
    events["timestamp"] = pd.to_datetime(events["timestamp"])
    return events

events = load_file(filename)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40
...,...,...,...,...,...,...,...,...
133876543,73151,Ticking Bombs,Go Betty Go,Nothing Is More,f,ES,111,2019-12-28 12:36:35
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50


In [10]:
def preprocess(events):
    events["prev_timestamp"] = events.groupby("user")["timestamp"].shift()
    events["gap"] = events["timestamp"] - events["prev_timestamp"]
    events["new_session"] = events["gap"] > pd.Timedelta("30min")
    events["new_session_int"] = events["new_session"].astype(int)
    events["session"] = events.groupby("user")["new_session_int"].cumsum()
    events["session_duration"] = events.groupby(["user", "session"])["timestamp"].transform(lambda x: x.iloc[-1] - x.iloc[0])
    events["item"] = list(zip(events["track"], events["artist"])) #, events["album"]))
    events["all_pos"] = 1
    events = events.join(word2vec_df, on="item", how="left")
    return events

events = preprocess(events)
events

Unnamed: 0,user,track,artist,album,gender,country,age,timestamp,prev_timestamp,gap,...,(91),(92),(93),(94),(95),(96),(97),(98),(99),(100)
13432,73151,TRAP DEL TERRAPLANISMO,Jaime Altozano,Trap Del Terraplanismo,f,ES,111,2019-01-01 00:50:42,NaT,NaT,...,0.612252,-1.272174,1.746467,-0.242183,-0.505445,1.674694,0.669406,-0.503224,-0.681730,-0.239689
290649,73151,Bienvenido Al Desastre,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:06:11,2019-01-01 00:50:42,0 days 22:15:29,...,,,,,,,,,,
291548,73151,Almas,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:09:53,2019-01-01 23:06:11,0 days 00:03:42,...,,,,,,,,,,
292549,73151,Enredados,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:14:16,2019-01-01 23:09:53,0 days 00:04:23,...,,,,,,,,,,
293272,73151,Involución,Megara,Aquí Todos Estamos Locos,f,ES,111,2019-01-01 23:17:40,2019-01-01 23:14:16,0 days 00:03:24,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133876543,73151,Ticking Bombs,Go Betty Go,Nothing Is More,f,ES,111,2019-12-28 12:36:35,2019-12-28 12:33:20,0 days 00:03:15,...,0.026777,0.425415,0.292512,-0.343020,-0.100077,-0.099067,-0.416285,0.420662,-0.310532,0.516959
133877677,73151,Sweet Curse,ReVamp,ReVamp,f,ES,111,2019-12-28 12:40:52,2019-12-28 12:36:35,0 days 00:04:17,...,-0.113186,-0.006809,0.164201,0.036423,-0.346895,0.240790,-0.415811,-0.101693,-0.129389,0.255491
133878597,73151,Killers Like Candy,I Am Ghost,Lovers' Requiem,f,ES,111,2019-12-28 12:44:26,2019-12-28 12:40:52,0 days 00:03:34,...,0.265503,-0.020215,-0.203386,-0.485565,-0.564553,0.339350,-0.356358,0.183125,0.366338,-0.252615
133880585,73151,Lucifer's Angel,The Rasmus,Hide From The Sun (Bonus Track Version),f,ES,111,2019-12-28 12:51:50,2019-12-28 12:44:26,0 days 00:07:24,...,0.228357,-0.067932,0.309536,-0.130858,0.560277,-0.228062,0.047298,0.079904,0.236593,0.005050


In [11]:
%run emomem_model.py
bll = BaseLevelComponent(decay=0.860)
assoc = AssociativeComponent()
valu_allpos = ValuationComponent("AllPos", reward_col="all_pos")

In [12]:
%run data_splitter.py

In [13]:
algo_list = [
    bll,
    assoc,
    valu_allpos,
]

# Hybrid Estimation

In [14]:
from scipy import special

def generate_single_reco_scores(algo, train, test, user, split_f):
    expected = test["item"].values.tolist()
    unique_test = test["item"].nunique()
    assert unique_test
    unique_test_items = test["item"].unique()
   
    start_time = datetime.now()
    res = algo.score(train)
    end_time = datetime.now()
    
    # We already normalize it here, so only factors need to be predicted
    res = special.softmax(res)

    res_df = res.to_frame(name="score")
    res_df["algo"] = str(algo)
    res_df["user"] = user
    res_df["pos"] = len(train)
    res_df["exp"] = np.where(res_df.index.isin(unique_test_items), 1, 0)
    
    assert len(res_df)
    
    return res_df
        
def generate_scores_user_df(user_df, algo, split_f):
    user = user_df.index[0]  # Assume single user only
    preds = [generate_single_reco_scores(algo, train, test, user, split_f) for train, test in split_f(user_df)]
    
    return preds

def generate_scores(algo, events, split_f):
    generate_scores_user_df_with_splitter = partial(generate_scores_user_df, algo=algo, split_f=split_f)
    
    all_preds = []
    for user in events["user"].unique().tolist():
        user_df = events[events["user"] == user].set_index("user")
        preds = generate_scores_user_df_with_splitter(user_df)
        if preds:
            all_preds.append(pd.concat(preds))
    
    if all_preds:
        pred_df = pd.concat(all_preds)
        return pred_df

In [15]:
for train_user in tqdm.tqdm(train_users[:limit_users_for_estimation]):
    print(train_user)
    filename = f"user_split/listening_events_2019_{train_user}.tsv"
    events = load_file(filename)
    events = preprocess(events)
    all_preds = []
    for algo in algo_list:
        print(f"Predictions for {algo}")
        data_splitter = DataSplitter(Slider(step=1), TrainTimeDelta(pd.Timedelta("7days")), TestRemainingSession(), ValidSessionDuration())
        pred_df = generate_scores(algo, events, data_splitter)
        if pred_df is not None:
            all_preds.append(pred_df)

  0%|          | 0/15 [00:00<?, ?it/s]

103807
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


  7%|▋         | 1/15 [01:05<15:15, 65.37s/it]

73151
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 13%|█▎        | 2/15 [05:45<28:08, 129.88s/it]

61740
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 20%|██        | 3/15 [20:31<1:11:20, 356.74s/it]

37608
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 27%|██▋       | 4/15 [33:38<1:29:01, 485.59s/it]

30387
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 33%|███▎      | 5/15 [35:31<1:02:19, 373.92s/it]

90919
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 40%|████      | 6/15 [49:10<1:16:06, 507.36s/it]

35812
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 47%|████▋     | 7/15 [55:16<1:02:01, 465.19s/it]

28952
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 53%|█████▎    | 8/15 [1:17:45<1:25:11, 730.17s/it]

94584
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 60%|██████    | 9/15 [1:19:02<53:25, 534.17s/it]  

95562
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 67%|██████▋   | 10/15 [1:23:23<37:40, 452.17s/it]

114883
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 73%|███████▎  | 11/15 [1:27:19<25:49, 387.34s/it]

37196
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 80%|████████  | 12/15 [1:29:16<15:18, 306.31s/it]

45773
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 87%|████████▋ | 13/15 [2:14:22<34:12, 1026.21s/it]

69888
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


 93%|█████████▎| 14/15 [2:27:23<15:52, 952.66s/it] 

18188
Predictions for BaseLevelComponent0.86
Predictions for AssociativeComponent
Predictions for AllPos


100%|██████████| 15/15 [2:27:53<00:00, 591.59s/it]


In [16]:
res_df = pd.concat(all_preds)
res_df[res_df["exp"] == 1]

Unnamed: 0_level_0,score,algo,user,pos,exp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(Kylän päässä, Moonsorrow)",0.032844,BaseLevelComponent0.86,18188,42,1
"(Kylän päässä, Moonsorrow)",0.030928,BaseLevelComponent0.86,18188,43,1
"(Kylän päässä, Moonsorrow)",0.029230,BaseLevelComponent0.86,18188,44,1
"(Kylän päässä, Moonsorrow)",0.027700,BaseLevelComponent0.86,18188,45,1
"(Kylän päässä, Moonsorrow)",0.026415,BaseLevelComponent0.86,18188,46,1
...,...,...,...,...,...
"(Pakanajuhla, Moonsorrow)",0.059125,AllPos,18188,33,1
"(Pakanajuhla, Moonsorrow)",0.062877,AllPos,18188,34,1
"(Pakanajuhla, Moonsorrow)",0.066038,AllPos,18188,35,1
"(Pakanajuhla, Moonsorrow)",0.068673,AllPos,18188,36,1


In [17]:
res_df.to_csv("scores.csv")

In [18]:
trans_df = res_df.set_index(["user", "pos"], append=True)
trans_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,algo,exp
item,user,pos,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(Do I Ever, Kensington)",18188,1,1.000000,BaseLevelComponent0.86,0
"(Do I Ever, Kensington)",18188,2,0.429018,BaseLevelComponent0.86,0
"(Streets, Kensington)",18188,2,0.570982,BaseLevelComponent0.86,0
"(Do I Ever, Kensington)",18188,3,0.270686,BaseLevelComponent0.86,0
"(Streets, Kensington)",18188,3,0.323065,BaseLevelComponent0.86,0
...,...,...,...,...,...
"(Ukkosenjumalan poika, Moonsorrow)",18188,37,0.036408,AllPos,0
"(Under the Sun, Korpiklaani)",18188,37,0.036408,AllPos,0
"(Unohduksen lapsi, Moonsorrow)",18188,37,0.036408,AllPos,0
"(With Trees, Korpiklaani)",18188,37,0.036408,AllPos,0


# Dups

In [19]:
# Find dups
dup_df = res_df.set_index(["user", "pos", "algo"], append=True)
dup_df[dup_df.index.duplicated(keep=False)].tail(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,score,exp
item,user,pos,algo,Unnamed: 4_level_1,Unnamed: 5_level_1
"(The Black Hammer, Brymir)",18188,48,AllPos,0.020833,0
"(For Those Who Died, Brymir)",18188,49,AllPos,0.020408,0
"(The Black Hammer, Brymir)",18188,49,AllPos,0.020408,0
"(For Those Who Died, Brymir)",18188,50,AllPos,0.02,0
"(The Black Hammer, Brymir)",18188,50,AllPos,0.02,0
"(Aurinko ja Kuu, Moonsorrow)",18188,26,AllPos,0.039724,0
"(Kylän päässä, Moonsorrow)",18188,26,AllPos,0.039724,0
"(Tyven, Moonsorrow)",18188,26,AllPos,0.039724,0
"(Aurinko ja Kuu, Moonsorrow)",18188,27,AllPos,0.039452,0
"(Kylän päässä, Moonsorrow)",18188,27,AllPos,0.039452,0


In [20]:
# fix dups
X_df = dup_df.copy()
X_df = X_df[~X_df.index.duplicated()].reset_index()

del X_df["exp"]
X_df = X_df.reset_index()

X_df = X_df.pivot(index=["item", "user", "pos"], columns="algo", values="score")
X_df = X_df.fillna(0)
X_df = X_df.sort_index()
X_df

Unnamed: 0_level_0,Unnamed: 1_level_0,algo,AllPos,AssociativeComponent,BaseLevelComponent0.86
item,user,pos,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(1.618, Allegaeon)",18188,57,0.018068,1.993313e-01,0.018526
"(1.618, Allegaeon)",18188,58,0.017747,1.124968e-07,0.017728
"(1.618, Allegaeon)",18188,59,0.017438,5.625798e-08,0.017078
"(1.618, Allegaeon)",18188,60,0.017139,3.750745e-08,0.016485
"(13, The Real McKenzies)",18188,116,0.009211,4.999999e-01,0.022881
...,...,...,...,...,...
"(바보 (Feat. Nafla), 베이빌론)",18188,137,0.012793,7.562935e-06,0.013189
"(바보 (Feat. Nafla), 베이빌론)",18188,138,0.012632,6.482971e-06,0.013110
"(바보 (Feat. Nafla), 베이빌론)",18188,139,0.012474,5.672899e-06,0.012985
"(바보 (Feat. Nafla), 베이빌론)",18188,140,0.012321,5.042783e-06,0.012632


In [21]:
y_df = trans_df["exp"]
y_df = y_df[~y_df.index.duplicated()]
y_df = y_df.sort_index()
y_df

item                      user   pos
(1.618, Allegaeon)        18188  57     0
                                 58     0
                                 59     0
                                 60     0
(13, The Real McKenzies)  18188  116    0
                                       ..
(바보 (Feat. Nafla), 베이빌론)  18188  137    0
                                 138    0
                                 139    0
                                 140    0
                                 141    0
Name: exp, Length: 24997, dtype: int64

# Estimate the parameters

In [22]:
X = X_df.values
y = y_df.values

In [23]:
fitted = LogisticRegression(fit_intercept=True).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([[0.597353  , 4.02501687, 3.62860345]])

In [24]:
# Do not fit an intercept
fitted = LogisticRegression(fit_intercept=False).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([[-57.39723739,  -6.58720228, -24.80148528]])

In [27]:
fitted = LinearRegression(fit_intercept=True, positive=False).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([-0.04674839,  0.25708863,  0.34852751])

In [25]:
fitted = LinearRegression(fit_intercept=False, positive=False).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([0.18126342, 0.2597506 , 0.34184948])

In [26]:
fitted = LinearRegression(fit_intercept=False, positive=True).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([0.18126342, 0.2597506 , 0.34184948])

In [28]:
fitted = LinearRegression(fit_intercept=True, positive=True).fit(X, y)
print(X_df.columns)
fitted.coef_

Index(['AllPos', 'AssociativeComponent', 'BaseLevelComponent0.86'], dtype='object', name='algo')


array([0.        , 0.25425998, 0.34237555])