In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

import matplotlib.pyplot as plt
%matplotlib inline

In [167]:
df = pd.read_pickle('../datasets/clean_df.csv', compression='zip')

In [168]:
df_orig = df.copy()

In [169]:
# Convert user id and song id to numerical ids
df['user_id'] = df['user'].astype('category').cat.codes
df['song_id'] = df['song'].astype('category').cat.codes

df_orig2 = df.copy()

# Create lookup frame so we can get the 'Song - Artist' later
item_lookup = df[['song_id','Song - Artist']].drop_duplicates()
item_lookup['song_id'] = item_lookup['song_id'].astype(str)

# Drop 'user' and 'song' and 'Song - Artist'
df = df.drop(['user','song','Song - Artist'], axis=1)

df = df.sort_values(by=['user_id', 'song_id']).iloc[:100_000]

# Create lists of all unique users, songs
uusers = list(np.sort(df['user_id'].unique()))
usongs = list(np.sort(df['song_id'].unique()))
nusers = len(uusers)
nsongs = len(usongs)

In [170]:
uusers_orig2 = list(np.sort(df_orig2['user_id'].unique()))
usongs_orig2 = list(np.sort(df_orig2['song_id'].unique()))
nusers_orig2 = len(uusers_orig2)
nsongs_orig2 = len(usongs_orig2)

In [171]:
df.head()

Unnamed: 0,count,user_id,song_id
513283,1,0,1812
366608,4,1,2370
296152,1,1,2818
79948,1,1,3548
728500,1,2,2097


In [172]:
print(len(uusers))
print(len(usongs))

54209
2741


### Train/Validation split

- I'm struggling to determine whether or not I should split up the training data into train/test data when I'm training my model

I don't think so

- What am I supposed to do?

I think you're just supposed to just withhold certain observations that we know are positive. As a matter of fact, each observation in BPR is treated as positive if it's present, negative if it's not. In this case, we know that all observations all positive. We can neglect song count.

- So how should this be done?

For each user that has 4 or more observations, withhold an observation from the training dataset. This observation will get treated as missing during training, and our model will attempt to fill it in. Then, on evaluation, for each user we have withheld data from, we will see if the recommender ranks the song withheld in the top 10 recommendations.


#### Validation hold-out

In [16]:
tmp_val = (
        df.groupby('user_id')['song_id'].count()
    ).reset_index().rename({'song_id':'records'}, axis=1)
tmp_val.shape

(54209, 2)

In [20]:
min_records = 3
conditions = df['user_id'].isin(tmp_val[tmp_val['records'] > min_records].user_id)
df_val = df[conditions].groupby('user_id').head(1).reset_index()
del df_val['index']
df_val.shape

(5359, 3)

In [37]:
ground_truth = { row.user_id : row.song_id for _ , row in df_val.iterrows() }

In [38]:
df_train = pd.concat([df, df_val]).drop_duplicates(keep=False)

Lookup tables to create:
- nuser_listens = \{ user: \[each song user has listened to\] \}
- nuser_recs = \{ user: \[rec songs for user\] \}

In [173]:
agged = df.groupby('user_id')['song_id'].agg(list).reset_index()

In [174]:
agged.head()

Unnamed: 0,user_id,song_id
0,0,[1812]
1,1,"[2370, 2818, 3548]"
2,2,[2097]
3,3,[1102]
4,4,"[366, 1894, 2580]"


In [175]:
nuser_listens = {row['user_id'] : row['song_id'] for _ , row in agged.iterrows()}

## Building Triplets

In [40]:
%%time

data = []
n_random_samples = 2

uusers_train = list(df_train['user_id'].unique())

for user in tqdm( uusers_train ):
    listened = df_train[df_train['user_id'] == user]['song_id'].values
    
    for i in listened:
        cnt = 0
        while ( cnt < n_random_samples ):
            j = df_train.sample(1).iloc[0,2]
            if ( j not in listened ):
                data.append({
                    'u' : user,
                    'i' : i,
                    'j' : j
                })
                cnt += 1

ttriplets = pd.DataFrame(data, columns=['u', 'i', 'j'])

100%|██████████| 54209/54209 [09:30<00:00, 95.01it/s] 


CPU times: user 8min 32s, sys: 14.2 s, total: 8min 46s
Wall time: 9min 30s


### Build model

In [63]:
@tf.function
def identity_loss(_, y_pred):
    return tf.math.reduce_mean(y_pred)

@tf.function
def triplet_loss(X: dict):
    i_latent , j_latent , u_latent = X
    pos_interactions = tf.math.reduce_sum(tf.math.multiply(u_latent, i_latent), axis=-1, keepdims=True)
    neg_interactions = tf.math.reduce_sum(tf.math.multiply(u_latent, j_latent), axis=-1, keepdims=True)
    return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(pos_interactions, neg_interactions)))

def out_shape(shapes):
    return shapes[0]

In [54]:
def build_model(num_users: int, num_items: int, latent_dim: int) -> Model:
    u_input = Input((1,), name='user_input')
    i_input = Input((1,), name='positive_item_input')
    j_input = Input((1,), name='negative_item_input')
    
    item_emb = Embedding(num_items, latent_dim, name='item_embedding', input_length=1)
    i_emb = Flatten()( item_emb(i_input) )
    j_emb = Flatten()( item_emb(j_input) )
    
    u_emb = Embedding(num_users, latent_dim, name='user_embedding', input_length=1)(u_input)
    u_emb = Flatten()(u_emb)
    
    t_loss = Lambda(triplet_loss, output_shape=out_shape, name='triplet_loss')([i_emb, j_emb, u_emb])
    
    model = Model(inputs=[i_input, j_input, u_input], outputs=t_loss)
    
    return model

In [73]:
#---------------
#  HYPERPARAMS
#---------------
latent_dim = 350
batch_size = 512
epochs = 1
lr = 0.01

In [74]:
model = build_model(nusers, nsongs, latent_dim)
model.compile(loss=identity_loss, optimizer=Adam(learning_rate=lr))
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
positive_item_input (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
negative_item_input (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 350)       1286250     positive_item_input[0][0]        
                                                                 negative_item_input[0][0]  

In [75]:
%%time

X = {
    'user_input': tf.convert_to_tensor(ttriplets.u),
    'positive_item_input': tf.convert_to_tensor(ttriplets.i),
    'negative_item_input': tf.convert_to_tensor(ttriplets.j)
}

hist = model.fit(X, 
                 tf.ones(ttriplets.shape[0]),
                 batch_size=batch_size, 
                 verbose=1, 
                 epochs=epochs)

CPU times: user 18min 12s, sys: 14min 53s, total: 33min 6s
Wall time: 18min 21s


### Evaluation

In [211]:
def get_weights(model: Model, song_ids: list):
    u = model.get_layer('user_embedding').get_weights()[0]
    i = model.get_layer('item_embedding').get_weights()[0][song_ids]
    return u, i

def get_topk(model: Model, user: int, usongs=usongs, weights=weights, k=10):
    user_weights, item_weights = weights
    
    user_vector = user_weights[user]
    item_matrix = item_weights
    
    scores = np.dot(user_vector, item_matrix.T)
    
    predictions = dict(zip(usongs,scores))
    predictions = sorted(predictions.items(), key=lambda kv:kv[1], reverse=True)[:k]
    predictions = list(OrderedDict(predictions).keys())
    
    return predictions

In [229]:
def get_hits(user_listens, user_recs):
    hits = 0
    for song in user_listens:
        if ( song in user_recs ):
            hits += 1
    return hits

def get_metrics(uusers: list, nuser_listens: dict, nuser_recs: dict, N: int, k=10):
    nusers = len(uusers)
    precision = recall = t_hits = 0
    
    for user in tqdm(uusers, desc='Getting metrics... '):
        hits = get_hits(user_listens=nuser_listens[user], user_recs=nuser_recs[user])
        precision += hits / len(nuser_listens[user])
        recall += hits / k
        t_hits += hits
                                
    precision = precision / nusers
    recall = recall / nusers
    hit_ratio = t_hits / N
    
    return precision, recall, hit_ratio

def hold_out_hit_ratio(ground_truth: dict, nuser_recs: dict):
    hits = 0
    for _ , (user , song) in enumerate(ground_truth.items()):
        if song in nuser_recs[user]:
            hits += 0
    return hits / len(ground_truth)

In [231]:
def evaluate(model: Model, uusers: list, usongs: list, nuser_listens: dict, N: int, ground_truth, k=10):
    weights = get_weights(model, usongs)
    nuser_recs = {user : get_topk(model, user) for user in tqdm(uusers, desc='Generating recommendations for all users... ')}
    p , r , hr = get_metrics(uusers, nuser_listens, nuser_recs, N)
    hoh = eval_hold_out(ground_truth, nuser_recs)
        
    evals = {
        'precision' : p,
        'recall' : r,
        'hit_ratio' : hr,
        'hold_out_hits': hoh
    }
    return evals

In [232]:
evals = evaluate(model, uusers, usongs, nuser_listens, df_train.shape[0])

Generating recommendations for all users... : 100%|██████████| 54209/54209 [02:17<00:00, 394.13it/s]
Getting metrics... : 100%|██████████| 54209/54209 [00:02<00:00, 26430.77it/s]


AttributeError: 'NoneType' object has no attribute 'items'

In [145]:
weights = get_weights(model, usongs)
preed = get_topk(model, 12)

In [204]:
nuser_recs = {user : get_topk(model, user) for user in tqdm(uusers, desc='Generating recommendations for all users... ')}

Generating recommendations for all users... : 100%|██████████| 54209/54209 [02:14<00:00, 402.22it/s]


In [201]:
get_metrics(uusers, nuser_listens, nuser_recs, N=df_train.shape[0])

Getting metrics... : 100%|██████████| 54209/54209 [00:01<00:00, 30104.00it/s]


(0.002339740891837189, 0.00043166263904517813, 0.002472501347196247)

Lookup tables to create:
- nuser_listens = \{ user: \[each song user has listened to\] \}
- nuser_recs = \{ user: \[rec songs for user\] \}

In [227]:
e = eval_hold_out(ground_truth, nuser_recs)

In [None]:
def recall(uusers, nuser_listens, nuser_recs, k=10):
    # for each user in the dataset
        # for each of the top 10 recs
                # if rec is a hit, add 1
            # user_recall = number of hits / 10
            # total_recall += user_recall
        # total_recall = total_recall / nusers
    recall = 0
    for user in uusers:
        recall += get_hits(nuser_listens[user], nuser_recs[user]) / k
    return recall / len(uusers)
    
def user_recall(user_listens, user_recs, k=10):
    hits = 0
    for song in user_listens:
        if song is in user_recs:
            hits += 1
    return hits / k

def get_hits(user_listens, user_recs):
    for song in user_listens:
        if song is in user_recs:
            hits += 1
    return hits

In [None]:
def precision():
    precision = 0
    for user in uusers:
        precision += get_hits(nuser_listens[user], nuser_recs[user]) / len(nuser_listens[user])
    return precision / len(uusers)
    # for each user in the dataset
        # for each song the user listened to
            # if the song is in the top 10 recs, add 1
        # user_precision = number of hits / number of songs the user listened to
        # total_precision += user_precision
    # total_precision = total_precision / nusers

In [None]:
def hit_ratio():
    hits = 0
    for user in uusers:
        hits += get_hits(nuser_listens[user], nuser_recs[user])
    hit_ratio = hits / N
    # for each user in the dataset
        # for each song the user listened to
            # if the song is in the top 10 recs, add 1
    # divide total by number of observations 

In [146]:
preed

[370, 868, 3374, 1192, 2529, 1252, 2738, 1036, 201, 157]

In [137]:
user_weights , item_weights = get_weights(model, usongs)

In [138]:
scores = get_scores(model, 12, usongs, user_weights, item_weights)

In [139]:
preds = k_predictions(scores)

In [140]:
preds

[370, 868, 3374, 1192, 2529, 1252, 2738, 1036, 201, 157]

In [119]:
scores = get_scores(model, 
                    12, 
                    usongs, 
                    user_weights=user_weights,
                    item_weights=item_weights)

In [120]:
predictions = dict(zip(usongs, scores))
predictions = sorted(predictions.items(), key=lambda kv: kv[1], reverse=True)[:10]
predictions

[(370, 9.333147),
 (868, 7.260547),
 (3374, 5.424232),
 (1192, 5.2523985),
 (2529, 4.787279),
 (1252, 3.287215),
 (2738, 3.0342348),
 (1036, 2.930763),
 (201, 2.80232),
 (157, 2.6922357)]

In [121]:
predictions = list(OrderedDict(predictions).keys())
predictions

[370, 868, 3374, 1192, 2529, 1252, 2738, 1036, 201, 157]

In [112]:
pred = {user:conf for user, conf in predictions}

In [113]:
pred[114]

0.09110445

In [104]:
predictions = list(OrderedDict(predictions).keys())

In [105]:
predictions

[370,
 868,
 3374,
 1192,
 2529,
 1252,
 2738,
 1036,
 201,
 157,
 1853,
 315,
 2313,
 399,
 2551,
 2050,
 92,
 2769,
 2353,
 449,
 724,
 3520,
 376,
 3561,
 112,
 2629,
 435,
 3015,
 365,
 434,
 3263,
 2859,
 1233,
 3410,
 2443,
 3443,
 1140,
 1594,
 1767,
 757,
 1146,
 3172,
 2865,
 1304,
 2164,
 3440,
 1098,
 1698,
 150,
 2608,
 3591,
 1049,
 570,
 1117,
 3399,
 1100,
 3007,
 2725,
 3265,
 1414,
 3118,
 1011,
 189,
 2761,
 3564,
 994,
 1313,
 562,
 311,
 3464,
 3202,
 2119,
 1367,
 1821,
 1161,
 3077,
 2367,
 849,
 2986,
 2144,
 2306,
 639,
 218,
 1707,
 2502,
 3084,
 1360,
 860,
 551,
 1835,
 2433,
 910,
 144,
 2544,
 2385,
 238,
 2898,
 1195,
 1715,
 1389,
 2727,
 1947,
 3517,
 3565,
 1446,
 890,
 1452,
 2205,
 180,
 3091,
 64,
 1156,
 1226,
 1166,
 1063,
 517,
 1065,
 1111,
 1938,
 2916,
 541,
 3199,
 2112,
 3292,
 802,
 2795,
 2308,
 3110,
 3546,
 2312,
 2532,
 2317,
 1246,
 223,
 1251,
 2741,
 2968,
 1027,
 1351,
 1790,
 1930,
 504,
 1122,
 3631,
 3213,
 300,
 1319,
 1474,
 969

- Calculate hit ratio, precision, recall for training data
- For val holdouts - determine number it correctly recommended in top 10