In [4]:
import numpy as np
import pandas as pd
import scipy as sp
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_pickle('../datasets/clean_df.csv', compression='zip')

In [3]:
# Convert user id and song id to numerical ids
df['user_id'] = df['user'].astype('category').cat.codes
df['song_id'] = df['song'].astype('category').cat.codes

df_orig = df.copy()

# Create lookup frame so we can get the 'Song - Artist' later
item_lookup = df[['song_id','Song - Artist']].drop_duplicates()
item_lookup['song_id'] = item_lookup['song_id'].astype(str)

# Drop 'user' and 'song' and 'Song - Artist'
df = df.drop(['user','song','Song - Artist'], axis=1)

# Create lists of all users, songs, and counts
users = list(np.sort(df['user_id'].unique()))
songs = list(np.sort(df['song_id'].unique()))
play_counts = list(np.sort(df['count']))

### Train/test split

In [5]:
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)

(618128, 3)
(154533, 3)


### Building triplets

In [67]:
%%time

data = []
n_random_samples = 2

uusers_train = list(X_train['user_id'].unique())

for user in tqdm( uusers_train ):
#     if ( iterations % 10_000 == 0 ):
#         print(f'{iterations} users processed.')
    
    listened = X_train[X_train['user_id'] == user]['song_id'].values
    
    for i in listened:
        cnt = 0
        while ( cnt < n_random_samples ):
            j = X_train.sample(1).iloc[0,2]
            if ( j not in listened ):
                data.append({
                    'u' : user,
                    'i' : i,
                    'j' : j
                })
                cnt += 1

ttriplets = pd.DataFrame(data, columns=['u', 'i', 'j'])

100%|██████████| 365111/365111 [10:41:07<00:00,  9.49it/s]      


CPU times: user 5h 32min 9s, sys: 9min 31s, total: 5h 41min 41s
Wall time: 10h 41min 11s


In [120]:
ttriplets.to_pickle('../datasets/ttriplets01.csv', compression='zip')

### BPR NN

In [69]:
nusers = len(users)
nsongs = len(songs)

In [70]:
@tf.function
def identity_loss(_, y_pred):
    return tf.math.reduce_mean(y_pred)

@tf.function
def bpr_triplet_loss(X: dict):
    """
    Calculate triplet loss - as higher the difference between positive interactions
    and negative interactions as better

    :param X: X contains the user input, positive item input, negative item input
    :return:
    """
    positive_item_latent, negative_item_latent, user_latent = X

    positive_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, positive_item_latent), axis=-1, keepdims=True)
    negative_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, negative_item_latent), axis=-1, keepdims=True)

    return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(positive_interactions, negative_interactions)))

In [71]:
def out_shape(shapes):
    return shapes[0]
    
def build_model(num_users: int, num_items: int, latent_dim: int) -> Model:
    """
    Build a model for Bayesian personalized ranking

    :param num_users: a number of the unique users
    :param num_items: a number of the unique movies
    :param latent_dim: vector length for the latent representation
    :return: Model
    """
    user_input = Input((1,), name='user_input')

    positive_item_input = Input((1,), name='positive_item_input')
    negative_item_input = Input((1,), name='negative_item_input')
    # One embedding layer is shared between positive and negative items
    item_embedding_layer = Embedding(num_items, latent_dim, name='item_embedding', input_length=1)

    positive_item_embedding = Flatten()(item_embedding_layer(positive_item_input))
    negative_item_embedding = Flatten()(item_embedding_layer(negative_item_input))

    user_embedding = Embedding(num_users, latent_dim, name='user_embedding', input_length=1)(user_input)
    user_embedding = Flatten()(user_embedding)

    triplet_loss = Lambda(bpr_triplet_loss, output_shape=out_shape)([positive_item_embedding,
                                                             negative_item_embedding,
                                                             user_embedding])

    model = Model(inputs=[positive_item_input, negative_item_input, user_input], outputs=triplet_loss)

    return model

In [72]:
def bpr_predict(model: Model, user_id: int, song_ids: list, user_layer='user_embedding', item_layer='item_embedding'):
    user_vector = model.get_layer(user_layer).get_weights()[0][user_id]
    item_matrix = model.get_layer(item_layer).get_weights()[0][song_ids]
    
    scores = np.dot(user_vector, item_matrix.T)
    
    return scores

In [80]:
#---------------
#  HYPERPARAMS
#---------------
latent_dim = 350
batch_size = 256
epochs = 3
lr = 0.01

In [82]:
model = build_model(nusers, nsongs, latent_dim=latent_dim)
model.compile(loss=identity_loss, optimizer=Adam(learning_rate=lr))

In [83]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
positive_item_input (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
negative_item_input (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 350)       1286250     positive_item_input[0][0]        
                                                                 negative_item_input[0][0]  

In [84]:
ttriplets.shape

(1236256, 3)

In [87]:
%%time

sample_size = 10_000

striplets = ttriplets.iloc[:sample_size]

X = {
    'user_input': tf.convert_to_tensor(striplets['u']),
    'positive_item_input': tf.convert_to_tensor(striplets['i']),
    'negative_item_input': tf.convert_to_tensor(striplets['j'])
}

hist = model.fit(X, tf.ones(striplets.shape[0]), batch_size=batch_size,verbose=1,epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 5min 51s, sys: 4min 59s, total: 10min 50s
Wall time: 6min 10s


In [105]:
uweights = model.get_layer('user_embedding').get_weights()
iweights = model.get_layer('item_embedding').get_weights()

CPU times: user 502 ms, sys: 1.25 s, total: 1.75 s
Wall time: 2.52 s


In [103]:
iweights[0][songs].T.shape

(350, 3675)

In [106]:
%%time
dot = np.dot(uweights[0][0], iweights[0][songs].T)

CPU times: user 3.4 ms, sys: 3.71 ms, total: 7.11 ms
Wall time: 5.63 ms


In [118]:
grnd = np.zeros(nsongs, dtype=np.int32)

In [119]:
grnd

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [112]:
np.sort(dot, ascending=False)

TypeError: _sort_dispatcher() got an unexpected keyword argument 'ascending'

In [116]:
np.sort(dot)[::-1][:10]

array([0.2685007 , 0.26486716, 0.25799644, 0.25115645, 0.24870834,
       0.23784854, 0.22005787, 0.20737101, 0.20207833, 0.1917183 ],
      dtype=float32)