# BPR Triplet on ML-1M in TensorFlow

In [None]:
!pip install -q tensorflow_addons

[?25l[K     |▎                               | 10 kB 21.1 MB/s eta 0:00:01[K     |▋                               | 20 kB 27.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 32.1 MB/s eta 0:00:01[K     |█▏                              | 40 kB 34.7 MB/s eta 0:00:01[K     |█▌                              | 51 kB 33.3 MB/s eta 0:00:01[K     |█▊                              | 61 kB 34.2 MB/s eta 0:00:01[K     |██                              | 71 kB 26.7 MB/s eta 0:00:01[K     |██▍                             | 81 kB 25.9 MB/s eta 0:00:01[K     |██▋                             | 92 kB 26.5 MB/s eta 0:00:01[K     |███                             | 102 kB 28.4 MB/s eta 0:00:01[K     |███▎                            | 112 kB 28.4 MB/s eta 0:00:01[K     |███▌                            | 122 kB 28.4 MB/s eta 0:00:01[K     |███▉                            | 133 kB 28.4 MB/s eta 0:00:01[K     |████▏                           | 143 kB 28.4 MB/s eta 0:

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

import os
import heapq  
import gc
from tqdm import tqdm
import random

from tensorflow import keras
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply, Dot
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Seed every random source used below so repeated runs draw the same samples.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# GPUs visible to TensorFlow; the next cell configures their memory behaviour.
gpus = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Memory growth has to be configured the same way on every visible GPU,
# so apply it to the whole list rather than only to gpus[0].
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before the GPUs are initialised; report
        # the problem and carry on with TensorFlow's default allocation.
        print(e)

### Utils

In [None]:
def mish(x):
    """Mish activation, ``x * tanh(softplus(x))``, applied element-wise."""
    softplus_x = tf.math.softplus(x)
    return x * tf.math.tanh(softplus_x)

def leakyrelu(x, factor=0.2):
    """Leaky ReLU written as the element-wise maximum of ``x`` and ``factor * x``."""
    scaled = factor * x
    return tf.maximum(x, scaled)

In [None]:
def load_data(filepath, sep="::"):
    """Load a MovieLens-style ratings file as implicit-feedback interactions.

    The file is expected to have no header and four columns per record:
    user id, movie id, rating, timestamp.

    Parameters
    ----------
    filepath : str or file-like
        Anything accepted by ``pandas.read_csv``.
    sep : str, default "::"
        Field separator. The default matches the original ``ratings.dat``
        layout; pass ``"\t"`` for tab-separated copies of the data.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``movieId`` and ``rating`` where

        * ``userId`` / ``movieId`` are re-coded to contiguous integer codes
          starting at 0 (assigned in sorted order of the original ids), and
        * ``rating`` is the constant ``1.0`` - every observed rating counts as
          a positive interaction, whatever its original value.
    """
    ratings = pd.read_csv(filepath,
                          sep=sep,
                          header=None,
                          engine='python',
                          names=['userId', 'movieId', 'rating', 'time'])

    # Only the ids are kept; the timestamp is unused and the rating value is
    # replaced wholesale below, so neither needs converting.
    interactions = ratings[['userId', 'movieId']].astype(int)

    # Contiguous 0..n-1 codes, so ids can be used directly as embedding indices.
    for column in ('movieId', 'userId'):
        interactions[column] = interactions[column].astype('category').cat.codes

    interactions['rating'] = 1.

    return interactions[['userId', 'movieId', 'rating']]


def make_triplet(df):
    """Attach one randomly sampled negative item to every interaction row.

    For each user, negatives are drawn uniformly with replacement (through
    ``np.random.choice``) from the items present in ``df['movieId']`` that the
    user has no row for.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least ``userId`` and ``movieId`` columns. Any
        index is accepted; rows are addressed by position internally.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra integer ``negative`` column. ``df``
        itself is left unmodified.

    Raises
    ------
    ValueError
        If some user has interacted with every item, leaving nothing to sample.
    """
    user_col = df['userId'].to_numpy()
    item_col = df['movieId'].to_numpy()
    # Loop-invariant: the full catalogue only needs to be built once.
    catalogue = set(df['movieId'].unique())

    negs = np.zeros(len(df), dtype=int)

    # Group row *positions* (not index labels) by user in a single pass, in
    # order of first appearance. Using positions keeps the write into `negs`
    # correct even when df does not carry a default 0..n-1 index.
    rows_by_user = pd.Series(np.arange(len(df))).groupby(user_col, sort=False)
    for u, rows in tqdm(rows_by_user):
        rows = rows.to_numpy()
        available_negative = list(catalogue - set(item_col[rows]))
        if not available_negative:
            raise ValueError(
                f"user {u!r} has interacted with every item; no negative can be sampled")
        negs[rows] = np.random.choice(available_negative, len(rows), replace=True)

    df_ = df.copy()
    df_['negative'] = negs

    return df_


def extract_from_df(df, n_positive, n_negative):
    """Sample, user by user, the index labels of rows to hold out.

    For every user (in order of first appearance) ``n_positive`` rows with
    ``rating == 1`` and ``n_negative`` rows with ``rating == 0`` are drawn
    without replacement via ``np.random.choice``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userId`` and ``rating`` columns.
    n_positive : int
        Number of ``rating == 1`` rows to draw per user.
    n_negative : int
        Number of ``rating == 0`` rows to draw per user.

    Returns
    -------
    list
        Index labels of the selected rows: for each user the positives first,
        then the negatives.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when a user has fewer matching
        rows than requested.
    """
    rtd = []

    # One pass over the per-user groups instead of re-filtering the whole
    # frame twice for every user; no copy of df is needed as nothing is mutated.
    for _, rows in tqdm(df.groupby('userId', sort=False)):
        is_positive = (rows['rating'] == 1).to_numpy()
        is_negative = (rows['rating'] == 0).to_numpy()
        rtd += list(np.random.choice(rows.index[is_positive], n_positive, replace=False))
        rtd += list(np.random.choice(rows.index[is_negative], n_negative, replace=False))

    return rtd

### Model

In [None]:
class BPR_Triplet(keras.Model):
    """Matrix-factorisation scorer trained with the BPR pairwise ranking loss.

    A user and an item are each embedded into ``latent_dim`` dimensions and
    scored by the dot product of the two embeddings. Training consumes
    (user, positive item, negative item) triplets and pushes the positive
    score above the negative one.

    Parameters
    ----------
    u_dim : int
        Number of rows in the user embedding table.
    i_dim : int
        Number of rows in the item embedding table.
    latent_dim : int
        Width of both embeddings.
    """

    def __init__(self, u_dim, i_dim, latent_dim):
        super(BPR_Triplet, self).__init__()

        self.u_dim = u_dim
        self.i_dim = i_dim
        self.latent_dim = latent_dim

        # Inner functional model that maps [user, item] -> score.
        self.model = self.build_model()

    def compile(self, optim):
        """Register the optimizer used by ``train_step``.

        The loss is computed inside ``train_step``, so nothing else is passed
        on to the base ``compile``.
        """
        super(BPR_Triplet, self).compile()
        self.optim = optim

    def build_model(self):
        """Build the [user, item] -> dot-product score model."""
        u_input = Input(shape=(1, ))
        i_input = Input(shape=(1, ))

        # Embedding yields (batch, 1, latent_dim); Flatten drops the length-1 axis.
        u_emb = Flatten()(Embedding(self.u_dim, self.latent_dim, input_length=u_input.shape[1])(u_input))
        i_emb = Flatten()(Embedding(self.i_dim, self.latent_dim, input_length=i_input.shape[1])(i_input))

        # Dot product over the latent axis: one score per (user, item) pair.
        mul = Dot(1)([u_emb, i_emb])

        return Model([u_input, i_input], mul)

    def train_step(self, data):
        """Run one optimisation step on a batch of triplets.

        ``data[0]`` is unpacked into the user, positive-item and negative-item
        batches (the notebook calls ``fit`` with inputs only, no targets).
        Returns ``{'loss': loss}`` for the batch.
        """
        user, pos, neg = data[0]

        with tf.GradientTape() as tape:
            pos_d = self.model([user, pos])
            neg_d = self.model([user, neg])

            # BPR loss: -mean(log sigmoid(pos - neg)). log_sigmoid is used in
            # place of log(sigmoid(.)) because the latter underflows to
            # log(0) = -inf when the margin is strongly negative.
            loss = -tf.reduce_mean(tf.math.log_sigmoid(pos_d - neg_d))

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optim.apply_gradients(zip(grads, self.model.trainable_weights))

        return {'loss': loss}

    def call(self, data):
        """Score (user, item) pairs; ``data`` is the pair ``(user, item)``."""
        user, item = data
        return self.model([user, item])

### Data Loading

In [None]:
# Download the MovieLens-1M item and ratings files from the RecoHut-Datasets
# GitHub repository into the working directory.
# NOTE(review): movies.dat is not read by any cell visible in this notebook.
!wget -q --show-progress -O movies.dat https://github.com/RecoHut-Datasets/movielens_1m/raw/main/ml1m_items.dat
!wget -q --show-progress -O ratings.dat https://github.com/RecoHut-Datasets/movielens_1m/raw/main/ml1m_ratings.dat



In [None]:
# Ratings file: tab-separated, no header, one (user, movie, rating, timestamp)
# record per line.
df = pd.read_csv('ratings.dat',
                     sep="\t",
                     header=None,
                     engine='python',
                     names=['userId', 'movieId', 'rating', 'time'])

# Implicit feedback: every observed rating is a positive interaction. The
# hold-out and evaluation code below selects positives with `rating == 1`,
# which on the raw 1-5 scale would pick only one-star ratings.
df['rating'] = 1.

# Re-code ids to contiguous 0..n-1 integers so they are valid row indices for
# embedding tables sized by the number of distinct users / movies.
df['userId'] = df['userId'].astype('category').cat.codes.astype(int)
df['movieId'] = df['movieId'].astype('category').cat.codes.astype(int)

df.head()

Unnamed: 0,userId,movieId,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Distinct user ids and distinct movie ids found in the ratings.
uuid, uiid = df['userId'].unique(), df['movieId'].unique()

### Data Preparation

In [None]:
# Add a sampled `negative` item to every interaction so that each row can be
# read as a triplet: [user_id, positive_item_id, negative_item_id].
df = make_triplet(df)

100%|██████████| 6040/6040 [00:32<00:00, 184.06it/s]


In [None]:
# Preview the first rows, now including the sampled `negative` column.
df.head()

Unnamed: 0,userId,movieId,rating,time,negative
0,1,1193,5,978300760,3460
1,1,661,3,978302109,3803
2,1,914,3,978301968,937
3,1,3408,4,978300275,1426
4,1,2355,5,978824291,1248


In [None]:
# Hold out one randomly chosen positive row per user (and no negative rows);
# `rtd` collects the index labels of those rows.
rtd = extract_from_df(df, n_positive=1, n_negative=0)

100%|██████████| 6040/6040 [00:57<00:00, 105.20it/s]


In [None]:
# Rows listed in `rtd` form the test set; every other row is used for training.
test = df.loc[rtd]
train = df.drop(index=rtd)

# Training inputs in the order train_step unpacks them:
# user, positive item, negative item.
tr_X = [train[column].values for column in ('userId', 'movieId', 'negative')]

### BPR Triplet model

In [None]:
# Size each embedding table from the largest id present (max + 1) instead of
# the number of distinct ids, so every id is a valid row even when ids are not
# contiguous 0..n-1 codes. For contiguous codes the two sizes are identical.
# Negative items are drawn from `movieId`, so the item table covers them too.
n_users = int(df['userId'].max()) + 1
n_items = int(df['movieId'].max()) + 1

bpr = BPR_Triplet(n_users, n_items, 32)
bpr.compile(optim=optimizers.Adam())
bpr.fit(tr_X, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4b4d3d7310>

### Evaluate

In [None]:
def eval_hit(model, test, user_id, item_ids, top_k):
    """Hit@k for one user under sampled-negative evaluation.

    The user's held-out item is ranked against up to 99 randomly chosen
    candidates the user has no positive row for and that were not used as the
    user's training negatives.

    Parameters
    ----------
    model : object with ``predict([users, items])``
        Returns one score per (user, item) row.
    test : pandas.DataFrame
        Held-out rows; the first ``movieId`` found for ``user_id`` is the target.
    user_id : int
        User to evaluate.
    item_ids : iterable
        Full item catalogue to sample candidates from.
    top_k : int
        Cut-off of the ranked list.

    Returns
    -------
    int
        1 if the held-out item is among the ``top_k`` highest scores, else 0.

    Notes
    -----
    Reads the notebook-level frame ``df`` (columns ``userId``, ``movieId``,
    ``rating``, ``negative``) to find the items to exclude.
    """
    user_rows = df['userId'] == user_id
    seen = set(df.loc[user_rows & (df['rating'] == 1), 'movieId'].values)
    # Negatives already shown to the model for this user are excluded as well.
    train_negatives = set(df.loc[user_rows, 'negative'].values)

    items = list(set(item_ids) - seen - train_negatives)
    np.random.shuffle(items)
    items = items[:99]

    target = test.loc[test['userId'] == user_id, 'movieId'].values[0]
    items.append(target)

    items = np.array(items).reshape(-1, 1)
    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = model.predict([user, items]).flatten()
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_items = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)

    return 1 if target in top_items else 0

def eval_hit_wrapper(model, test, item_ids, top_k):
    """Return a one-argument function ``user_id -> eval_hit(...)`` with the
    model, test frame, catalogue and cut-off already bound."""
    return lambda user_id: eval_hit(model, test, user_id, item_ids, top_k)

def eval_NDCG(model, test, user_id, item_ids, top_k):
    """NDCG@k for one user under sampled-negative evaluation.

    The user's held-out item is ranked against up to 99 randomly chosen
    candidates the user has no positive row for and that were not used as the
    user's training negatives. With a single relevant item the ideal DCG is 1,
    so NDCG reduces to ``1 / log2(rank + 1)`` for a hit at 1-based ``rank``.

    Parameters
    ----------
    model : object with ``predict([users, items])``
        Returns one score per (user, item) row.
    test : pandas.DataFrame
        Held-out rows; the first ``movieId`` found for ``user_id`` is the target.
    user_id : int
        User to evaluate.
    item_ids : iterable
        Full item catalogue to sample candidates from.
    top_k : int
        Cut-off of the ranked list.

    Returns
    -------
    float or int
        ``log(2) / log(rank + 1)`` when the held-out item is ranked within the
        ``top_k`` (1.0 at rank 1), otherwise 0.

    Notes
    -----
    Reads the notebook-level frame ``df`` (columns ``userId``, ``movieId``,
    ``rating``, ``negative``) to find the items to exclude.
    """
    user_rows = df['userId'] == user_id
    seen = set(df.loc[user_rows & (df['rating'] == 1), 'movieId'].values)
    # Negatives already shown to the model for this user are excluded as well.
    train_negatives = set(df.loc[user_rows, 'negative'].values)

    items = list(set(item_ids) - seen - train_negatives)
    np.random.shuffle(items)
    items = items[:99]

    target = test.loc[test['userId'] == user_id, 'movieId'].values[0]
    items.append(target)

    items = np.array(items).reshape(-1, 1)
    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = model.predict([user, items]).flatten()
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_items = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)

    for rank, item in enumerate(top_items, 1):
        if item == target:
            # rank is 1-based: rank 1 -> log(2)/log(2) = 1.
            return np.log(2) / np.log(rank + 1)
    return 0

def eval_NDCG_wrapper(model, test, item_ids, top_k):
    """Return a one-argument function ``user_id -> eval_NDCG(...)`` with the
    model, test frame, catalogue and cut-off already bound."""
    return lambda user_id: eval_NDCG(model, test, user_id, item_ids, top_k)

In [None]:
# Hit ratio @10: share of users whose held-out item is ranked in the top 10.
hit_at_10 = eval_hit_wrapper(bpr, test, uiid, 10)
hr10 = [hit_at_10(user) for user in uuid]
sum(hr10) / len(hr10)

0.7102649006622517

In [None]:
# NDCG @10 averaged over all users.
ndcg_at_10 = eval_NDCG_wrapper(bpr, test, uiid, 10)
ndcg10 = [ndcg_at_10(user) for user in uuid]
sum(ndcg10) / len(ndcg10)

0.35744256274051966

---

In [None]:
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-11-28 14:52:45

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy            : 1.19.5
matplotlib       : 3.2.2
IPython          : 5.5.0
seaborn          : 0.11.2
tensorflow       : 2.7.0
keras            : 2.7.0
scipy            : 1.4.1
tensorflow_addons: 0.15.0
pandas           : 1.1.5



---

**END**