# CUHK [STAT3009](https://www.bendai.org/STAT3009/) Notebook10(b): Top-K recommendation via AdaRank

## - Part 1: report `NDCG` and `pairwise ranking 0-1 loss (PR_loss)` for existing methods

Note that for `NDCG` larger values are better, whereas for `PR_loss` smaller values are better.

## Define two new evaluation metrics: `NDCG` and `PR_loss`

In [1]:
from sklearn.metrics import ndcg_score
import itertools

def ndcg_rs(test_pair, true_rating, pred_rating, k=10):
    """Average per-user NDCG@k over the users that appear in ``test_pair``.

    Parameters
    ----------
    test_pair : array-like of shape (n_records, 2)
        Only column 0 (the user id) is read; it is used to group the records.
    true_rating : array-like of shape (n_records,)
        Observed ratings, aligned row by row with ``test_pair``.
    pred_rating : array-like of shape (n_records,)
        Predicted scores, aligned row by row with ``test_pair``.
    k : int, default=10
        Truncation level forwarded to ``sklearn.metrics.ndcg_score``.

    Returns
    -------
    float
        Mean of the per-user NDCG values; ``nan`` when there are no records.

    Notes
    -----
    Records are grouped by the *value* of the user id, so ids do not have to
    be the contiguous range ``0..n-1``.  A user with a single test item is
    scored without calling ``ndcg_score``: 1.0 if the true rating is positive,
    0.0 otherwise (the "ideal DCG is zero" convention).
    """
    users = np.asarray(test_pair)[:, 0]
    true_rating = np.asarray(true_rating)
    pred_rating = np.asarray(pred_rating)

    # Group record positions by user with one stable sort instead of one
    # full scan of the array per user.
    order = np.argsort(users, kind='stable')
    _, starts = np.unique(users[order], return_index=True)

    ndcg = []
    for idx in np.split(order, starts[1:]):
        if idx.size == 0:
            # only happens when there are no records at all
            continue
        if idx.size == 1:
            # A single item can only be ranked one way.  Scored by hand because
            # ndcg_score may refuse single-document input (version dependent).
            ndcg.append(float(true_rating[idx[0]] > 0))
            continue
        ndcg.append(ndcg_score([true_rating[idx]], [pred_rating[idx]], k=k))

    if not ndcg:
        return float('nan')
    return np.mean(ndcg)

def PR_loss(test_pair, true_rating, pred_rating):
    """Pairwise ranking 0-1 loss, pooled over all users.

    For every user, every unordered pair of that user's records whose true
    ratings differ is a comparison.  A comparison counts as an error when the
    predicted difference does not have the same strict sign as the true
    difference (ties in the prediction count as errors).

    Parameters
    ----------
    test_pair : array-like of shape (n_records, 2)
        Only column 0 (the user id) is read; it is used to group the records.
    true_rating : array-like of shape (n_records,)
        Observed ratings, aligned row by row with ``test_pair``.
    pred_rating : array-like of shape (n_records,)
        Predicted scores, aligned row by row with ``test_pair``.

    Returns
    -------
    float
        Number of mis-ordered comparisons divided by the number of
        comparisons; ``nan`` when no user has two records with different
        true ratings.

    Notes
    -----
    Records are grouped by the *value* of the user id, so ids do not have to
    be the contiguous range ``0..n-1``.
    """
    users = np.asarray(test_pair)[:, 0]
    true_rating = np.asarray(true_rating)
    pred_rating = np.asarray(pred_rating)

    # Group record positions by user with one stable sort.
    order = np.argsort(users, kind='stable')
    _, starts = np.unique(users[order], return_index=True)

    n_wrong, n_pairs = 0, 0
    for idx in np.split(order, starts[1:]):
        if idx.size < 2:
            continue
        # All unordered pairs (a, b) with a before b inside this user's records.
        first, second = np.triu_indices(idx.size, k=1)
        diff_true = true_rating[idx[first]] - true_rating[idx[second]]
        diff_pred = pred_rating[idx[first]] - pred_rating[idx[second]]
        # Pairs with equal true ratings carry no ordering information.
        comparable = diff_true != 0
        n_pairs += int(np.count_nonzero(comparable))
        n_wrong += int(np.count_nonzero(diff_true[comparable] * diff_pred[comparable] <= 0))

    if n_pairs == 0:
        return float('nan')
    return n_wrong / n_pairs

### Load the `MovieLens` dataset

In [2]:
import numpy as np
import pandas as pd
# Read the ratings table; the timestamp column is dropped right away
df = pd.read_csv('./dataset/ml-latest-small/ratings.csv').drop(columns='timestamp')

In [3]:
## Map the raw user / movie ids to integer codes
from sklearn import preprocessing

le_user = preprocessing.LabelEncoder()
le_movie = preprocessing.LabelEncoder()

df = df.assign(
    userId=le_user.fit_transform(df['userId']),
    movieId=le_movie.fit_transform(df['movieId']),
)

## Hold out 33% of the records as the test set (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
dtrain, dtest = train_test_split(df, test_size=0.33, random_state=42)

## Keep the true test ratings aside for evaluation ...
test_rating = np.array(dtest['rating'])
## ... and strip them from the test frame to simulate prediction
dtest = dtest.drop(columns='rating')

In [4]:
# train_pair: (user, movie) index pairs; train_rating: the matching ratings
train_pair = dtrain[['userId', 'movieId']].to_numpy()
train_rating = dtrain['rating'].to_numpy()
# test_pair: (user, movie) index pairs whose ratings are to be predicted
test_pair = dtest[['userId', 'movieId']].to_numpy()
# table sizes: largest index seen in either split, plus one
n_user, n_item = np.vstack([train_pair, test_pair]).max(axis=0) + 1

## Part 1: report `NDCG` and `PR_loss` for existing RS methods

In [5]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, Flatten, Input, Dropout, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from IPython.display import SVG
from tensorflow import keras
from tensorflow.keras import layers

In [6]:
class NCF(keras.Model):
    """Rating model: user and movie embeddings fed through a small MLP.

    The user id and the movie id are each looked up in an embedding table
    (he_normal initialisation, L2 penalty 1e-2), the two vectors are
    concatenated, and the result passes through Dense layers of width
    16 (relu), 8 (relu) and 1 (linear).

    Parameters
    ----------
    num_users : int
        Number of rows of the user embedding table.
    num_movies : int
        Number of rows of the movie embedding table.
    embedding_size : int
        Length of each embedding vector.
    **kwargs
        Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size

        def build_embedding(n_rows):
            # both tables share the same size, initialiser and penalty
            return layers.Embedding(
                n_rows,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-2),
            )

        self.user_embedding = build_embedding(num_users)
        self.movie_embedding = build_embedding(num_movies)
        self.concatenate = layers.Concatenate()
        self.dense1 = layers.Dense(16, name='fc-1', activation='relu')
        self.dense2 = layers.Dense(8, name='fc-2', activation='relu')
        self.dense3 = layers.Dense(1, name='fc-3', activation='linear')

    def call(self, inputs):
        """Return one score per row of ``inputs`` (column 0: user, column 1: movie)."""
        user_ids, movie_ids = inputs[:, 0], inputs[:, 1]
        hidden = self.concatenate(
            [self.user_embedding(user_ids), self.movie_embedding(movie_ids)]
        )
        for dense in (self.dense1, self.dense2, self.dense3):
            hidden = dense(hidden)
        return hidden

In [7]:
model = NCF(num_users=n_user, num_movies=n_item, embedding_size=20)

# report MAE and RMSE next to the squared-error training loss
metrics = [
    keras.metrics.MeanAbsoluteError(name='mae'),
    keras.metrics.RootMeanSquaredError(name='rmse'),
]

model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-2),
    metrics=metrics,
)


2021-11-29 20:16:46.199261: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 20:16:46.205722: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 20:16:46.206010: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 20:16:46.206504: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [8]:
# Stop when validation RMSE has not improved for 5 epochs and
# restore the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_rmse',
    mode='auto',
    min_delta=0,
    patience=5,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stopping]

# 20% of the training records are held back for validation
history = model.fit(
    x=train_pair,
    y=train_rating,
    validation_split=.2,
    epochs=50,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping


In [9]:
## predict the held-out pairs, then report RMSE, NDCG and PR_loss
pred_rating = model.predict(test_pair).flatten()

rmse_ncf = np.sqrt(np.mean((pred_rating - test_rating)**2))
print('rmse: NCF: %.3f' %rmse_ncf)

ndcg_ncf = ndcg_rs(test_pair, true_rating=test_rating, pred_rating=pred_rating)
print('NDCG: NCF: %.3f' %ndcg_ncf)

pr_loss_ncf = PR_loss(test_pair, true_rating=test_rating, pred_rating=pred_rating)
print('PR_loss: NCF: %.3f' %pr_loss_ncf)

rmse: NCF: 0.935
NDCG: NCF: 0.878
PR_loss: NCF: 0.387


## Part 2: Fit Neural networks with `PR_loss`

## Step 1: pre-process the dataset
- `pair`: (u, i) -> `triple`: (u, i, i')
- `outcome`: np.sign( y_i - y_{i'} )

## Step 2: Define model with two-level construction
- `score` function, to produce the scores for each item under a user.
- `diff` function: takes the difference between the `score` values of the two items being compared.

## Step 3: Fit the neural network

In [10]:
## Step 1: pre-processing data

## get triple data
def gen_triple(pair, rating):
    """Turn (user, item) records into pairwise-comparison triples.

    For every user, every unordered pair of that user's records whose ratings
    differ yields one triple ``(user, item, item')`` and one label
    ``sign(rating_item - rating_item')``.  Pairs with equal ratings carry no
    ordering information and are dropped.

    Parameters
    ----------
    pair : array-like of shape (n_records, 2)
        Column 0 is the user id, column 1 the item id.
    rating : array-like of shape (n_records,)
        Ratings aligned row by row with ``pair``.

    Returns
    -------
    triple : ndarray of shape (n_triples, 3)
        Rows ``[user, item, item']``; users appear in ascending id order and,
        within a user, pairs follow the original record order.
    diff : ndarray of shape (n_triples,)
        ``+1`` when the first item is rated higher, ``-1`` otherwise.

    Notes
    -----
    Records are grouped by the *value* of the user id, so ids do not have to
    be the contiguous range ``0..n-1``.  When no triple can be formed the
    result is an empty ``(0, 3)`` array and an empty ``(0,)`` array.
    """
    pair = np.asarray(pair)
    rating = np.asarray(rating)
    users, items = pair[:, 0], pair[:, 1]

    # Group record positions by user with one stable sort instead of one
    # full scan of the array per user.
    order = np.argsort(users, kind='stable')
    uniq_users, starts = np.unique(users[order], return_index=True)

    triple_parts, diff_parts = [], []
    for user, idx in zip(uniq_users, np.split(order, starts[1:])):
        if idx.size < 2:
            continue
        # All unordered pairs inside this user's records, in the same order
        # as itertools.combinations(idx, 2).
        first, second = np.triu_indices(idx.size, k=1)
        left, right = idx[first], idx[second]
        sign = np.sign(rating[left] - rating[right])
        ## if diff is zero; no information; remove this triple
        keep = sign != 0
        if not keep.any():
            continue
        left, right = left[keep], right[keep]
        user_col = np.full(left.size, user, dtype=pair.dtype)
        triple_parts.append(np.column_stack([user_col, items[left], items[right]]))
        diff_parts.append(sign[keep])

    if not triple_parts:
        return np.empty((0, 3), dtype=pair.dtype), np.empty(0)
    return np.concatenate(triple_parts), np.concatenate(diff_parts)

In [11]:
## build (user, item, item') triples and the sign of each rating difference
train_triple, train_diff = gen_triple(pair=train_pair, rating=train_rating)
## recode the labels from {-1, 1} to {0, 1}
train_diff = ((train_diff + 1) / 2).astype(int)

### Step 2: Define ranking model with `score` component

In [12]:
# define model
class RankNCF(keras.Model):
    """Pairwise ranking model built on an embedding + MLP scorer.

    ``scorer`` gives one score to a (user, movie) pair: both ids are embedded
    (he_normal initialisation, L2 penalty 1e-2), the vectors are concatenated
    and passed through Dense layers of width 16 (relu), 8 (relu) and
    1 (linear).  ``call`` takes (user, movie1, movie2) triples and returns
    ``score(user, movie1) - score(user, movie2)``.

    Parameters
    ----------
    num_users : int
        Number of rows of the user embedding table.
    num_movies : int
        Number of rows of the movie embedding table.
    embedding_size : int
        Length of each embedding vector.
    **kwargs
        Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size

        def build_embedding(n_rows):
            # both tables share the same size, initialiser and penalty
            return layers.Embedding(
                n_rows,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-2),
            )

        self.user_embedding = build_embedding(num_users)
        self.movie_embedding = build_embedding(num_movies)
        self.concatenate = layers.Concatenate()
        self.dense1 = layers.Dense(16, name='fc-1', activation='relu')
        self.dense2 = layers.Dense(8, name='fc-2', activation='relu')
        self.dense3 = layers.Dense(1, name='fc-3', activation='linear')

    def scorer(self, user_id, movie_id):
        """Score each (user, movie) pair; both arguments are id vectors of equal length."""
        hidden = self.concatenate(
            [self.user_embedding(user_id), self.movie_embedding(movie_id)]
        )
        for dense in (self.dense1, self.dense2, self.dense3):
            hidden = dense(hidden)
        return hidden

    def call(self, inputs):
        """Score difference between the two movies of each (user, movie1, movie2) row."""
        user_id, movie1_id, movie2_id = inputs[:, 0], inputs[:, 1], inputs[:, 2]
        first_score = self.scorer(user_id, movie1_id)
        second_score = self.scorer(user_id, movie2_id)
        return first_score - second_score

In [13]:
model = RankNCF(num_users=n_user, num_movies=n_item, embedding_size=20)

# fraction of triples whose ordering is predicted correctly
metrics = [keras.metrics.BinaryAccuracy(name='binary_acc')]

# the model outputs a raw score difference, hence from_logits=True
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=1e-2),
    metrics=metrics,
)


In [14]:
# Stop when validation accuracy has not improved for 5 epochs and
# restore the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_binary_acc',
    mode='max',
    min_delta=0,
    patience=5,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stopping]

# 20% of the training triples are held back for validation
history = model.fit(
    x=train_triple,
    y=train_diff,
    validation_split=.2,
    epochs=50,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 00009: early stopping


In [15]:
# score every held-out (user, movie) pair with the trained scorer
test_users, test_movies = test_pair[:, 0], test_pair[:, 1]
pred_rating = model.scorer(user_id=test_users, movie_id=test_movies).numpy().flatten()

ndcg_rank = ndcg_rs(test_pair, true_rating=test_rating, pred_rating=pred_rating)
print('NDCG: RankNCF: %.3f' %ndcg_rank)

pr_loss_rank = PR_loss(test_pair, true_rating=test_rating, pred_rating=pred_rating)
print('PR_loss: RankNCF: %.3f' %pr_loss_rank)

NDCG: RankNCF: 0.869
PR_loss: RankNCF: 0.399
