# CML on ML-1m

In [None]:
import os
# Repository coordinates used by the git/DVC bootstrap cells of this notebook.
account = "sparsh-ai"
project_name = "reco-chef"
branch = "ml1m"

# Local checkout directory for the project.
project_path = os.path.join('/content', project_name)

In [None]:
# One-time bootstrap of the working copy. When the project directory does not
# exist yet: install DVC, copy stored git credentials into the home directory,
# create the directory, and pull/check out `branch` from the GitHub remote.
# Otherwise just change into the existing checkout.
if not os.path.exists(project_path):
    !pip install -U -q dvc dvc[gdrive]
    # NOTE(review): presumably requires Google Drive to be mounted at
    # /content/drive before this cell runs — confirm.
    !cp -r /content/drive/MyDrive/git_credentials/. ~
    # Same location as `project_path`, rebuilt by string concatenation.
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    !git init
    !git remote add origin https://github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout "{branch}"
else:
    %cd "{project_path}"

[K     |████████████████████████████████| 644 kB 8.3 MB/s 
[K     |████████████████████████████████| 40 kB 15 kB/s 
[K     |████████████████████████████████| 530 kB 56.2 MB/s 
[K     |████████████████████████████████| 211 kB 70.5 MB/s 
[K     |████████████████████████████████| 44 kB 2.6 MB/s 
[K     |████████████████████████████████| 296 kB 71.7 MB/s 
[K     |████████████████████████████████| 170 kB 77.2 MB/s 
[K     |████████████████████████████████| 119 kB 46.6 MB/s 
[K     |████████████████████████████████| 49 kB 5.6 MB/s 
[K     |████████████████████████████████| 4.6 MB 69.2 MB/s 
[K     |████████████████████████████████| 109 kB 60.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 201 kB 40.5 MB/s 
[K     |████████████████████████████████| 64 kB 2.8 MB/s 
[K     |████████████████████████████████| 2.6 MB 51.6 MB/s 
[K     |████████████████████████████████| 51 kB 6.1 MB/s 
[K     |█████████████████████████████

In [None]:
# Show the current state of the git working tree.
!git status

In [None]:
# Stage everything, commit with the fixed message 'commit', and push to `branch`.
# The `&&` chain stops at the first command that fails.
!git add . && git commit -m 'commit' && git push origin "{branch}"

In [None]:
# Pull the data referenced by the .dvc files under data/bronze/ml-1m.
# The recorded run prompted for a Google account verification code.
!dvc pull ./data/bronze/ml-1m/*.dvc

  0% 0/1 [00:00<?, ?file/s{'info': ''}]Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Enter verification code: <redacted>
Authentication successful.
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |9fd9ff8d1b33faf73e2122c371e910     0.00/? [00:00<?,        ?B/s][A
9fd9ff8d1b33faf73e2122c371e910:   0% 0.00/167k [00:00<?, ?B/s{'info': ''}]      [A
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |9aa3591bc97d6d4e0c89459ff39362     0.00/? [00:00<?,        ?B/s][A
9aa3591bc97d6d4e0c89459ff39362:   0% 0.00/23.5M [00:00<?, ?B/s{'info': ''}]     [A

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm.notebook import tqdm
import random
import heapq

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses, models
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model

In [None]:
# Seed every random number generator the notebook uses so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# GPU devices TensorFlow lists; consumed by the memory-growth cell.
gpus = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# When at least one GPU was listed, enable memory growth on the first one.
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # Report the error instead of aborting the notebook run.
        print(e)

In [None]:
# Print how many GPU devices TensorFlow lists.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
def mish(x):
    """Return ``x * tanh(softplus(x))`` computed with TensorFlow ops."""
    gate = tf.math.tanh(tf.math.softplus(x))
    return x * gate

def leakyrelu(x, factor=0.2):
    """Return the element-wise maximum of ``x`` and ``factor * x``.

    Args:
        x: Input tensor (or anything ``tf.maximum`` accepts).
        factor: Multiplier applied to ``x`` for the second operand.
    """
    scaled = factor * x
    return tf.maximum(x, scaled)

In [None]:
def load_data(filepath, threshold=0):
    """Read a ``::``-separated ratings file into an implicit-feedback frame.

    Args:
        filepath: Path to a header-less file whose rows are
            ``userId::movieId::rating::time``.
        threshold: When greater than 0, ``rating`` becomes 1 where the raw
            rating is strictly above the threshold and 0 otherwise. When 0 or
            less, every row gets ``rating == 1.0``.

    Returns:
        DataFrame with columns ``userId``, ``movieId``, ``rating``. Both id
        columns are replaced by their zero-based category codes.
    """
    column_names = ['userId', 'movieId', 'rating', 'time']
    raw = pd.read_csv(filepath, sep="::", header=None, engine='python', names=column_names)

    # The timestamp is not used anywhere downstream.
    ratings = raw.drop(columns='time')
    ratings = ratings.astype({'userId': int, 'movieId': int, 'rating': float})
    ratings = ratings[['userId', 'movieId', 'rating']]

    if threshold > 0:
        ratings['rating'] = np.where(ratings['rating'] > threshold, 1, 0)
    else:
        ratings['rating'] = 1.

    # Re-encode raw ids as dense category codes.
    for id_column in ('movieId', 'userId'):
        ratings[id_column] = ratings[id_column].astype('category').cat.codes

    return ratings
    

def add_negative(df, uiid, times=4):
    """Return a copy of ``df`` with sampled negative rows appended per user.

    For every user, items from ``uiid`` that the user has no row for are
    sampled without replacement and appended as ``[userId, movieId, 0]`` rows.

    Args:
        df: Interaction frame whose columns are ``userId``, ``movieId`` and a
            rating column, in that order (new rows are built positionally).
        uiid: Iterable of all candidate item ids.
        times: Number of negatives requested per existing row of the user.

    Returns:
        A new DataFrame (``df`` itself is not modified) with a fresh
        ``0..n-1`` index: the original rows followed by the negatives.
    """
    all_items = set(uiid)
    n_items = len(df['movieId'].unique())

    negative_rows = []
    # sort=False keeps users in first-appearance order, like Series.unique().
    for user, user_items in tqdm(df.groupby('userId', sort=False)['movieId']):
        n = len(user_items)
        available_negative = list(all_items - set(user_items.values))
        # Never ask for more negatives than exist (or for a negative count):
        # np.random.choice(..., replace=False) would raise in both cases.
        n_negative = max(0, min(n * times, n_items - n - 1, len(available_negative)))
        if n_negative == 0:
            continue

        picked = np.random.choice(available_negative, n_negative, replace=False)
        negative_rows.extend([user, item, 0] for item in picked)

    if not negative_rows:
        return df.copy().reset_index(drop=True)

    # Build the negatives once and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    negatives = pd.DataFrame(negative_rows, columns=df.columns)
    return pd.concat([df, negatives], ignore_index=True)

def extract_from_df(df, n_positive, n_negative):
    df_ = df.copy()
    rtd = []
    
    user_id = df['userId'].unique()
    
    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==1]['movieId'].index, n_positive, replace=False))
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==0]['movieId'].index, n_negative, replace=False))
        
    return rtd

In [None]:
def eval_NDCG(true, pred):
    """Score a ranked list against the held-out item(s).

    Args:
        true: The relevant item id, or an array-like of relevant item ids
            (for example the ``.values`` of a one-row selection).
        pred: Ranked item ids, best first.

    Returns:
        ``1 / log2(rank + 1)`` for the first (1-based) rank in ``pred`` that
        holds a relevant item, or 0 when none is present or ``true`` is empty.
    """
    # Normalise scalars and arrays to a set so membership is well defined;
    # `item == true` on an array is ambiguous for 0 or 2+ elements.
    targets = set(np.asarray(true).ravel().tolist())

    for rank, item in enumerate(pred, 1):
        if item in targets:
            return 1 / np.log2(rank + 1)
    return 0

## CML

### Load data

In [None]:
# Load the ML-1m ratings; with threshold=3, ratings above 3 become 1 and the rest 0.
df = load_data('./data/bronze/ml-1m/ratings.dat', threshold=3)
df.head()

Unnamed: 0,userId,movieId,rating
0,0,1104,1
1,0,639,0
2,0,853,0
3,0,3177,1
4,0,2162,1


### Preprocessing

In [None]:
# Keep positive interactions only.
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)

# Keep users with at least 10 positives. Select them by user id (the pivot's
# index labels) rather than by row position: np.where(...) returns positions,
# which stop matching user ids as soon as any user is missing from the pivot.
cnt = tdf.sum(1)
df = df[df['userId'].isin(cnt.index[cnt >= 10])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)

# Leave-one-out split: hold out one randomly chosen interaction per user.
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))

# df has a fresh 0..n-1 index, so labels and positions coincide here. Dropping
# by label keeps the original row order, and the explicit copies let later
# cells add columns without a SettingWithCopyWarning.
train = df.drop(index=test_idx).copy()
test = df.iloc[test_idx, :].copy()
def make_trpilet(df, uiid):
    """Add a ``neg`` column holding one sampled negative item per row.

    For each row, the negative is drawn uniformly from the items in ``uiid``
    for which that row's user has no row in ``df``.

    Args:
        df: Interaction frame with ``userId`` and ``movieId`` columns. It is
            modified in place (the ``neg`` column is added or overwritten).
        uiid: Iterable of all candidate item ids.

    Returns:
        The same ``df`` object, now with the ``neg`` column.

    Raises:
        ValueError: From ``np.random.choice`` when a user has no candidate
            negative left.
    """
    all_items = set(uiid)

    # Candidate negatives per user, materialised once as arrays so the
    # per-row draw does not copy a list each time.
    uid_map = {
        user: np.array(list(all_items - set(items.unique())))
        for user, items in df.groupby('userId')['movieId']
    }

    # Iterate over the user column directly; indexing df.values row by row
    # can rebuild the whole array on every access.
    negs = [np.random.choice(uid_map[user]) for user in tqdm(df['userId'].to_numpy())]

    df['neg'] = negs
    return df

### Model architecture

In [None]:
class CML(models.Model):
    """Metric-learning recommender trained with a hinge loss on squared distances.

    Users and items share one embedding space. ``call`` scores a (user, item)
    pair as the negative squared Euclidean distance between their embeddings.
    Training batches are integer arrays whose columns are
    ``[user, positive item, negative item, (more negative items...)]``.

    Args:
        n_users: Size of the user embedding table.
        n_items: Size of the item embedding table.
        emb_dim: Embedding dimension.
        feature_shape: Optional shape of per-item features. When given, an MLP
            projecting features into the embedding space is created.
    """

    def __init__(self, n_users, n_items, emb_dim, feature_shape=None):
        super().__init__()
        self.emb_dim = emb_dim
        self.feature_shape = feature_shape
        # Item features read by _feature_projection(). Nothing in this class
        # sets them; assign them on the instance before using feature losses.
        self.features = None
        self.margin = 1.
        # Rank weighting of the hinge loss is not implemented.
        self.use_cov_loss = False

        # reg weights
        self.feature_l2_reg = 0.1
        self.feature_projection_scaling_factor = 0.5
        self.cov_loss_weight = 0.1

        self.clip_norm = 1.

        self.user_embedding = Embedding(n_users, emb_dim)
        self.item_embedding = Embedding(n_items, emb_dim)

        if self.feature_shape is not None:
            self.mlp = Sequential([
                Dense(self.feature_shape[0], activation='relu'),
                Dense(emb_dim)
            ])

    def call(self, inputs):
        """Return the negative squared user-item distance for each row.

        ``inputs`` holds user ids in column 0 and item ids in column 1.
        """
        user = inputs[:, 0]
        item = inputs[:, 1]

        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)

        return -tf.reduce_sum(
            tf.square(user_emb - item_emb), 1
        )

    def train_step(self, inputs):
        """Run one optimisation step on a batch and report its summed loss."""
        # Some Keras versions wrap an x-only batch in a 1-tuple.
        if isinstance(inputs, (tuple, list)):
            inputs = inputs[0]

        with tf.GradientTape() as tape:
            loss = self.get_loss(inputs)

        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {'loss': loss}

    def get_loss(self, inputs):
        """Embedding hinge loss, plus the covariance penalty when enabled."""
        loss = self._embedding_loss(inputs)
        if self.use_cov_loss:
            loss += self._covariance_loss()
        return loss

    def _embedding_loss(self, inputs):
        """Summed hinge loss over the batch.

        For each row the positive item must be closer to the user than the
        row's closest negative item by at least ``self.margin``.
        """
        X = inputs
        users = self.user_embedding(X[:, 0])          # (N, D)
        pos_items = self.item_embedding(X[:, 1])      # (N, D)
        # Every column from index 2 on is a negative item id.
        neg_items = self.item_embedding(X[:, 2:])     # (N, W, D)

        pos_distances = tf.reduce_sum((users - pos_items) ** 2, 1)  # (N,)
        distance_to_neg_items = tf.reduce_sum(
            (tf.expand_dims(users, 1) - neg_items) ** 2, 2
        )  # (N, W)

        # Closest negative per row. Reducing over axis 1 keeps one value per
        # pair; a reduction over the whole batch would compare every pair
        # against the single closest negative anywhere in the batch.
        closest_negative_item_distances = tf.reduce_min(distance_to_neg_items, 1)  # (N,)

        # Hinge loss per (user, positive) pair.
        distance = pos_distances - closest_negative_item_distances + self.margin
        loss_per_pair = tf.nn.relu(distance)

        return tf.reduce_sum(loss_per_pair)

    def _feature_projection(self):
        """Project item features into the embedding space, or return None.

        Returns None when no features are set or no MLP was built.
        """
        if self.features is None or self.feature_shape is None:
            return None
        output = self.mlp(self.features) * self.feature_projection_scaling_factor
        return tf.clip_by_norm(output, self.clip_norm)

    def _feature_loss(self):
        """L2 penalty between item embeddings and projected item features.

        Returns 0 when there is no feature projection.
        """
        feature_projection = self._feature_projection()
        if feature_projection is None:
            return 0
        # weights is a list; the embedding matrix is its first entry.
        item_matrix = self.item_embedding.weights[0]
        return tf.reduce_sum((item_matrix - feature_projection) ** 2) * self.feature_l2_reg

    def _covariance_loss(self):
        """Penalty on the off-diagonal sum of the embedding covariance matrix."""
        X = tf.concat([self.item_embedding.weights[0], self.user_embedding.weights[0]], 0)
        n_rows = X.shape[0]
        X -= tf.reduce_mean(X, 0)
        cov = tf.matmul(X, X, transpose_a=True) / n_rows
        # Sum of all entries minus the diagonal leaves the off-diagonal terms.
        loss = tf.reduce_sum(cov) - tf.linalg.trace(cov)
        return loss * self.cov_loss_weight

## Training

In [None]:
# Add a sampled negative item ('neg' column) to every training row, drawing
# candidates from all movie ids present in df.
train = make_trpilet(train, df['movieId'].unique())
train.head()

  0%|          | 0/564569 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,userId,movieId,rating,neg
0,0,1104,1,801
1,0,3177,1,2573
2,0,2162,1,326
3,0,1195,1,555
4,0,2599,1,989


In [None]:
# Embedding table sizes: largest id + 1, so every id in df is a valid row index.
n_user = df['userId'].unique().max()+1
n_item = df['movieId'].unique().max()+1

In [None]:
model = CML(n_user, n_item, 16)
# CML.train_step computes its own hinge loss, so no Keras loss is configured.
model.compile(optimizer='adam')
# The model reads batch columns as (user, positive item, negative item).
# train.values is (userId, movieId, rating, neg), which would feed `rating`
# in as the negative item id, so select the triplet columns explicitly.
model.fit(train[['userId', 'movieId', 'neg']].values, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f43818c7c10>

### Evaluation

In [None]:
# Leave-one-out evaluation: rank each user's held-out item against up to 100
# sampled items the user has not interacted with, and score NDCG@top_k.
uiid = df['movieId'].unique()
top_k = 10

scores = []
for user in tqdm(df['userId'].unique()):
    # Held-out item of *this* user (the split keeps exactly one per user).
    test_ = test[(test['userId']==user) & (test['rating']==1)]['movieId'].values
    if len(test_) == 0:
        continue

    # Score every item for this user.
    user_in = np.full((len(uiid)), user)
    inputs = np.dstack([user_in, uiid])[0]
    preds = model.predict(inputs, verbose=0)
    item_to_pred = dict(zip(uiid, preds))

    # Candidate negatives: items that are neither training positives nor the
    # held-out item. Sets make the membership checks constant-time.
    used = set(train[train['userId']==user]['movieId'].values.tolist())
    excluded = used | set(test_.tolist())
    candidates = [item for item in uiid if item not in excluded]

    # Sample without replacement so the candidate list has no duplicates.
    n_sampled = min(100, len(candidates))
    sampled = list(np.random.choice(candidates, n_sampled, replace=False)) if n_sampled else []

    items = sampled + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)

    score = eval_NDCG(test_[0], top_k_items)
    scores.append(score)

np.mean(scores)

  0%|          | 0/5948 [00:00<?, ?it/s]

0.053137418352365455