In [1]:
import os
project_name = "reco-tut-mlh"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

In [2]:
if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

/content/reco-tut-mlh
Initialized empty Git repository in /content/reco-tut-mlh/.git/
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 39 (delta 7), reused 37 (delta 7), pack-reused 0[K
Unpacking objects: 100% (39/39), done.
From https://github.com/sparsh-ai/reco-tut-mlh
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [36]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mcode/build_features.py[m
	[31mcode/metrics.py[m
	[31mcode/models/[m
	[31mcode/utils.py[m

nothing added to commit but untracked files present (use "git add" to track)


In [38]:
!git pull --rebase origin main

From https://github.com/sparsh-ai/reco-tut-mlh
 * branch            main       -> FETCH_HEAD
Updating 96c6697..0ba38f3
Fast-forward
 code/metrics.py | 55 [32m+++++++++++++++++++++++++++++++++++++++++[m
 code/utils.py   | 76 [32m+++++++++++++++++++++++++++++++++++++++++++++++++++++++++[m
 2 files changed, 131 insertions(+)
 create mode 100644 code/metrics.py
 create mode 100644 code/utils.py
Current branch main is up to date.


In [39]:
!git add . && git commit -m 'commit' && git push origin "{branch}"

[main 2f2caf4] commit
 3 files changed, 754 insertions(+)
 create mode 100644 code/build_features.py
 create mode 100644 code/models/SVAE.py
 create mode 100644 code/models/__init__.py
Counting objects: 6, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 8.41 KiB | 8.42 MiB/s, done.
Total 6 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/sparsh-ai/reco-tut-mlh.git
   0ba38f3..2f2caf4  main -> main


In [3]:
import sys
sys.path.insert(0,'./code')

---

# Standard Variational Autoencoder (SVAE)

The Standard Variational Autoencoder (SVAE), SVAE uses an autoencoder to generate a salient feature representation of users, learning a latent vector for each user. The decoder then takes this latent representation and outputs a probability distribution over all items; we get probabilities of all the movies being watched by each user.

# Imports

In [27]:
import numpy as np
import os
import pandas as pd

from utils import numpy_stratified_split
import build_features
import metrics
from models import SVAE
from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()

# Prepare Data

In [6]:
fp = os.path.join('./data/bronze', 'u.data')
raw_data = pd.read_csv(fp, sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
print(f'Shape: {raw_data.shape}')
raw_data.sample(5, random_state=123)

Shape: (100000, 4)


Unnamed: 0,userId,movieId,rating,timestamp
42083,600,651,4,888451492
71825,607,494,5,883879556
99535,875,1103,5,876465144
47879,648,238,3,882213535
36734,113,273,4,875935609


In [7]:
# Binarize the data (only keep ratings >= 4)
df_preferred = raw_data[raw_data['rating'] > 3.5]
print (df_preferred.shape)
df_low_rating = raw_data[raw_data['rating'] <= 3.5]

df_preferred.head(10)

(55375, 4)


Unnamed: 0,userId,movieId,rating,timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
18,291,1042,4,874834944
20,119,392,4,886176814
21,167,486,4,892738452
22,299,144,4,877881320
24,308,1,4,887736532


In [8]:
# Keep users who clicked on at least 5 movies
df = df_preferred.groupby('userId').filter(lambda x: len(x) >= 5)

# Keep movies that were clicked on by at least on 1 user
df = df.groupby('movieId').filter(lambda x: len(x) >= 1)

print(df.shape)

(55361, 4)


In [9]:
# Obtain both usercount and itemcount after filtering
usercount = df[['userId']].groupby('userId', as_index = False).size()
itemcount = df[['movieId']].groupby('movieId', as_index = False).size()

# Compute sparsity after filtering
sparsity = 1. * raw_data.shape[0] / (usercount.shape[0] * itemcount.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], usercount.shape[0], itemcount.shape[0], sparsity * 100))

After filtering, there are 100000 watching events from 938 users and 1447 movies (sparsity: 7.368%)


## Split

In [10]:
unique_users =sorted(df.userId.unique())
np.random.seed(123)
unique_users = np.random.permutation(unique_users)

In [11]:
HELDOUT_USERS = 200

# Create train/validation/test users
n_users = len(unique_users)
print("Number of unique users:", n_users)

train_users = unique_users[:(n_users - HELDOUT_USERS * 2)]
print("\nNumber of training users:", len(train_users))

val_users = unique_users[(n_users - HELDOUT_USERS * 2) : (n_users - HELDOUT_USERS)]
print("\nNumber of validation users:", len(val_users))

test_users = unique_users[(n_users - HELDOUT_USERS):]
print("\nNumber of test users:", len(test_users))

Number of unique users: 938

Number of training users: 538

Number of validation users: 200

Number of test users: 200


In [12]:
# For training set keep only users that are in train_users list
train_set = df.loc[df['userId'].isin(train_users)]
print("Number of training observations: ", train_set.shape[0])

# For validation set keep only users that are in val_users list
val_set = df.loc[df['userId'].isin(val_users)]
print("\nNumber of validation observations: ", val_set.shape[0])

# For test set keep only users that are in test_users list
test_set = df.loc[df['userId'].isin(test_users)]
print("\nNumber of test observations: ", test_set.shape[0])

# train_set/val_set/test_set contain user - movie interactions with rating 4 or 5

Number of training observations:  32491

Number of validation observations:  11647

Number of test observations:  11223


In [13]:
# Obtain list of unique movies used in training set
unique_train_items = pd.unique(train_set['movieId'])
print("Number of unique movies that rated in training set", unique_train_items.size)

Number of unique movies that rated in training set 1346


In [14]:
# For validation set keep only movies that used in training set
val_set = val_set.loc[val_set['movieId'].isin(unique_train_items)]
print("Number of validation observations after filtering: ", val_set.shape[0])

# For test set keep only movies that used in training set
test_set = test_set.loc[test_set['movieId'].isin(unique_train_items)]
print("\nNumber of test observations after filtering: ", test_set.shape[0])

Number of validation observations after filtering:  11562

Number of test observations after filtering:  11155


In [16]:
# Instantiate the sparse matrix generation for train, validation and test sets
# use list of unique items from training set for all sets
am_train = build_features.AffinityMatrix(df=train_set, items_list=unique_train_items)

am_val = build_features.AffinityMatrix(df=val_set, items_list=unique_train_items)

am_test = build_features.AffinityMatrix(df=test_set, items_list=unique_train_items)

In [17]:
# Obtain the sparse matrix for train, validation and test sets
train_data, _, _ = am_train.gen_affinity_matrix()
print(train_data.shape)

val_data, val_map_users, val_map_items = am_val.gen_affinity_matrix()
print(val_data.shape)

test_data, test_map_users, test_map_items = am_test.gen_affinity_matrix()
print(test_data.shape)

(538, 1346)
(200, 1346)
(200, 1346)


In [21]:
# Split validation and test data into training and testing parts
val_data_tr, val_data_te = numpy_stratified_split(val_data, ratio=0.75, seed=123)
test_data_tr, test_data_te = numpy_stratified_split(test_data, ratio=0.75, seed=123)

In [22]:
# Binarize train, validation and test data
train_data = np.where(train_data > 3.5, 1.0, 0.0)
val_data = np.where(val_data > 3.5, 1.0, 0.0)
test_data = np.where(test_data > 3.5, 1.0, 0.0)

# Binarize validation data: training part  
val_data_tr = np.where(val_data_tr > 3.5, 1.0, 0.0)
# Binarize validation data: testing part (save non-binary version in the separate object, will be used for calculating NDCG)
val_data_te_ratings = val_data_te.copy()
val_data_te = np.where(val_data_te > 3.5, 1.0, 0.0)

# Binarize test data: training part 
test_data_tr = np.where(test_data_tr > 3.5, 1.0, 0.0)

# Binarize test data: testing part (save non-binary version in the separate object, will be used for calculating NDCG)
test_data_te_ratings = test_data_te.copy()
test_data_te = np.where(test_data_te > 3.5, 1.0, 0.0)

In [23]:
# retrieve real ratings from initial dataset 

test_data_te_ratings=pd.DataFrame(test_data_te_ratings)
val_data_te_ratings=pd.DataFrame(val_data_te_ratings)

for index,i in df_low_rating.iterrows():
    user_old= i['userId'] # old value 
    item_old=i['movieId'] # old value 

    if (test_map_users.get(user_old) is not None)  and (test_map_items.get(item_old) is not None) :
        user_new=test_map_users.get(user_old) # new value 
        item_new=test_map_items.get(item_old) # new value 
        rating=i['rating'] 
        test_data_te_ratings.at[user_new,item_new]= rating   

    if (val_map_users.get(user_old) is not None)  and (val_map_items.get(item_old) is not None) :
        user_new=val_map_users.get(user_old) # new value 
        item_new=val_map_items.get(item_old) # new value 
        rating=i['rating'] 
        val_data_te_ratings.at[user_new,item_new]= rating   


val_data_te_ratings=val_data_te_ratings.to_numpy()    
test_data_te_ratings=test_data_te_ratings.to_numpy()    

# SVAE

In [29]:
INTERMEDIATE_DIM = 200
LATENT_DIM = 64
EPOCHS = 400
BATCH_SIZE = 100

In [30]:
model = SVAE.StandardVAE(n_users=train_data.shape[0], # Number of unique users in the training set
                                   original_dim=train_data.shape[1], # Number of unique items in the training set
                                   intermediate_dim=INTERMEDIATE_DIM, 
                                   latent_dim=LATENT_DIM, 
                                   n_epochs=EPOCHS, 
                                   batch_size=BATCH_SIZE, 
                                   k=10,
                                   verbose=0,
                                   seed=123,
                                   drop_encoder=0.5,
                                   drop_decoder=0.5,
                                   annealing=False,
                                   beta=1.0
                                   )

In [31]:
%%time
model.fit(x_train=train_data,
          x_valid=val_data,
          x_val_tr=val_data_tr,
          x_val_te=val_data_te_ratings, # with the original ratings
          mapper=am_val
          )



CPU times: user 3min 58s, sys: 8.51 s, total: 4min 7s
Wall time: 3min 33s


# Recommend

In [32]:
# Model prediction on the training part of test set 
top_k =  model.recommend_k_items(x=test_data_tr,k=10,remove_seen=True)

# Convert sparse matrix back to df
recommendations = am_test.map_back_sparse(top_k, kind='prediction')
test_df = am_test.map_back_sparse(test_data_te_ratings, kind='ratings') # use test_data_te_, with the original ratings

## Evaluation metrics

In [33]:
# Create column with the predicted movie's rank for each user 
top_k = recommendations.copy()
top_k['rank'] = recommendations.groupby('userId', sort=False).cumcount() + 1  # For each user, only include movies recommendations that are also in the test set

In [34]:
precision_at_k = metrics.precision_at_k(top_k, test_df, 'userId', 'movieId', 'rank')
recall_at_k = metrics.recall_at_k(top_k, test_df, 'userId', 'movieId', 'rank')
mean_average_precision = metrics.mean_average_precision(top_k, test_df, 'userId', 'movieId', 'rank')
ndcg = metrics.ndcg(top_k, test_df, 'userId', 'movieId', 'rank')

In [35]:
print(f'Precision: {precision_at_k:.6f}',
      f'Recall: {recall_at_k:.6f}',
      f'MAP: {mean_average_precision:.6f} ',
      f'NDCG: {ndcg:.6f}', sep='\n')

Precision: 0.036500
Recall: 0.007136
MAP: 0.003726 
NDCG: 0.049065


# References


1.   Kilol Gupta, Mukunds Y. Raghuprasad, Pankhuri Kumar, A Hybrid Variational Autoencoder for Collaborative Filtering, 2018, https://arxiv.org/pdf/1808.01006.pdf

2.   Microsoft SVAE implementation: https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/standard_vae_deep_dive.ipynb
