## Introduction and short model overview
Here I will be implementing the GCMC (Graph convolutional matrix completion) model from [this paper](https://arxiv.org/pdf/1706.02263.pdf).\
Some hints and ideas are taken from [here](https://github.com/YuxuanLongBeyond/Graph-based-Recommendation-System/blob/master/scripts/)\
The model has the following structure:\
![](./../reports/figures/model_overview.png)


Initially we have the bipartite graph of users and items. We use this bipartite graph to obtain embeddings for users and items (not the ones that we have in the data like genres. Those embeddings will be included later) using message passing. Then, using those embeddings we use bilinear decoder to create confidence map for every rating (so map for rating 1, different map for rating 2 and so on.). Using those maps we choose the rating of item for some user.

In [2]:
import pandas as pd
import os
import numpy as np
import time
import torch
import random
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import torch.sparse as sp
import torch.nn as nn

In [3]:
seed = 1337
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [4]:
RUN_ON_GPU = torch.cuda.is_available()

In [5]:
RUN_ON_GPU

False

## Reading the data
All 3 are tensors

In [6]:
ratings = torch.load("./../data/interim/ratings.pt").float()
items = torch.load("./../data/interim/movies.pt").float() # Contains only information about genre 
users = torch.load("./../data/interim/users.pt").float() # info about gender, age, and occupation

In [7]:
num_users, num_items = ratings.shape

## Splitting the ratings into train and test.
We create a mask over existing ratings and divide this mask into train mask and test mask

In [8]:
split_ratio = 0.8 # 80% for train

In [9]:
mask = ratings > 0 
mask_new = mask + np.random.uniform(0, 1, (num_users, num_items))
train_mask = (mask_new <= (1 + split_ratio)) & mask
test_mask = (mask_new > (1 + split_ratio)) & mask

In [10]:
# check that we are ok
test_mask.sum()/(test_mask.sum() + train_mask.sum())

tensor(0.2009)

In [11]:
# Divide our rating tensor into train and test matrices

ratings_train = ratings.clone().detach()
ratings_train[test_mask] = 0

ratings_test = ratings.clone().detach()
ratings_test[train_mask] = 0

## Here we define the matrices M_r.

In [12]:
def normalize(M):
    '''
    Function to normalize M_r. Remember, when calculating H we have left-multiplication by D^-1, that is exactly the thing we do here
    '''
    s = torch.sum(M, axis = 1)
    s[s == 0] = 1
    return (M.T / s).T

In [13]:
all_M_u = []
all_M_v = []
all_M = [] # For Loss
for i in range(5): # We have 1,2,3,4,5 ratings
    # Shows which elements have rating i+1
    M_r = ratings_train == (i + 1) # because start from 0, but ratings from 1

    
    all_M_u.append(normalize(M_r))
    all_M_v.append(normalize(M_r.T))
    all_M.append(M_r.float())
    
mask = ratings_train > 0   # for train

In [14]:
all_M = torch.stack(all_M)

## Here we define initial embeddings according to the paper.
Remember, the stack of features should give identity matrix

In [15]:
### input feature generation
feature_dim = num_users + num_items
I = torch.eye(num_users + num_items)
feature_u = I[:num_users, :] 
feature_v = I[num_users :, :]

## Now lets define the model
Once again, the model is the encoder and decoder together. Their description can be seen in the report or original paper

In [16]:
def sparse_drop(feature, drop_out):
    '''
    Node dropout
    drop_out - probability of dropping the output messages of node
    '''
    tem = torch.rand((feature._nnz())) # nnz is number of non zero elements
    feature._values()[tem < drop_out] = 0
    return feature

class GCMC(nn.Module):
    def __init__(self, feature_u, 
                 feature_v, 
                 feature_dim, 
                 hidden_dim, 
                 rate_num, 
                 all_M_u, 
                 all_M_v, 
                 side_hidden_dim, 
                 side_feature_u, 
                 side_feature_v, 
                 out_dim, 
                 drop_out = 0.0):
        super(GCMC, self).__init__()
        
        self.drop_out = drop_out
        
        side_feature_u_dim = side_feature_u.shape[1]
        side_feature_v_dim = side_feature_v.shape[1]

        self.feature_u = feature_u
        self.feature_v = feature_v
        self.rate_num = rate_num
        
        self.num_user = feature_u.shape[0]
        self.num_item = feature_v.shape[1]
        
        self.side_feature_u = side_feature_u
        self.side_feature_v = side_feature_v
        
        self.W = nn.Parameter(torch.randn(rate_num, feature_dim, hidden_dim))
        nn.init.kaiming_normal_(self.W, mode = 'fan_out', nonlinearity = 'relu')
        
        self.all_M_u = all_M_u
        self.all_M_v = all_M_v
        
        self.reLU = nn.ReLU()


        
        # Side features tranaformations 
        self.linear_layer_side_u = nn.Sequential(*[nn.Linear(side_feature_u_dim, side_hidden_dim, bias = True), 
                                                    nn.BatchNorm1d(side_hidden_dim), nn.ReLU()])
        self.linear_layer_side_v = nn.Sequential(*[nn.Linear(side_feature_v_dim, side_hidden_dim, bias = True), 
                                                    nn.BatchNorm1d(side_hidden_dim), nn.ReLU()])
        
        
        # transformations of final embeddings
        self.linear_cat_u = nn.Sequential(*[nn.Linear(rate_num * hidden_dim * 2 + side_hidden_dim, out_dim, bias = True), 
                                            nn.BatchNorm1d(out_dim), nn.ReLU()])
        self.linear_cat_v = nn.Sequential(*[nn.Linear(rate_num * hidden_dim * 2 + side_hidden_dim, out_dim, bias = True), 
                                            nn.BatchNorm1d(out_dim), nn.ReLU()])   

        
        # for decoder
        self.Q = nn.Parameter(torch.randn(rate_num, out_dim, out_dim))
        nn.init.orthogonal_(self.Q)
        
        
    def forward(self):

        # Here is the node drop + normalization to have no problems with mean
        feature_u_drop = sparse_drop(self.feature_u, self.drop_out) / (1.0 - self.drop_out)
        feature_v_drop = sparse_drop(self.feature_v, self.drop_out) / (1.0 - self.drop_out)
        
        hidden_feature_u = []
        hidden_feature_v = []
        
        W_list = torch.split(self.W, self.rate_num) 
        W_flat = []
        for i in range(self.rate_num): # iterate over every rating
            Wr = W_list[0][i]
            
            M_u = self.all_M_u[i]
            M_v = self.all_M_v[i] # Just M_u transposed
            
            # H_u from paper. The embeddings
            hidden_u = sp.mm(feature_v_drop, Wr)
            hidden_u = self.reLU(sp.mm(M_u, hidden_u))

            # H_v
            hidden_v = sp.mm(feature_u_drop, Wr)
            hidden_v = self.reLU(sp.mm(M_v, hidden_v))

            
            hidden_feature_u.append(hidden_u)
            hidden_feature_v.append(hidden_v)
            
            W_flat.append(Wr)

        
        hidden_feature_u = torch.cat(hidden_feature_u, dim = 1)
        hidden_feature_v = torch.cat(hidden_feature_v, dim = 1)
        # Now we have H_u and H_v. Note that there is no non-linearity at the end, because we did ReLU before and ReLU can be done separately.
        W_flat = torch.cat(W_flat, dim = 1) 

        # Here we add self-messages
        cat_u = torch.cat((hidden_feature_u, torch.mm(self.feature_u, W_flat)), dim = 1)
        cat_v = torch.cat((hidden_feature_v, torch.mm(self.feature_v, W_flat)), dim = 1)

        # Here we transform side-featurs and add them to our embeddings
        side_hidden_feature_u = self.linear_layer_side_u(self.side_feature_u)
        side_hidden_feature_v = self.linear_layer_side_v(self.side_feature_v)    
        cat_u = torch.cat((cat_u, side_hidden_feature_u), dim = 1)
        cat_v = torch.cat((cat_v, side_hidden_feature_v), dim = 1)
        
        # Final embeddings
        embed_u = self.linear_cat_u(cat_u)
        embed_v = self.linear_cat_v(cat_v)

        
        # Decoder part -------------------------
        
        score = [] # Confidence map
        
        Q_list = torch.split(self.Q, self.rate_num)
        for i in range(self.rate_num):
            Qr = Q_list[0][i]
            tem = torch.mm(torch.mm(embed_u, Qr), torch.t(embed_v))
            
            score.append(tem)

        score = torch.stack(score)
        return score

## Utilities

In [17]:
def to_sparse(x):
    '''
    Function to convert dense tensor x to sparse tensor
    
    '''
    # Save the original type
    x_typename = torch.typename(x).split('.')[-1] # FloatTensor usually
    sparse_tensortype = getattr(torch.sparse, x_typename) # torch.sparse.FloatTensor usually

    indices = torch.nonzero(x)
    if len(indices.shape) == 0: # If all elements are zero, then we return zero sparse tensor 
        return sparse_tensortype(*x.shape)

    # Creating the sparse tensor according to documentation
    indices = indices.t()
    values = x[tuple(indices[i] for i in range(indices.shape[0]))]
    return sparse_tensortype(indices, values, x.size())

In [18]:
def create_model(feature_u, feature_v, feature_dim, hidden_dim, rate_num, all_M_u, all_M_v, 
                 side_hidden_dim, side_feature_u, side_feature_v, out_dim, drop_out = 0.0):
    ''' 
    This function prepares the model. 
    It 1) converts neccesary tensors to sparse format 2) moves model to cuda 3) returns the model
    '''
    for i in range(rate_num):
        all_M_u[i] = to_sparse(all_M_u[i])
        all_M_v[i] = to_sparse(all_M_v[i])
    
    feature_u = to_sparse(feature_u)
    feature_v = to_sparse(feature_v)

    net = GCMC(feature_u, feature_v, feature_dim, hidden_dim, rate_num, all_M_u, all_M_v, 
                 side_hidden_dim, side_feature_u, side_feature_v, out_dim, drop_out)

    if RUN_ON_GPU:
        print('Moving models to GPU.')
        net.cuda()
    else:
        print('Keeping models on CPU.')

    return net

In [19]:
class Loss(nn.Module):
    def __init__(self, all_M, mask, user_item_matrix):
            
        super(Loss, self).__init__()
            
        self.all_M = all_M
        self.mask = mask
        self.user_item_matrix = user_item_matrix
        
        self.rate_num = all_M.shape[0]
        self.num = float(mask.sum())
        
        self.logsm = nn.LogSoftmax(dim = 0)
        self.sm = nn.Softmax(dim = 0)
        
    def cross_entropy(self, score):
        l = torch.sum(-self.all_M * self.logsm(score))
        return l / self.num
    
    def rmse(self, score):
        score_list = torch.split(self.sm(score), self.rate_num)
        total_score = 0
        for i in range(self.rate_num):
            total_score += (i + 1) * score_list[0][i]
        
        square_err = torch.pow(total_score * self.mask - self.user_item_matrix, 2)
        mse = torch.sum(square_err) / self.num
        return torch.sqrt(mse)
        
    def loss(self, score):
        return self.cross_entropy(score)

## Training

In [20]:
def validate(score, rate_num, user_item_matrix_test):
    sm = nn.Softmax(dim = 0)
    score = sm(score)
    score_list = torch.split(score, rate_num)
    pred = 0
    for i in range(rate_num):
        pred += (i + 1) * score_list[0][i]

    
    test_mask = user_item_matrix_test > 0
    square_err = (pred * test_mask - user_item_matrix_test) ** 2
    mse = square_err.sum() / test_mask.sum()
    test_rmse = torch.sqrt(mse)
    
    return test_rmse

In [23]:
def main(
    rate_num,
    lr,
    weight_decay,
    num_epochs,
    hidden_dim,
    side_hidden_dim,
    out_dim,
    drop_out,
    saved_model_folder,
    log_dir
):
    writer = SummaryWriter(log_dir=log_dir)
    
    post_fix = '/best_model.pt'
    
    if not os.path.exists(saved_model_folder):
        os.makedirs(saved_model_folder)  
    
    weights_name = saved_model_folder + post_fix


    net = create_model(feature_u, feature_v, feature_dim, hidden_dim, rate_num, all_M_u, all_M_v, 
                 side_hidden_dim, users, items, out_dim, drop_out)
    net.train() # in train mode

    optimizer = torch.optim.Adam(net.parameters(), lr = lr, weight_decay = weight_decay)
    loss_obj = Loss(all_M, mask, ratings_train)
    iter_bar = tqdm(range(num_epochs))

    best_val_rmse = 1000000
    
    for epoch in iter_bar:

        optimizer.zero_grad()

        score = net.forward() # Here we get the Confidence map WITHOUT softmax at the end
        loss = loss_obj.loss(score) # cross entropy 

        loss.backward()

        optimizer.step()

        with torch.no_grad():
            rmse = loss_obj.rmse(score)
            
            val_rmse = validate(score, rate_num, ratings_test)
            iter_bar.set_description('Iter (loss=%5.3f, rmse=%5.3f, val_rmse=%5.5f)'%(loss.item(),rmse.item(), val_rmse.item()))
            
            writer.add_scalar("RMSE/test", val_rmse.item(), epoch)
            writer.add_scalar("RMSE/train", rmse.item(), epoch)
            
            if (val_rmse.item() < best_val_rmse) and (epoch > 150):
                torch.save(net, weights_name)
                best_val_rmse = val_rmse.item()
    print('Best Validation RMSE: ', best_val_rmse)

In [24]:
main(rate_num=5, 
     lr = 1e-2,
     weight_decay=0.005, 
     num_epochs=500,
     hidden_dim=5, 
     side_hidden_dim=5, 
     out_dim=5, 
     drop_out=0.0,
     saved_model_folder="./../models",
     log_dir = "./../tensorboard_logs"
    )

  return sparse_tensortype(indices, values, x.size())


Keeping models on CPU.


Iter (loss=1.101, rmse=0.844, val_rmse=0.91699): 100%|███████████████████████████████| 500/500 [01:16<00:00,  6.51it/s]

Best Validation RMSE:  0.9162934422492981



