In [33]:
# Import PyTorch Packages
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
import os

# Import PyTorch Ignite
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss
from ignite.metrics import MeanSquaredError

from itertools import product

In [34]:
class MF(nn.Module):
    
    def __init__(self, num_users, num_items, k, c_vector,  tag_relevance_matrix=None):
        '''
        '''
        super(MF, self).__init__()
        
        self.num_users = num_users
        self.num_items = num_items
        self.tag_relevance_matrix = tag_relevance_matrix
        self.k = k
        self.gamma = self.num_users / self.num_items
        self.c_vector = c_vector
        
        #initialize U and I matrices 
        self.user_embedding = nn.Embedding(num_users, k)
        self.item_embedding = nn.Embedding(num_items, k)
        
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        self.user_embedding.weight.data.uniform_(0, 0.05)
        self.item_embedding.weight.data.uniform_(0, 0.05)
        self.user_bias.weight.data.uniform_(0.01)
        self.item_bias.weight.data.uniform_(0.01)
        
    def __call__(self, train_x):
        users = train_x[:, 0]
        items = train_x[:, 1]
        
        self.users = users
        self.items = items
        
        u = self.user_embedding(torch.LongTensor(users))
        v = self.item_embedding(torch.LongTensor(items))
        
        #if self.tag_relevance_matrix is not None:
        #    self.sim_mat = self.compute_cosine_similarity(torch.index_select(self.tag_relevance_matrix, 0, torch.LongTensor(items)))
        
        b_u = self.user_bias(torch.LongTensor(users)).squeeze()
        b_v = self.item_bias(torch.LongTensor(items)).squeeze()
        r_predicted = (u*v).sum(1) + b_u + b_v
        
        return r_predicted
    
    
    def loss(self, r_hat, r):
        loss_mse = F.mse_loss(r_hat, r)
        
        reg_term_users = self.l2_regularize(self.user_embedding.weight) * self.c_vector
        reg_term_items = self.l2_regularize(self.item_embedding.weight) * self.c_vector * self.gamma
        reg_term_user_bias = self.l2_regularize(self.user_embedding.weight) * self.c_vector
        reg_term_item_bias = self.l2_regularize(self.item_embedding.weight) * self.c_vector * self.gamma
        #tag_based_penalty = self.compute_tag_based_penalty(self.item_embedding, self.items, self.sim_mat)
        
        total_loss = loss_mse + reg_term_users + reg_term_items + reg_term_user_bias + reg_term_item_bias #+ tag_based_penalty
        return total_loss
    
    
    def l2_regularize(self, v):
        #loss = torch.sum(v ** 2.0)
        loss = torch.linalg.norm(v)

        return loss
    
    def compute_tag_based_penalty(self, feature_vector, index_vector, sim_mat):
        feature_vector = feature_vector(torch.LongTensor(index_vector))
        res = 0 
        for i in range(0,len(index_vector)):
            v_i = feature_vector[i]
            for j in range(0,len(index_vector)):
                v_j = feature_vector[j]
                temp = v_i - v_j
                res += torch.sum(temp**2) * sim_mat[i,j]
                
        return res
        
    def compute_cosine_similarity(self, m):
        m_norm = m / m.norm(dim=1)[:, None]
        res = torch.mm(m_norm, m_norm.transpose(0,1))
        return res.numpy()

In [3]:
# Import Libraries
import torch
from sklearn.utils import shuffle


# Initialize a Loader class
class Loader():
    # Set the iterator
    current = 0

    def __init__(self, x, y, batchsize=1024, do_shuffle=True):
        """
        :param x: features
        :param y: target
        :param batchsize: batch size = 1024
        :param do_shuffle: shuffle mode turned on
        """
        self.shuffle = shuffle
        self.x = x
        self.y = y
        self.batchsize = batchsize
        self.batches = range(0, len(self.y), batchsize)
        if do_shuffle:
            # Every epoch re-shuffle the dataset
            self.x, self.y = shuffle(self.x, self.y)

    def __iter__(self):
        # Reset & return a new iterator
        self.x, self.y = shuffle(self.x, self.y, random_state=0)
        self.current = 0
        return self

    def __len__(self):
        # Return the number of batches
        return int(len(self.x) / self.batchsize)

    def __next__(self):
        # Update iterator and stop iteration until the batch size is out of range
        n = self.batchsize
        if self.current + n >= len(self.y):
            raise StopIteration
        i = self.current

        # Transform NumPy arrays to PyTorch tensors
        xs = torch.from_numpy(self.x[i:i + n])
        ys = torch.from_numpy(self.y[i:i + n])
        self.current += n
        return xs, ys


In [4]:
def log_training_loss(engine, log_interval=500):
    """
    Function to log the training loss
    """
    model.itr = engine.state.iteration  # Keep track of iterations
    if model.itr % log_interval == 0:
        fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
        # Keep track of epochs and outputs
        msg = fmt.format(engine.state.epoch, engine.state.iteration, len(train_loader), engine.state.output)
        print(msg)


def log_validation_results(engine):
    """
    Function to log the validation loss
    """
    # When triggered, run the validation set
    evaluator.run(test_loader)
    # Keep track of the evaluation metrics
    avg_loss = evaluator.state.metrics['evaluation']
    print("Epoch[{}] Validation MSE: {:.2f} ".format(engine.state.epoch, avg_loss))



In [3]:
path = 'ml-20m'
#genome_scores = pd.read_csv(os.path.join(path,'genome-scores.csv'))
#genome_tags = pd.read_csv(os.path.join(path,'genome-tags.csv'))
#tags = pd.read_csv(os.path.join(path,'tags.csv'))
movies_full = pd.read_csv(os.path.join(path,'movies.csv'))
ratings_full = pd.read_csv(os.path.join(path,'ratings.csv'))



In [4]:
#Pre-process

#remove on deployment
ratings = ratings_full.head(10000000)

#remove movies without ratings
movies = movies_full[movies_full['movieId'].isin(ratings['movieId'].unique())]

#remap ids to continuous integers
user_ids = np.sort(np.unique(ratings['userId']))
userid2idx = {o:i for i,o in enumerate(user_ids) }

movie_ids = np.sort(np.unique(ratings['movieId']))
movieid2idx = {o:i for i,o in enumerate(movie_ids) }

ratings['userId'] = ratings['userId'].apply(lambda x : userid2idx[x])
ratings['movieId'] = ratings['movieId'].apply(lambda x : movieid2idx[x])
movies['movieId'] = movies['movieId'].apply(lambda x : movieid2idx[x])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['userId'] = ratings['userId'].apply(lambda x : userid2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['movieId'] = ratings['movieId'].apply(lambda x : movieid2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['movieId'] = movies['movieId'].apply(lambda x : movieid

In [8]:
#movie_tag_pair = list(product(movieid2idx.values(), tagid2idx.values()))
#movie_tag_pair_df = pd.DataFrame(movie_tag_pair, columns =['movieId', 'tagId'])
#m = movie_tag_pair_df.merge(genome_scores, on=['movieId','tagId'], how='left')
#movie_tag_relevance = m.pivot_table(index='movieId', columns=['tagId'], values='relevance', dropna=False, fill_value=0)
#movie_tag_relevance = torch.from_numpy(movie_tag_relevance.values)

In [56]:
# Set a random seed to make our numbers predictable
np.random.seed(42)

ratings['is_train'] = np.random.random(len(ratings)) < 0.95
training_data = ratings[ratings['is_train']]
test_data = ratings[~ratings['is_train']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['is_train'] = np.random.random(len(ratings)) < 0.95


In [None]:
%%time

lr = 1e-2 
k = 3 
c_vector = 1e-6  

# Instantiate the MF class object
model = MF(len(user_ids), len(movie_ids), tag_relevance_matrix=None, k=k, c_vector=c_vector)

# Use Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# Create a supervised trainer
trainer = create_supervised_trainer(model, optimizer, model.loss)

# Use Mean Squared Error as evaluation metric
metrics = {'evaluation': MeanSquaredError()}

# Create a supervised evaluator
evaluator = create_supervised_evaluator(model, metrics=metrics)

# Load the train and test data

train_x = training_data[['userId', 'movieId']].values
train_y = training_data['rating'].values.astype(np.float32)
test_x = test_data[['userId', 'movieId']].values
test_y = test_data['rating'].values.astype(np.float32)
train_loader = Loader(train_x, train_y, batchsize=1024)
test_loader = Loader(test_x, test_y, batchsize=1024)

trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=log_training_loss)
trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=log_validation_results)

# Run the model for 50 epochs
trainer.run(train_loader, max_epochs=50)


# When triggered, run the validation set
evaluator.run(test_loader)
# Keep track of the evaluation metrics
avg_loss = evaluator.state.metrics['evaluation']
avg_loss






In [10]:
u = model.user_embedding 
v = model.item_embedding 
b_u = model.user_bias
b_v = model.item_bias
r_hat = torch.mm(u.weight, v.weight.T)
r_pred = r_hat + b_u.weight + b_v.weight.squeeze()

In [32]:
#check recommendations
user = 0
user_ratings = ratings[ratings['userId']==user].sort_values(by=['rating'], ascending=False)
user_favorites = user_ratings.head(5)
user_favorites.merge(movies, on=['movieId'])
user_rating = r_pred[user]
ratings_np = user_rating.detach().numpy()
top5_recos = np.flip(np.argsort(ratings_np))[:5]
top5_recos

not_watched = movies[~movies['movieId'].isin(user_ratings['movieId'])]
not_watched[not_watched['movieId'].isin(top5_recos)]

Unnamed: 0,movieId,title,genres
13671,13397,"Days Between, The (In den Tag hinein) (2001)",Drama
18343,17477,"New Life, A (La vie nouvelle) (2002)",Drama
19497,18396,Pirates of the Great Salt Lake (2006),Adventure|Comedy
26118,22762,Always for Pleasure (1978),(no genres listed)
26293,22809,Marihuana (1936),Documentary|Drama


In [36]:
user_favorites.merge(movies, on=['movieId'])

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,0,7853,5.0,1094786027,Freaks (1932),Crime|Drama|Horror
1,0,4895,5.0,1112484682,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
2,0,5851,5.0,1112484619,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
3,0,7037,5.0,1112484633,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4,0,1171,4.5,1112484742,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi


In [None]:
# Save the model to a separate folder
torch.save(model.state_dict(), 'models/mf_k3.pth')
