In [1]:
!python -m pip install ml_metrics

Collecting ml_metrics
  Downloading ml_metrics-0.1.4.tar.gz (5.0 kB)
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25ldone
[?25h  Created wheel for ml-metrics: filename=ml_metrics-0.1.4-py3-none-any.whl size=7844 sha256=64666abeb3d0e3c053380dc33e8580a115a4fe6239401464b56bce42a3e678cd
  Stored in directory: /home/shoney/.cache/pip/wheels/c6/b2/69/0691b3d4de7c8d0c604cd7de94f4b2d4478a04ce3b6e4bab15
Successfully built ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4
You should consider upgrading via the '/home/shoney/devnlp/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import torch
import torch.nn.functional as F

from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import heapq
import math

import scipy.sparse as sp
import numpy as np
import pandas as pd

import ml_metrics as metrics

https://grouplens.org/datasets/movielens/

In [3]:
movie_data = pd.read_csv('datasets/movies/ml-latest-small/ratings.csv', 
                         sep=',', header=0)

movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movie_data.shape

(100836, 4)

In [5]:
NUM_USERS = movie_data['userId'].max()

NUM_USERS

610

In [6]:
NUM_ITEMS = movie_data['movieId'].max()

NUM_ITEMS

193609

In [7]:
TEST_USER_IDS = [5, 24, 45, 67, 89, 123, 239, 345, 456, 598]

In [8]:
test_movie_users = movie_data[movie_data['userId'].isin(TEST_USER_IDS)]

test_movie_users.head()

Unnamed: 0,userId,movieId,rating,timestamp
516,5,1,4.0,847434962
517,5,21,4.0,847435238
518,5,34,4.0,847434881
519,5,36,4.0,847435292
520,5,39,3.0,847434961


In [9]:
def load_ratings_matrix(movie_data):

    ratings_matrix = sp.dok_matrix((NUM_USERS + 1, NUM_ITEMS + 1), dtype=np.float32)

    for index, row in movie_data.iterrows():
        user, item, rating = int(row['userId']), int(row['movieId']), float(row['rating'])
        
        ratings_matrix[user, item] = rating
    
    random_user = np.random.randint(1, NUM_USERS)
    
    return ratings_matrix

In [10]:
ratings_matrix = load_ratings_matrix(movie_data)

ratings_matrix.shape

(611, 193610)

In [11]:
class RecommenderNN(nn.Module):

    def __init__(self, n_users, n_items, layers=[24, 16], dropout=0.2):

        super().__init__()

        assert (layers[0] % 2 == 0), "layers[0] must be an even number"

        self.dropout = dropout

        embedding_dim = int(layers[0] / 2)

        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)

        self.fc_layers = torch.nn.ModuleList()

        for _, (in_size, out_size) in enumerate(zip(layers[:-1], layers[1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        # Output of the last layer is just 1 for predicting ratings values
        self.output_layer = torch.nn.Linear(layers[-1], 1)

    def forward(self, users, items):
        user_embedding = self.user_embedding(users)
        item_embedding = self.item_embedding(items)

        # Concatenate user and item embeddings, this is the input to the NN
        x = torch.cat([user_embedding, item_embedding], 1)
        
        for idx, _ in enumerate(range(len(self.fc_layers))):
            x = self.fc_layers[idx](x)
            x = F.relu(x)
            x = F.dropout(x,  p=self.dropout, training=self.training)
        
        rating = self.output_layer(x)

        return rating

    def predict(self, users, items):
        output_scores = self.forward(users, items)

        return output_scores.cpu().detach().numpy()

In [12]:
def generate_training_instances(ratings_matrix):
    
    user_item_ratings = {}
    index = 0
    
    for user, item in ratings_matrix.keys():
        
        user_item_ratings[index] = (user, item, ratings_matrix[user, item])
        index += 1
    
    return user_item_ratings

In [13]:
train_user_item_ratings = generate_training_instances(ratings_matrix)

len(train_user_item_ratings)

100836

In [14]:
train_user_item_ratings[0], train_user_item_ratings[3]

((1, 1, 4.0), (1, 47, 5.0))

In [15]:
def train(model, train_data_loader, criterion, optimizer, epoch):
    
    model.train()
    
    epoch_loss = []
    
    for users_items_rating in train_data_loader:
        
        users, items, ratings = users_items_rating
        
        predictions = model(users, items)
        
        # Convert to float and change dim from [batch_size] to [batch_size, 1]
        ratings = ratings.float().view(predictions.size())
        
        loss = criterion(predictions, ratings)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        
        epoch_loss.append(loss.item())
        
    epoch_loss = np.mean(epoch_loss)

    print("Epoch completed", epoch)
    
    print("Train Loss: {%.4f}" % (epoch_loss))

In [16]:
def load_zero_rated(ratings_matrix, user_id, user_item_ratings): 
    
    for i in range(100):
        potential_zero_item = np.random.randint(1, NUM_ITEMS)
        
        while (user_id, potential_zero_item) in ratings_matrix:
            potential_zero_item = np.random.randint(1, NUM_ITEMS)
            
        user_item_ratings['users'] = \
            np.append(user_item_ratings['users'], np.array([user_id]))
        
        user_item_ratings['items'] = \
            np.append(user_item_ratings['items'], np.array([potential_zero_item]))
        
        user_item_ratings['ratings'] = \
            np.append(user_item_ratings['ratings'], np.array([0]))

In [17]:
def generate_test_instances(ratings_matrix, test_movie_users):

    test_list = []

    for user_id in TEST_USER_IDS:
        
        user_item_ratings_df = test_movie_users[test_movie_users['userId'] == user_id]
        user_item_ratings_df = user_item_ratings_df[user_item_ratings_df['rating'] >= 4]
        
        user_item_ratings = {}
        
        user_item_ratings['users'] = user_item_ratings_df['userId'].values[5:15]
        user_item_ratings['items'] = user_item_ratings_df['movieId'].values[5:15]
        user_item_ratings['ratings'] = user_item_ratings_df['rating'].values[5:15]
        
        load_zero_rated(ratings_matrix, user_id, user_item_ratings)
        
        test_list.append(user_item_ratings)
        
    return test_list

In [18]:
test_list = generate_test_instances(ratings_matrix, test_movie_users)

len(test_list)

10

In [19]:
test_list[0]

{'users': array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]),
 'items': array([    58,    110,    232,    247,    261,    290,    296,    367,
           457,    474,  41345, 182631,  18197, 161233,   1394,  28011,
        153917, 106034,  66187,  75510,  49438, 113499, 147983,  87947,
          1317,  74295,  35966, 161110,  32435, 173278,  79787,  10751,
        193599,  29238,  75645,  76407,  63576,  24895, 101984,  16391,
        142290, 173458,   9300,  35324, 107225, 140507,  37249,   3218,
        160625, 165951, 179351, 119584,  88598,  53641, 134842,  34138,
         95809,   8508, 182036,  93082,    682,  78318, 147539,  74603,
        103683,   5886,  91604,   

In [20]:
def evaluate(model, test_list):

    model.eval()
    
    apks = []

    for user_item_ratings in test_list:
        
        users = torch.tensor(user_item_ratings['users'])
        items = torch.tensor(user_item_ratings['items'])
        ratings = user_item_ratings['ratings']

        predictions = model.predict(users, items)
        
        item_score_map = {}
        
        for i, item in enumerate(user_item_ratings['items']):
            item_score_map[item] = predictions[i]
        
        rank_list = heapq.nlargest(100, item_score_map, key=item_score_map.get)
        
        items_list = items.detach().numpy().tolist()
        rank_list = list(rank_list)

        apk = metrics.apk(items_list[:10], rank_list[:10])
        
        apks.append(apk)
        
    
    print("Evaluation mean APK : {%.4f}" % np.mean(apks))

In [21]:
model = RecommenderNN(NUM_USERS + 1, NUM_ITEMS + 1, [32, 16, 8], dropout=0.2)

criterion = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)

num_epochs = 15

In [22]:
train_data_loader = DataLoader(
    train_user_item_ratings, batch_size=100, shuffle=True, num_workers=0)

In [23]:
for epoch in range(1, num_epochs + 1):
    
    train(model, train_data_loader, criterion, optimizer, epoch)
    evaluate(model, test_list)

Epoch completed 1
Train Loss: {3.8352}
Evaluation mean APK : {0.3372}
Epoch completed 2
Train Loss: {1.9782}
Evaluation mean APK : {0.5797}
Epoch completed 3
Train Loss: {1.6262}
Evaluation mean APK : {0.6027}
Epoch completed 4
Train Loss: {1.3538}
Evaluation mean APK : {0.6400}
Epoch completed 5
Train Loss: {1.1500}
Evaluation mean APK : {0.6572}
Epoch completed 6
Train Loss: {1.0243}
Evaluation mean APK : {0.6789}
Epoch completed 7
Train Loss: {0.9205}
Evaluation mean APK : {0.6739}
Epoch completed 8
Train Loss: {0.8565}
Evaluation mean APK : {0.6737}
Epoch completed 9
Train Loss: {0.8117}
Evaluation mean APK : {0.6656}
Epoch completed 10
Train Loss: {0.7896}
Evaluation mean APK : {0.6895}
Epoch completed 11
Train Loss: {0.7712}
Evaluation mean APK : {0.6838}
Epoch completed 12
Train Loss: {0.7581}
Evaluation mean APK : {0.6748}
Epoch completed 13
Train Loss: {0.7449}
Evaluation mean APK : {0.6870}
Epoch completed 14
Train Loss: {0.7389}
Evaluation mean APK : {0.6726}
Epoch completed