In [1]:
import time
from tqdm.notebook import trange
import random
import wget
import zipfile
import os
import warnings

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
warnings.filterwarnings("ignore")

In [2]:
ZIP_FILE = 'ml-1m.zip'
DATA_URL = 'https://files.grouplens.org/datasets/movielens/'\
            '{}'.format(ZIP_FILE)
DATA_PATH = './ml-1m/ratings.dat'
DEVICE = 'cpu'

In [3]:
if not os.path.exists(DATA_PATH):
    wget.download(DATA_URL)
    with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
        zip_ref.extractall('./')
else:
    pass

подготовка формата для pytorch

In [4]:
class UserItemRatingDataset(Dataset):
    def __init__(self, user:list, item:list, rating:list):
        super(UserItemRatingDataset, self).__init__()
        
        self.user = torch.tensor(user, dtype=torch.long)
        self.item = torch.tensor(item, dtype=torch.long)
        self.target = torch.tensor(rating, dtype=torch.long)

    def __len__(self):
        return len(self.target)
        
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.target[idx]


In [5]:
class NCFData(object):
    def __init__(self,
                 ratings,
                 num_negatives,
                 num_negatives_test,
                 batch_size:int):
        self.ratings = ratings
        self.num_negatives = num_negatives
        self.num_negatives_test = num_negatives_test
        self.batch_size = batch_size

        self.preprocess_ratings = self._reindex(self.ratings)
        self.user_pool = set(self.ratings['user_id'].unique())
        self.item_pool = set(self.ratings['item_id'].unique())

        self.train_ratings, self.test_ratings =\
            self._leave_one_out(self.preprocess_ratings)
        self.negatives =\
            self._negative_sampling(self.preprocess_ratings)
        
    def _reindex(self, ratings):
        user = list(ratings['user_id'].drop_duplicates())
        self.user2id = {w: i for i, w in enumerate(user)}

        item = list(ratings['item_id'].drop_duplicates())
        self.item2id = {w: i for i, w in enumerate(item)}

        ratings['user_id'] = ratings['user_id'].\
            apply(lambda x: self.user2id[x])
        ratings['item_id'] = ratings['item_id'].\
            apply(lambda x: self.item2id[x])
        ratings['rating'] = ratings['rating'].\
            apply(lambda x: float(x > 0))
        return ratings
        
    def _leave_one_out(self, ratings):
        ratings['rank_latest'] =\
            ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=True)
        test = ratings.loc[ratings['rank_latest'] == 1]
        train = ratings.loc[ratings['rank_latest'] > 1]
        return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]

    def _negative_sampling(self, ratings):
        interact_status = (
            ratings.groupby('user_id')['item_id']
            .apply(set)
            .reset_index()
            .rename(columns={'item_id': 'interacted_items'})
        )
        interact_status['negative_items'] = (
            interact_status['interacted_items'].\
                apply(lambda x: self.item_pool - x)
        )
        interact_status['negative_samples'] = (
            interact_status['negative_items'].\
                apply(lambda x: random.sample(x, self.num_negatives_test))
        )
        return interact_status[['user_id', 'negative_items', 'negative_samples']]
    
    def get_train_instance(self):
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, 
                                 self.negatives[['user_id', 'negative_items']],
                                 on='user_id')
        train_ratings['negatives'] = (
            train_ratings['negative_items'].\
                apply(lambda x: random.sample(x, self.num_negatives))
        )

        for row in train_ratings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.item_id))
            ratings.append(float(row.rating))
            
        dataset = UserItemRatingDataset(user=users,
                                        item=items,
                                        rating=ratings)
        return DataLoader(dataset,
                         batch_size=self.batch_size,
                         shuffle=True, num_workers=4)
        
    def get_test_instance(self):
        users, items, ratings = [], [], []
        test_ratings = pd.merge(self.test_ratings, 
                                 self.negatives[['user_id', 'negative_items']],
                                 on='user_id')

        for row in test_ratings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.item_id))
            ratings.append(float(row.rating))
            
        dataset = UserItemRatingDataset(user=users,
                                        item=items,
                                        rating=ratings)
        return DataLoader(dataset,
                            batch_size=self.batch_size,
                            shuffle=True, num_workers=4)

In [7]:
ml_1m = pd.read_csv(DATA_PATH, sep='::', engine='python',
                    names=['user_id', 'item_id', 'rating', 'timestamp'])

In [8]:
num_users = ml_1m['user_id'].unique()+1
num_items = ml_1m['item_id'].unique()+1

In [9]:
data = NCFData(ml_1m, num_negatives=4,
           num_negatives_test=100, batch_size=2048)

In [10]:
def hit(ng_item, pred_items):
    if ng_item in pred_items:
        return 1
    return 0

def ndcg(ng_item, pred_items):
    if ng_item in pred_items:
        index = pred_items.index(ng_item)
        return np.reciprocal(np.log2(index+2))
    return 0

def mrr(ng_item, pred_items):
    if ng_item in pred_items:
        index = pred_items.index(ng_item) + 1
        return 1/index
    return 0

@torch.no_grad()
def metrics(model, test_loader, top_k, device):
    _hr, _ndcg, _mrr = [], [], []

    for user, item, label in test_loader:
        user = user.to(device)
        item = item.to(device)

        predictions = model(user, item)
        predictions = predictions.view(-1)
        _, indices = torch.topk(predictions, top_k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        ng_item = item[0].item()
        _hr.append(hit(ng_item, recommends))
        _ndcg.append(ndcg(ng_item, recommends))
        _mrr.append(mrr(ng_item, recommends))
    
    return np.mean(_hr), np.mean(_ndcg), np.mean(_mrr)

### Архитектура (GMF)

In [11]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim

        self.embedding_user = nn.Embedding(
            num_embeddings=num_users, 
            embedding_dim=embedding_dim)
        self.embedding_item = nn.Embedding(
            num_embeddings=num_items,
            embedding_dim=embedding_dim
        )
        self.affine_output = nn.Linear(
            in_features=embedding_dim,
            out_features=1
        )
        self.activation = nn.Sigmoid()

        nn.init.xavier_uniform_(self.embedding_user.weight)
        nn.init.xavier_uniform_(self.embedding_item.weight)

    def forward(self, user_indices, item_indeces):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indeces)
        element_product = torch.mul(user_embedding,
                                    item_embedding)
        logits = self.affine_output(element_product)
        rating = self.activation(logits)
        return rating

In [12]:
def train_pipeline(model,
                   optimizer,
                   criterion,
                   data,
                   num_epoch):
    loss_history = []
    metrics_history = {'HR@10': [], 'NDCG@10': []}
    test_loader = data.get_test_instance()
    
    for epoch in trange(num_epoch):
        model.train()
        
        train_loader = data.get_train_instance()

        for user, item, label in train_loader:
            user = user.to(DEVICE)
            item = item.to(DEVICE)
            label = label.to(DEVICE)

            optimizer.zero_grad()
            prediction = model(user, item)
            
            loss = criterion(prediction.view(-1).to(torch.float64), 
                            label.to(torch.float64))
            loss.backward()
            optimizer.step()

            loss_history.append(loss.item())

        model.eval()
        hr_i, ndcg_i, mrr_i = metrics(model, test_loader, 10, DEVICE)
        metrics_history['HR@10'].append(hr_i)
        metrics_history['NDCG@10'].append(ndcg_i)

        print(f"[Epoch {epoch}]| Loss: {loss.item():.5f}\n"\
              f"HR@10: {hr_i:.3f}\tMRR@10: {mrr_i:.3f}\tNDCG@10: {ndcg_i:.3f} |")

    return loss_history, metrics_history

In [13]:
model = GMF(num_users=len(num_users), num_items=len(num_items), embedding_dim=32)
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [14]:
loss_history, metrics_history = train_pipeline(model, optimizer, criterion, data, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

[Epoch 0]| Loss: 0.03803
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |
[Epoch 1]| Loss: 0.00439
HR@10: 0.333	MRR@10: 0.048	NDCG@10: 0.111 |
[Epoch 2]| Loss: 0.00103
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |
[Epoch 3]| Loss: 0.00050
HR@10: 0.333	MRR@10: 0.111	NDCG@10: 0.167 |
[Epoch 4]| Loss: 0.00035
HR@10: 0.333	MRR@10: 0.037	NDCG@10: 0.100 |
[Epoch 5]| Loss: 0.00016
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |
[Epoch 6]| Loss: 0.00017
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |
[Epoch 7]| Loss: 0.00011
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |
[Epoch 8]| Loss: 0.00007
HR@10: 0.333	MRR@10: 0.033	NDCG@10: 0.096 |
[Epoch 9]| Loss: 0.00003
HR@10: 0.000	MRR@10: 0.000	NDCG@10: 0.000 |


### Архитектура MLP

In [15]:
class MLP(nn.Module):
    def __init__(self,
                 num_users,
                 num_items,
                 embedding_dim,
                 layers):
        super(MLP, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embeddings_dim = embedding_dim
        self.layers = layers

        self.embedding_user = nn.Embedding(
            num_embeddings=self.num_users,
            embedding_dim=self.embeddings_dim
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=self.num_items,
            embedding_dim=self.embeddings_dim
        )
        
        self.fc1 = nn.Linear(self.embeddings_dim * 2, layers[0])
        self.fc2 = nn.Linear(layers[0], layers[1])
        self.fc3 = nn.Linear(layers[1], layers[2])
        self.fc4 = nn.Linear(layers[2], layers[3])
        self.fc5 = nn.Linear(layers[3], layers[4])
        self.fc6 = nn.Linear(layers[4], layers[5])

        self.affine_output = nn.Linear(self.layers[-1], 1)
        self.activation_layer = nn.ReLU()
        self.activation = nn.Sigmoid()

        nn.init.xavier_uniform_(self.embedding_user.weight)
        nn.init.xavier_uniform_(self.embedding_item.weight)

    def forward(self, user_indices, item_indices):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_concat = torch.cat((user_embedding,
                                    item_embedding), -1)
        layer1 = self.activation_layer(self.fc1(element_concat))
        layer2 = self.activation_layer(self.fc2(layer1))
        layer3 = self.activation_layer(self.fc3(layer2))
        layer4 = self.activation_layer(self.fc4(layer3))
        layer5 = self.activation_layer(self.fc5(layer4))
        layer6 = self.activation_layer(self.fc6(layer5))

        logits = self.affine_output(layer6)
        rating = self.activation(logits)

        return rating
        

In [16]:
model = MLP(num_users=len(num_users),
            num_items=len(num_items),
            embedding_dim=128,
            layers=[128, 64, 32, 16, 8, 4])
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [17]:
summary(model, [(128,), (128,)], dtypes=[torch.long, torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 128, 128]            773,120
├─Embedding: 1-2                         [-1, 128, 128]            474,368
├─Linear: 1-3                            [-1, 128, 128]            32,896
├─ReLU: 1-4                              [-1, 128, 128]            --
├─Linear: 1-5                            [-1, 128, 64]             8,256
├─ReLU: 1-6                              [-1, 128, 64]             --
├─Linear: 1-7                            [-1, 128, 32]             2,080
├─ReLU: 1-8                              [-1, 128, 32]             --
├─Linear: 1-9                            [-1, 128, 16]             528
├─ReLU: 1-10                             [-1, 128, 16]             --
├─Linear: 1-11                           [-1, 128, 8]              136
├─ReLU: 1-12                             [-1, 128, 8]              --
├─Linear: 1-13                           [-1, 128, 4]          

Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 128, 128]            773,120
├─Embedding: 1-2                         [-1, 128, 128]            474,368
├─Linear: 1-3                            [-1, 128, 128]            32,896
├─ReLU: 1-4                              [-1, 128, 128]            --
├─Linear: 1-5                            [-1, 128, 64]             8,256
├─ReLU: 1-6                              [-1, 128, 64]             --
├─Linear: 1-7                            [-1, 128, 32]             2,080
├─ReLU: 1-8                              [-1, 128, 32]             --
├─Linear: 1-9                            [-1, 128, 16]             528
├─ReLU: 1-10                             [-1, 128, 16]             --
├─Linear: 1-11                           [-1, 128, 8]              136
├─ReLU: 1-12                             [-1, 128, 8]              --
├─Linear: 1-13                           [-1, 128, 4]          

In [18]:
loss_history_mlp, metrics_history_mlp =\
    train_pipeline(model, optimizer, criterion, data, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

[Epoch 0]| Loss: 0.00001
HR@10: 0.333	MRR@10: 0.167	NDCG@10: 0.210 |
[Epoch 1]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.667	NDCG@10: 0.667 |
[Epoch 2]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.114	NDCG@10: 0.240 |
[Epoch 3]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.167	NDCG@10: 0.285 |
[Epoch 4]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.144	NDCG@10: 0.263 |
[Epoch 5]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.200	NDCG@10: 0.383 |
[Epoch 6]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.089	NDCG@10: 0.216 |
[Epoch 7]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.079	NDCG@10: 0.205 |
[Epoch 8]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.089	NDCG@10: 0.216 |
[Epoch 9]| Loss: 0.00000
HR@10: 0.667	MRR@10: 0.103	NDCG@10: 0.230 |


### NeuMF

In [19]:
class NeuMF(nn.Module):
    def __init__(self,
                 num_users,
                 num_items,
                 embedding_dim,
                 layers):
        super(NeuMF, self).__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.layers = layers

        self.embedding_user_mlp = nn.Embedding(
            num_embeddings=self.num_users + 1,
            embedding_dim=self.embedding_dim
        )
        self.embedding_item_mlp = nn.Embedding(
            num_embeddings=self.num_items + 1,
            embedding_dim=self.embedding_dim
        )

        self.embedding_user_mf = nn.Embedding(
            num_embeddings=self.num_users + 1,
            embedding_dim=self.embedding_dim
        )
        self.embedding_item_mf = nn.Embedding(
            num_embeddings=self.num_items + 1,
            embedding_dim=self.embedding_dim
        )

        self.fc = nn.Sequential(
            nn.Linear(self.embedding_dim * 2, self.layers[0]),
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(self.layers[0], self.layers[1]),
            nn.Dropout(p=0.4),
            nn.ReLU(),
            nn.Linear(self.layers[1], self.layers[2]),
            nn.ReLU(),
            nn.Linear(self.layers[2], self.layers[3])
        )

        self.affine_output = nn.Linear(
            self.layers[-1] + self.embedding_dim, 1
        )
        self.activate = nn.Sigmoid()
        
        # nn.init.xavier_uniform_(self.embedding_user_mlp.weight)
        # nn.init.xavier_uniform_(self.embedding_item_mlp.weight)

        # nn.init.xavier_uniform_(self.embedding_user_mf.weight)
        # nn.init.xavier_uniform_(self.embedding_item_mf.weight)

    def forward(self, user_indices, item_indices):
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)

        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        element_product_mf = torch.mul(
            user_embedding_mf,
            item_embedding_mf
        )

        element_product_mlp = torch.cat(
            (user_embedding_mlp, item_embedding_mlp), -1
        )

        layers = self.fc(element_product_mlp)

        logits = self.affine_output(torch.cat(
            (layers, element_product_mf), -1)
        )
        rating = self.activate(logits)

        return rating
        

In [20]:
model = NeuMF(num_users=len(num_users),
              num_items=len(num_items),
              embedding_dim=128,
              layers=[32, 16, 8, 4])
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [21]:
summary(model, [(128,), (128,)], dtypes=[torch.long, torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 128, 128]            773,248
├─Embedding: 1-2                         [-1, 128, 128]            474,496
├─Embedding: 1-3                         [-1, 128, 128]            773,248
├─Embedding: 1-4                         [-1, 128, 128]            474,496
├─Sequential: 1-5                        [-1, 128, 4]              --
|    └─Linear: 2-1                       [-1, 128, 32]             8,224
|    └─Dropout: 2-2                      [-1, 128, 32]             --
|    └─ReLU: 2-3                         [-1, 128, 32]             --
|    └─Linear: 2-4                       [-1, 128, 16]             528
|    └─Dropout: 2-5                      [-1, 128, 16]             --
|    └─ReLU: 2-6                         [-1, 128, 16]             --
|    └─Linear: 2-7                       [-1, 128, 8]              136
|    └─ReLU: 2-8                         [-1, 128, 8]       

Layer (type:depth-idx)                   Output Shape              Param #
├─Embedding: 1-1                         [-1, 128, 128]            773,248
├─Embedding: 1-2                         [-1, 128, 128]            474,496
├─Embedding: 1-3                         [-1, 128, 128]            773,248
├─Embedding: 1-4                         [-1, 128, 128]            474,496
├─Sequential: 1-5                        [-1, 128, 4]              --
|    └─Linear: 2-1                       [-1, 128, 32]             8,224
|    └─Dropout: 2-2                      [-1, 128, 32]             --
|    └─ReLU: 2-3                         [-1, 128, 32]             --
|    └─Linear: 2-4                       [-1, 128, 16]             528
|    └─Dropout: 2-5                      [-1, 128, 16]             --
|    └─ReLU: 2-6                         [-1, 128, 16]             --
|    └─Linear: 2-7                       [-1, 128, 8]              136
|    └─ReLU: 2-8                         [-1, 128, 8]       

In [22]:
loss_history_neumf, metrics_history_neumf =\
    train_pipeline(model, optimizer, criterion, data, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

[Epoch 0]| Loss: 0.00017
HR@10: 0.333	MRR@10: 0.056	NDCG@10: 0.119 |
[Epoch 1]| Loss: 0.00005
HR@10: 0.667	MRR@10: 0.095	NDCG@10: 0.222 |
[Epoch 2]| Loss: 0.00001
HR@10: 1.000	MRR@10: 0.145	NDCG@10: 0.334 |
[Epoch 3]| Loss: 0.00001
HR@10: 1.000	MRR@10: 0.125	NDCG@10: 0.315 |
[Epoch 4]| Loss: 0.00002
HR@10: 1.000	MRR@10: 0.194	NDCG@10: 0.377 |
[Epoch 5]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.131	NDCG@10: 0.321 |
[Epoch 6]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.120	NDCG@10: 0.311 |
[Epoch 7]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.125	NDCG@10: 0.315 |
[Epoch 8]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.125	NDCG@10: 0.315 |
[Epoch 9]| Loss: 0.00000
HR@10: 1.000	MRR@10: 0.125	NDCG@10: 0.315 |


------