In [59]:
import itertools
import os
import random
import time
import urllib.request
import warnings
import zipfile

import numpy as np
# import pandas as pd
import modin.pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import trange, tqdm
# from torchsummary import summary
warnings.filterwarnings("ignore")

In [60]:
# Name of the MovieLens archive that is downloaded when the ratings file is missing.
ZIP_FILE = 'ml-10m.zip'
# Download URL: the GroupLens dataset directory followed by the archive name.
DATA_URL = 'https://files.grouplens.org/datasets/movielens/'\
            '{}'.format(ZIP_FILE)
# NOTE(review): this path names a Kaggle MovieLens-1M input while ZIP_FILE names
# the 10M archive — confirm which dataset is intended.
DATA_PATH = '/kaggle/input/movielens-1m-dataset/ratings.dat'
# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [None]:
# Fetch and unpack the MovieLens archive only when the ratings file is absent.
# The standard-library urllib.request.urlretrieve replaces `wget.download`,
# which was never imported and raised NameError on a fresh kernel.
if not os.path.exists(DATA_PATH):
    urllib.request.urlretrieve(DATA_URL, ZIP_FILE)
    with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
        zip_ref.extractall('./')

----

подготовка формата для pytorch

In [61]:
class UserItemRatingDataset(Dataset):
    """Map-style dataset of (user, item, rating) triples.

    Args:
        user: user indices; stored as a ``torch.long`` tensor.
        item: item indices; stored as a ``torch.long`` tensor.
        rating: target values; stored as a ``torch.float`` tensor.
    """

    def __init__(self, user:list, item:list, rating:list):
        super().__init__()

        # Convert once up front so __getitem__ is a plain tensor lookup.
        self.user, self.item = (
            torch.tensor(column, dtype=torch.long) for column in (user, item)
        )
        self.target = torch.tensor(rating, dtype=torch.float)

    def __len__(self):
        """Return the number of stored targets."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return the ``(user, item, target)`` entries at position ``idx``."""
        columns = (self.user, self.item, self.target)
        return tuple(column[idx] for column in columns)


In [62]:
class NCFData(object):
    """Build implicit-feedback train/test loaders from a ratings table.

    The ratings frame must provide ``user_id``, ``item_id``, ``rating`` and
    ``timestamp`` columns. Ids are re-mapped to consecutive integers, every
    rating > 0 becomes the label 1.0, the latest interactions of each user are
    held out for testing and unseen items are sampled as negatives.

    Args:
        ratings: ratings table. It is copied, so the caller's frame is left
            untouched (the original implementation reindexed it in place).
        num_negatives: fallback number of training negatives per user, used
            when a user has interacted with at least half of the item pool.
        num_negatives_test: number of sampled negatives per user at test time.
        batch_size: batch size of the training loader.
    """

    def __init__(self, ratings, num_negatives, num_negatives_test, batch_size: int):
        # _reindex and _leave_p_out write columns in place; working on a copy
        # keeps the caller's frame usable for later cells (e.g. merges on the
        # raw user ids).
        self.ratings = ratings.copy()
        self.num_negatives = num_negatives
        self.num_negatives_test = num_negatives_test
        self.batch_size = batch_size

        self.preprocess_ratings = self._reindex(self.ratings)
        # Pools are read after reindexing, so they hold the new 0-based ids.
        self.user_pool = set(self.ratings["user_id"].unique())
        self.item_pool = set(self.ratings["item_id"].unique())

        self.train_ratings, self.test_ratings = self._leave_p_out(
            self.preprocess_ratings
        )
        self.negatives = self._negative_sampling(self.preprocess_ratings)

    def _reindex(self, ratings):
        """Map user/item ids to 0-based indices (order of first appearance)
        and binarise ratings; modifies and returns ``ratings``."""
        self.user2id = {w: i for i, w in enumerate(ratings["user_id"].drop_duplicates())}
        self.item2id = {w: i for i, w in enumerate(ratings["item_id"].drop_duplicates())}

        ratings["user_id"] = ratings["user_id"].map(self.user2id)
        ratings["item_id"] = ratings["item_id"].map(self.item2id)
        ratings["rating"] = (ratings["rating"] > 0).astype(float)

        return ratings

    def _leave_one_out(self, ratings):
        """Hold out the single latest interaction of every user."""
        return self._leave_p_out(ratings, p=1)

    def _leave_p_out(self, ratings, p=10):
        """Hold out the ``p`` latest interactions of every user.

        Returns:
            ``(train, test)`` frames with ``user_id``, ``item_id``, ``rating``.
        """
        # rank 1 == most recent interaction; method="first" breaks timestamp ties.
        ratings["rank_latest"] = ratings.groupby(["user_id"])["timestamp"].rank(
            method="first", ascending=False
        )
        columns = ["user_id", "item_id", "rating"]
        test = ratings.loc[ratings["rank_latest"] < p + 1]
        train = ratings.loc[ratings["rank_latest"] > p]
        return train[columns], test[columns]

    def _sample_negatives(self, interacted, k, clamp=False):
        """Draw ``k`` items the user never interacted with.

        ``random.sample`` no longer accepts a set (deprecated in Python 3.9,
        TypeError from 3.11), so the candidates are materialised as a tuple.
        With ``clamp=True`` the request is limited to the number of candidates
        instead of raising ValueError.
        """
        candidates = tuple(self.item_pool - interacted)
        if clamp:
            k = min(k, len(candidates))
        return random.sample(candidates, k)

    def _negative_sampling(self, ratings):
        """Collect each user's interacted items and sample the test negatives.

        Raises:
            ValueError: if a user has fewer than ``num_negatives_test`` unseen items.
        """
        interact_status = (
            ratings.groupby('user_id')['item_id']
            .apply(set)
            .reset_index()
            .rename(columns={'item_id': 'interacted_items'})
        )

        interact_status["negative_samples"] = [
            self._sample_negatives(interacted, self.num_negatives_test)
            for interacted in interact_status["interacted_items"]
        ]

        return interact_status[['user_id', 'negative_samples', 'interacted_items']]

    def get_train_instance(self):
        """Return a shuffled training DataLoader with freshly sampled negatives.

        Each user gets as many negatives as interacted items while that many
        unseen items exist, otherwise ``num_negatives`` (clamped to what is left).
        """
        sampled = []
        for interacted in self.negatives['interacted_items']:
            n_seen = len(interacted)
            k = n_seen if len(self.item_pool) - 2 * n_seen > 0 else self.num_negatives
            sampled.append(self._sample_negatives(interacted, k, clamp=True))

        train_ratings_negatives = pd.DataFrame()
        train_ratings_negatives['user_id'] = self.negatives['user_id']
        train_ratings_negatives["negatives"] = sampled
        # explode() yields NaN for an empty list (user has seen every item);
        # drop those rows so the integer cast below cannot fail.
        train_ratings_negatives = (
            train_ratings_negatives.explode("negatives").dropna(subset=["negatives"])
        )

        users = np.concatenate([
            self.train_ratings["user_id"].to_numpy(),
            train_ratings_negatives["user_id"].to_numpy(),
        ]).astype(np.int64)
        items = np.concatenate([
            self.train_ratings["item_id"].to_numpy(),
            train_ratings_negatives["negatives"].to_numpy(),
        ]).astype(np.int64)
        ratings = np.concatenate([
            self.train_ratings["rating"].to_numpy(),
            np.zeros(train_ratings_negatives.shape[0]),
        ]).astype(np.int64)

        assert len(users) == len(items) and len(items) == len(ratings)

        dataset = UserItemRatingDataset(user=users, item=items, rating=ratings)
        return DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def get_test_instance(self, p=10):
        """Return the evaluation DataLoader.

        For every user (ascending id) the held-out rows come first, followed by
        that user's sampled negatives. The batch size is
        ``num_negatives_test + p``.
        NOTE(review): one batch equals one user only when every user has
        exactly ``p`` held-out rows — confirm for the dataset in use.
        """
        users, items, ratings = [], [], []
        negatives_by_user = dict(
            zip(self.negatives["user_id"], self.negatives["negative_samples"])
        )

        # A stable sort plus one pass replaces the per-user scan of the whole
        # frame while keeping the original row order inside each user.
        ordered = self.test_ratings.sort_values("user_id", kind="stable")
        grouped = itertools.groupby(
            ordered.itertuples(index=False), key=lambda row: row.user_id
        )
        n_users = self.test_ratings["user_id"].nunique()

        for user, rows in tqdm(grouped, total=n_users):
            for row in rows:
                users.append(int(row.user_id))
                items.append(int(row.item_id))
                ratings.append(float(row.rating))
            for item_negative in negatives_by_user[user]:
                users.append(int(user))
                items.append(int(item_negative))
                ratings.append(float(0))

        dataset = UserItemRatingDataset(user=users, item=items, rating=ratings)
        return DataLoader(
            dataset, batch_size=self.num_negatives_test+p, shuffle=False, num_workers=4
        )

In [63]:
def data2excel(name_excel:str, loss:list, metrics:dict):
    """Collect the per-epoch history in a DataFrame and save it as ``<name_excel>.xlsx``.

    Args:
        name_excel: output file name without the ``.xlsx`` extension.
        loss: training loss per epoch.
        metrics: per-epoch lists keyed by ``Test_loss``, ``HR@10``,
            ``Precision@10``, ``Recall@10``, ``MAP@10``, ``NDCG@10``, ``MRR@10``.

    Returns:
        The DataFrame that was written.
    """
    columns = {
        "eposh": list(range(1, len(loss) + 1)),  # 1-based epoch counter
        "loss": loss,
        "test_loss": metrics["Test_loss"],
    }
    # Remaining metrics keep their own key as the column name, in this order.
    for metric_name in ("HR@10", "Precision@10", "Recall@10",
                        "MAP@10", "NDCG@10", "MRR@10"):
        columns[metric_name] = metrics[metric_name]

    df = pd.DataFrame(columns)
    df.to_excel(f"{name_excel}.xlsx")

    return df

In [64]:
# Load the ratings table from a local parquet file. The commented-out read_csv
# call is the alternative loader for the '::'-separated ratings file at DATA_PATH.
# ml_1m = pd.read_csv(DATA_PATH, sep='::', engine='python',
#                     names=['user_id', 'item_id', 'rating', 'timestamp'])
ml_10m = pd.read_parquet('./ratings_10m.parquet')

In [65]:
ml_10m.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [66]:
ml_10m[["user_id", "item_id"]].groupby(['user_id']).agg('count').mean()

item_id    143.10733
dtype: float64

In [67]:
# Distinct user / item counts; passed to the model constructors to size the
# embedding tables.
num_users = ml_10m['user_id'].nunique()
num_items = ml_10m['item_id'].nunique()

In [68]:
# Build the split and the sampled negatives: 100 test negatives per user and
# training batches of 2048. num_negatives=4 is only the fallback count used by
# NCFData.get_train_instance for users who interacted with half the item pool.
data = NCFData(ml_10m, num_negatives=4,
           num_negatives_test=100, batch_size=2048)

2023-05-23 19:04:01,636 - distributed.core - ERROR - Exception while handling op scatter
Traceback (most recent call last):
  File "/home/laptopml/anaconda3/lib/python3.10/site-packages/distributed/core.py", line 769, in _handle_comm
    result = await result
  File "/home/laptopml/anaconda3/lib/python3.10/site-packages/distributed/scheduler.py", line 5023, in scatter
    await self.replicate(keys=keys, workers=workers, n=n)
  File "/home/laptopml/anaconda3/lib/python3.10/site-packages/distributed/scheduler.py", line 5781, in replicate
    for ws in random.sample(tuple(workers - ts.who_has), count):
  File "/home/laptopml/anaconda3/lib/python3.10/random.py", line 482, in sample
    raise ValueError("Sample larger than population or is negative")
ValueError: Sample larger than population or is negative
2023-05-23 19:04:03,342 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-76v9nk42', purging
Exception ignored in: <function _MultiProces

ValueError: Sample larger than population or is negative

In [8]:
def hit(y_pred, y_true):
    """Return 1 when ``y_true`` is among the recommended items, otherwise 0."""
    return 1 if y_true in set(y_pred) else 0

def precision(y_pred, y_true, p=10):
    """Precision for a single relevant item: 1/p on a hit, 0.0 otherwise.

    Args:
        y_pred: recommended item ids.
        y_true: the one relevant item id.
        p: divisor, i.e. the length of the recommendation list.
    """
    matched = 1 if y_true in set(y_pred) else 0
    return matched / p

def recall(y_pred, y_true):
    """Recall for a single relevant item: 1.0 when it was recommended, else 0.0."""
    found = y_true in set(y_pred)
    return 1.0 if found else 0.0

def mrr(y_pred, y_true):
    """Reciprocal rank of the first occurrence of ``y_true`` (0 when absent)."""
    for position, candidate in enumerate(y_pred, start=1):
        if candidate == y_true:
            return 1 / position
    return 0

def map_k(y_pred, y_true):
    """Average precision of ``y_pred`` with ``y_true`` as the only relevant item.

    Precision is recorded at every position holding ``y_true``; the mean of
    those values is returned, or 0 when the item never appears.
    """
    hits = 0
    precisions = []

    for rank, candidate in enumerate(y_pred, start=1):
        if candidate == y_true:
            hits += 1
            precisions.append(hits / rank)

    return np.mean(precisions) if precisions else 0

def ndcg(y_pred, y_true):
    """NDCG for a single relevant item: ``1 / log2(rank + 1)``, 0 when absent."""
    if y_true not in y_pred:
        return 0
    rank = y_pred.index(y_true) + 1  # 1-based position of the first occurrence
    return np.reciprocal(np.log2(rank + 1))

@torch.no_grad()
def metrics(model, test_loader, criterion, top_k, device):
    """Evaluate ``model`` on ``test_loader``, one ranking per batch.

    For each batch the ``top_k`` highest-scored items form the recommendation
    list, which is compared against the first item of the batch.
    NOTE(review): only ``item[0]`` counts as relevant; if the loader puts
    several held-out positives into a batch, the others compete as negatives —
    confirm this is the intended protocol.

    Args:
        model: callable as ``model(user, item)`` returning scores.
        test_loader: iterable of ``(user, item, label)`` tensor batches.
        criterion: loss applied to float64 predictions and labels.
        top_k: length of the recommendation list.
        device: device the batches are moved to.

    Returns:
        Means over batches, in this order: test loss, HR, precision, recall,
        MRR, MAP, NDCG.
    """
    _hr, _precision, _recall, _mrr, _map, _ndcg = [], [], [], [], [], []

    test_loss = []
    for user, item, label in test_loader:
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)

        predictions = model(user, item).view(-1)
        loss = criterion(predictions.to(torch.float64),
                         label.to(torch.float64))

        # A short (final) batch may hold fewer than top_k rows; torch.topk
        # raises when k exceeds the number of elements.
        k = min(top_k, predictions.numel())
        _, indices = torch.topk(predictions, k)
        recommends = item[indices].cpu().tolist()

        y_true = item[0].item()
        _hr.append(hit(recommends, y_true))
        # Pass top_k explicitly: precision() otherwise always divides by its
        # default of 10, whatever list length was requested.
        _precision.append(precision(recommends, y_true, p=top_k))
        _recall.append(recall(recommends, y_true))
        _mrr.append(mrr(recommends, y_true))
        _map.append(map_k(recommends, y_true))
        _ndcg.append(ndcg(recommends, y_true))
        test_loss.append(loss.item())

    return np.mean(test_loss), np.mean(_hr), np.mean(_precision), np.mean(_recall), np.mean(_mrr), np.mean(_map), np.mean(_ndcg)

### Архитектура (GMF)

In [25]:
class GMF(nn.Module):
    """Generalized matrix factorization scorer.

    The element-wise product of a user embedding and an item embedding is fed
    through one linear unit and a sigmoid.

    Args:
        num_users: number of users; the user table gets ``num_users + 1`` rows.
        num_items: number of items; the item table gets ``num_items + 1`` rows.
        embedding_dim: width of both embedding tables.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        self.num_users, self.num_items = num_users + 1, num_items + 1
        self.embedding_dim = embedding_dim

        self.embedding_user = nn.Embedding(self.num_users, embedding_dim)
        self.embedding_item = nn.Embedding(self.num_items, embedding_dim)
        self.affine_output = nn.Linear(embedding_dim, 1)
        self.activation = nn.Sigmoid()

        # Xavier-uniform initialisation of both tables, user table first.
        for table in (self.embedding_user, self.embedding_item):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user_indices, item_indeces):
        """Return sigmoid scores with a trailing dimension of size 1."""
        interaction = (
            self.embedding_user(user_indices) * self.embedding_item(item_indeces)
        )
        return self.activation(self.affine_output(interaction))

In [26]:
def train_pipeline(model, optimizer, criterion, data, num_epoch):
    """Train ``model`` for ``num_epoch`` epochs, evaluating after each one.

    The test loader is built once; the training loader is rebuilt every epoch
    through ``data.get_train_instance()``. Batches are moved to the global
    ``DEVICE``.

    Returns:
        ``(loss_history_epoch, metrics_history)``: the mean training loss per
        epoch and a dict of per-epoch lists keyed by metric name.
    """
    metric_names = ("Test_loss", "HR@10", "Precision@10", "Recall@10",
                    "MRR@10", "MAP@10", "NDCG@10")
    loss_history_epoch = []
    metrics_history = {name: [] for name in metric_names}
    test_loader = data.get_test_instance()

    for epoch in trange(num_epoch):
        model.train()
        batch_losses = []

        for user, item, label in data.get_train_instance():
            user, item, label = user.to(DEVICE), item.to(DEVICE), label.to(DEVICE)

            optimizer.zero_grad()
            prediction = model(user, item).view(-1)

            # The loss is evaluated in float64 for predictions and labels alike.
            loss = criterion(prediction.to(torch.float64), label.to(torch.float64))
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        model.eval()
        epoch_values = metrics(model, test_loader, criterion, 10, DEVICE)
        # metrics() returns its values in the same order as metric_names.
        for name, value in zip(metric_names, epoch_values):
            metrics_history[name].append(value)
        test_loss, hr_i, precision_i, recall_i, mrr_i, map_i, ndcg_i = epoch_values
        loss_history_epoch.append(np.mean(batch_losses))

        print(f"[Epoch {epoch}]| Loss train: {loss_history_epoch[-1]:.5f}\tLoss test: {test_loss}\n"\
              f"HR@10: {hr_i:.3f}\tPrecision@10: {precision_i:.3f}\tRecall@10: {recall_i:.3f}\t"\
             f"MRR@10: {mrr_i:.3f}\tMAP@10: {map_i:.3f}\tNDCG@10 {ndcg_i:.3f} |")

    return loss_history_epoch, metrics_history

In [27]:
# GMF baseline with 32-dimensional embeddings. BCELoss fits the model's sigmoid
# output; Adam uses a learning rate of 1e-3.
model = GMF(num_users=num_users, num_items=num_items, embedding_dim=32)
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [28]:
loss_history, metrics_history = train_pipeline(model, optimizer, criterion, data, 50)

  0%|          | 0/6040 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

[Epoch 0]| Loss train: 0.56520	Loss test: 0.49669503982055185
HR@10: 0.403	Precision@10: 0.040	Recall@10: 0.403	MRR@10: 0.134	MAP@10: 0.134	NDCG@10 0.196 |
[Epoch 1]| Loss train: 0.45654	Loss test: 0.4653061244461654
HR@10: 0.436	Precision@10: 0.044	Recall@10: 0.436	MRR@10: 0.150	MAP@10: 0.150	NDCG@10 0.216 |
[Epoch 2]| Loss train: 0.41397	Loss test: 0.42154212065861746
HR@10: 0.461	Precision@10: 0.046	Recall@10: 0.461	MRR@10: 0.155	MAP@10: 0.155	NDCG@10 0.226 |
[Epoch 3]| Loss train: 0.38513	Loss test: 0.38625855889055455
HR@10: 0.484	Precision@10: 0.048	Recall@10: 0.484	MRR@10: 0.162	MAP@10: 0.162	NDCG@10 0.236 |
[Epoch 4]| Loss train: 0.36867	Loss test: 0.3664403258257675
HR@10: 0.497	Precision@10: 0.050	Recall@10: 0.497	MRR@10: 0.165	MAP@10: 0.165	NDCG@10 0.242 |
[Epoch 5]| Loss train: 0.35368	Loss test: 0.3495251644867044
HR@10: 0.511	Precision@10: 0.051	Recall@10: 0.511	MRR@10: 0.168	MAP@10: 0.168	NDCG@10 0.247 |
[Epoch 6]| Loss train: 0.34198	Loss test: 0.3382027260943466
HR@10:

In [29]:
gmf = data2excel("gmf_1m_4negatives_100negativtest", loss_history, metrics_history)

In [None]:
gmf[["HR@10", "MRR@10", "NDCG@10"]].plot()

### Архитектура MLP

In [None]:
class MLP(nn.Module):
    """Six-layer perceptron scorer over concatenated user and item embeddings.

    Args:
        num_users: number of rows of the user embedding table.
        num_items: number of rows of the item embedding table.
        embedding_dim: width of both embedding tables.
        layers: output widths of the six hidden layers (``layers[0]`` ..
            ``layers[5]`` are read; ``layers[-1]`` feeds the output unit).
    """

    def __init__(self, num_users, num_items, embedding_dim, layers):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embeddings_dim = embedding_dim
        self.layers = layers

        self.embedding_user = nn.Embedding(num_users, embedding_dim)
        self.embedding_item = nn.Embedding(num_items, embedding_dim)

        # fc1..fc6 are registered under their original attribute names so that
        # state_dict keys stay the same.
        widths = [embedding_dim * 2] + [layers[k] for k in range(6)]
        for depth in range(1, 7):
            setattr(self, f"fc{depth}", nn.Linear(widths[depth - 1], widths[depth]))

        self.affine_output = nn.Linear(self.layers[-1], 1)
        self.activation_layer = nn.ReLU()
        self.activation = nn.Sigmoid()

        for table in (self.embedding_user, self.embedding_item):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores with a trailing dimension of size 1."""
        hidden = torch.cat(
            (self.embedding_user(user_indices), self.embedding_item(item_indices)), -1
        )
        for depth in range(1, 7):
            hidden = self.activation_layer(getattr(self, f"fc{depth}")(hidden))

        return self.activation(self.affine_output(hidden))
        

In [None]:
# num_users / num_items are integer counts (from .nunique()), so they are
# passed directly; the previous len(...) calls raised TypeError on an int.
model = MLP(num_users=num_users,
            num_items=num_items,
            embedding_dim=128,
            layers=[128, 64, 32, 16, 8, 4])
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# `summary` came from torchsummary, whose import is commented out at the top of
# the notebook, so the call raised NameError. Print the module tree instead.
print(model)

In [None]:
loss_history_mlp, metrics_history_mlp =\
    train_pipeline(model, optimizer, criterion, data, 10)

### NeuMF

In [30]:
class NeuMF(nn.Module):
    """Neural matrix factorization: an MLP branch fused with a GMF branch.

    Each branch has its own user/item embedding tables (``num_users + 1`` and
    ``num_items + 1`` rows). The MLP branch processes the concatenated
    embeddings, the MF branch is their element-wise product; both are
    concatenated, passed through a second stack and scored by a sigmoid unit.

    Args:
        num_users: number of users.
        num_items: number of items.
        embedding_dim: width of all four embedding tables.
        layers: output widths of the four MLP-branch blocks.
        layers_neumf: output widths of the three fusion blocks.
    """

    @staticmethod
    def _dense_block(in_features, out_features):
        """Return the ``[Linear, BatchNorm1d, ReLU]`` modules of one block."""
        return [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
        ]

    def __init__(self, num_users, num_items, embedding_dim, layers, layers_neumf):
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.layers = layers
        self.layers_neumf = layers_neumf

        n_user_rows, n_item_rows = num_users + 1, num_items + 1
        self.embedding_user_mlp = nn.Embedding(n_user_rows, embedding_dim)
        self.embedding_item_mlp = nn.Embedding(n_item_rows, embedding_dim)
        self.embedding_user_mf = nn.Embedding(n_user_rows, embedding_dim)
        self.embedding_item_mf = nn.Embedding(n_item_rows, embedding_dim)

        # MLP branch: four dense blocks with dropout after the second one.
        mlp_widths = [embedding_dim * 2, layers[0], layers[1], layers[2], layers[3]]
        mlp_modules = []
        for block in range(4):
            mlp_modules += self._dense_block(mlp_widths[block], mlp_widths[block + 1])
            if block == 1:
                mlp_modules.append(nn.Dropout(p=0.4))
        self.fc = nn.Sequential(*mlp_modules)

        # Fusion stack: takes the MLP output concatenated with the MF product.
        fusion_widths = [
            layers[-1] + embedding_dim,
            layers_neumf[0],
            layers_neumf[1],
            layers_neumf[2],
        ]
        fusion_modules = []
        for block in range(3):
            fusion_modules += self._dense_block(
                fusion_widths[block], fusion_widths[block + 1]
            )
        self.fc_neumf = nn.Sequential(*fusion_modules)

        self.affine_output = nn.Linear(layers_neumf[-1], 1)
        self.activate = nn.Sigmoid()

        for table in (self.embedding_user_mlp, self.embedding_item_mlp,
                      self.embedding_user_mf, self.embedding_item_mf):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores with a trailing dimension of size 1."""
        # Embeddings for the MLP branch: concatenated.
        mlp_input = torch.cat(
            (self.embedding_user_mlp(user_indices),
             self.embedding_item_mlp(item_indices)),
            -1,
        )
        # Embeddings for the MF branch: element-wise product.
        mf_vector = (
            self.embedding_user_mf(user_indices) * self.embedding_item_mf(item_indices)
        )

        mlp_vector = self.fc(mlp_input)
        fused = self.fc_neumf(torch.cat((mlp_vector, mf_vector), -1))

        return self.activate(self.affine_output(fused))
        

In [31]:
# NeuMF with 64-dimensional embeddings, a 1024-512-256-128 MLP branch and a
# 512-256-128 fusion stack; trained with BCELoss and Adam (lr=1e-3).
model = NeuMF(num_users=num_users,
              num_items=num_items,
              embedding_dim=64,
              layers=[1024, 512, 256, 128],
              layers_neumf=[512, 256, 128])
model.to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [32]:
print(model)

NeuMF(
  (embedding_user_mlp): Embedding(6041, 64)
  (embedding_item_mlp): Embedding(3707, 64)
  (embedding_user_mf): Embedding(6041, 64)
  (embedding_item_mf): Embedding(3707, 64)
  (fc): Sequential(
    (0): Linear(in_features=128, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.4, inplace=False)
    (7): Linear(in_features=512, out_features=256, bias=True)
    (8): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Linear(in_features=256, out_features=128, bias=True)
    (11): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
  )
  (fc_neumf): Sequential(
    (0): Linear(in_features=192, out_features=512, b

In [33]:
loss_history_neumf, metrics_history_neumf =\
    train_pipeline(model, optimizer, criterion, data, 50)

  0%|          | 0/6040 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

[Epoch 0]| Loss train: 0.46898	Loss test: 0.4243592038285913
HR@10: 0.459	Precision@10: 0.046	Recall@10: 0.459	MRR@10: 0.155	MAP@10: 0.155	NDCG@10 0.226 |
[Epoch 1]| Loss train: 0.35978	Loss test: 0.34537911671803095
HR@10: 0.483	Precision@10: 0.048	Recall@10: 0.483	MRR@10: 0.157	MAP@10: 0.157	NDCG@10 0.232 |
[Epoch 2]| Loss train: 0.31101	Loss test: 0.3043402291693356
HR@10: 0.511	Precision@10: 0.051	Recall@10: 0.511	MRR@10: 0.171	MAP@10: 0.171	NDCG@10 0.249 |
[Epoch 3]| Loss train: 0.28134	Loss test: 0.33356872915537866
HR@10: 0.510	Precision@10: 0.051	Recall@10: 0.510	MRR@10: 0.168	MAP@10: 0.168	NDCG@10 0.247 |
[Epoch 4]| Loss train: 0.26290	Loss test: 0.31869133977386355
HR@10: 0.526	Precision@10: 0.053	Recall@10: 0.526	MRR@10: 0.184	MAP@10: 0.184	NDCG@10 0.263 |
[Epoch 5]| Loss train: 0.24871	Loss test: 0.3242532535298774
HR@10: 0.523	Precision@10: 0.052	Recall@10: 0.523	MRR@10: 0.176	MAP@10: 0.176	NDCG@10 0.256 |
[Epoch 6]| Loss train: 0.23754	Loss test: 0.3172112918062368
HR@10:

In [34]:
neumf = data2excel("neumf_1m_1024_4neg_100negtest", loss_history_neumf, metrics_history_neumf)

In [None]:
neumf

------

In [55]:
class UserItemRatingDatasetUsers(Dataset):
    """Map-style dataset of interactions enriched with user side features.

    Args:
        user: user indices (``torch.long``).
        item: item indices (``torch.long``).
        rating: target values (``torch.float``).
        sex: encoded sex per row (``torch.long``).
        age_group: encoded age group per row (``torch.long``).
        occupation: encoded occupation per row (``torch.long``).
    """

    def __init__(self, user:list, item:list, rating:list, sex:list, age_group:list, occupation:list):
        super().__init__()

        def as_index(values):
            # Index-like columns are all stored as int64 tensors.
            return torch.tensor(values, dtype=torch.long)

        self.user = as_index(user)
        self.item = as_index(item)
        self.target = torch.tensor(rating, dtype=torch.float)

        self.sex = as_index(sex)
        self.age = as_index(age_group)
        self.occupation = as_index(occupation)

    def __len__(self):
        """Return the number of stored targets."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return ``(user, item, target, sex, age, occupation)`` at ``idx``."""
        columns = (self.user, self.item, self.target,
                   self.sex, self.age, self.occupation)
        return tuple(column[idx] for column in columns)


In [56]:
class NCFDataUsers(object):
    """Build train/test loaders from ratings enriched with user features.

    The ratings frame must provide ``user_id``, ``item_id``, ``rating``,
    ``timestamp``, ``sex_encoded``, ``age_group_encoded`` and ``occupation``.
    All id/feature columns are re-mapped to consecutive integers, every
    rating > 0 becomes the label 1.0, the latest interactions of each user are
    held out for testing and unseen items are sampled as negatives.

    Args:
        ratings: ratings table. It is copied, so the caller's frame is left
            untouched (the original implementation reindexed it in place).
        num_negatives: fallback number of training negatives per user, used
            when a user has interacted with at least half of the item pool.
        num_negatives_test: number of sampled negatives per user at test time.
        batch_size: batch size of the training loader.
    """

    # User-level side features carried through the split and the loaders.
    _FEATURES = ["sex_encoded", "age_group_encoded", "occupation"]

    def __init__(self,
                 ratings,
                 num_negatives,
                 num_negatives_test,
                 batch_size: int):
        # _reindex and _leave_p_out write columns in place; work on a copy so
        # the caller's frame keeps its raw ids.
        self.ratings = ratings.copy()
        self.num_negatives = num_negatives
        self.num_negatives_test = num_negatives_test
        self.batch_size = batch_size

        self.preprocess_ratings = self._reindex(self.ratings)
        # Pools are read after reindexing, so they hold the new 0-based ids.
        self.user_pool = set(self.ratings["user_id"].unique())
        self.item_pool = set(self.ratings["item_id"].unique())
        self.sex_poll = set(self.ratings["sex_encoded"].unique())
        self.age_poll = set(self.ratings["age_group_encoded"].unique())
        self.occupation_pool = set(self.ratings["occupation"].unique())

        self.train_ratings, self.test_ratings = self._leave_p_out(
            self.preprocess_ratings
        )
        self.negatives = self._negative_sampling(self.preprocess_ratings)

    def _reindex(self, ratings):
        """Map ids and features to 0-based indices (order of first appearance)
        and binarise ratings; modifies and returns ``ratings``."""
        mapping_attributes = (
            ("user_id", "user2id"),
            ("item_id", "item2id"),
            ("sex_encoded", "sex2id"),
            ("age_group_encoded", "age2id"),
            ("occupation", "occupation2id"),
        )
        for column, attribute in mapping_attributes:
            mapping = {w: i for i, w in enumerate(ratings[column].drop_duplicates())}
            setattr(self, attribute, mapping)
            ratings[column] = ratings[column].map(mapping)
        ratings["rating"] = (ratings["rating"] > 0).astype(float)

        return ratings

    def _leave_one_out(self, ratings):
        """Hold out the single latest interaction of every user.

        Returns the same columns as ``_leave_p_out`` (the feature columns used
        to be dropped here, which made the split unusable by the loaders).
        """
        return self._leave_p_out(ratings, p=1)

    def _leave_p_out(self, ratings, p=10):
        """Hold out the ``p`` latest interactions of every user.

        Returns:
            ``(train, test)`` frames with ``user_id``, ``item_id``, ``rating``
            and the three feature columns.
        """
        # rank 1 == most recent interaction; method="first" breaks timestamp ties.
        ratings["rank_latest"] = ratings.groupby(["user_id"])["timestamp"].rank(
            method="first", ascending=False
        )
        columns = ["user_id", "item_id", "rating"] + self._FEATURES
        test = ratings.loc[ratings["rank_latest"] < p + 1]
        train = ratings.loc[ratings["rank_latest"] > p]
        return train[columns], test[columns]

    def _sample_negatives(self, interacted, k, clamp=False):
        """Draw ``k`` items the user never interacted with.

        ``random.sample`` no longer accepts a set (deprecated in Python 3.9,
        TypeError from 3.11), so the candidates are materialised as a tuple.
        With ``clamp=True`` the request is limited to the number of candidates
        instead of raising ValueError.
        """
        candidates = tuple(self.item_pool - interacted)
        if clamp:
            k = min(k, len(candidates))
        return random.sample(candidates, k)

    def _negative_sampling(self, ratings):
        """Collect interacted items per user (with features) and sample the
        test negatives.

        Raises:
            ValueError: if a user has fewer than ``num_negatives_test`` unseen items.
        """
        interact_status = (
            ratings.groupby(by=['user_id'] + self._FEATURES)['item_id']
            .apply(set)
            .reset_index()
            .rename(columns={'item_id': 'interacted_items'})
        )

        interact_status["negative_samples"] = [
            self._sample_negatives(interacted, self.num_negatives_test)
            for interacted in interact_status["interacted_items"]
        ]

        return interact_status[
            ["user_id", "negative_samples", "interacted_items"] + self._FEATURES
        ]

    def get_train_instance(self):
        """Return a shuffled training DataLoader with freshly sampled negatives.

        Each user gets as many negatives as interacted items while that many
        unseen items exist, otherwise ``num_negatives`` (clamped to what is
        left). Negative rows inherit the user's feature values.
        """
        sampled = []
        for interacted in self.negatives['interacted_items']:
            n_seen = len(interacted)
            k = n_seen if len(self.item_pool) - 2 * n_seen > 0 else self.num_negatives
            sampled.append(self._sample_negatives(interacted, k, clamp=True))

        train_ratings_negatives = pd.DataFrame()
        for column in ["user_id"] + self._FEATURES:
            train_ratings_negatives[column] = self.negatives[column]
        train_ratings_negatives["negatives"] = sampled
        # explode() yields NaN for an empty list (user has seen every item);
        # drop those rows so the integer cast below cannot fail.
        train_ratings_negatives = (
            train_ratings_negatives.explode("negatives").dropna(subset=["negatives"])
        )

        def stack(train_column, negative_column):
            # Observed rows first, sampled negatives after them.
            return np.concatenate([
                self.train_ratings[train_column].to_numpy(),
                train_ratings_negatives[negative_column].to_numpy(),
            ]).astype(np.int64)

        users = stack("user_id", "user_id")
        items = stack("item_id", "negatives")
        ratings = np.concatenate([
            self.train_ratings["rating"].to_numpy(),
            np.zeros(train_ratings_negatives.shape[0]),
        ]).astype(np.int64)

        sex = stack("sex_encoded", "sex_encoded")
        age = stack("age_group_encoded", "age_group_encoded")
        occupation = stack("occupation", "occupation")

        assert len(users) == len(items) == len(ratings)
        assert len(users) == len(sex) == len(age) == len(occupation)

        dataset = UserItemRatingDatasetUsers(user=users, item=items, rating=ratings, sex=sex,
                                             age_group=age, occupation=occupation)
        return DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def get_test_instance(self, p=10):
        """Return the evaluation DataLoader.

        For every user (ascending id) the held-out rows come first, followed by
        that user's sampled negatives, which reuse the feature values of the
        user's held-out rows. The batch size is ``num_negatives_test + p``.
        NOTE(review): one batch equals one user only when every user has
        exactly ``p`` held-out rows — confirm for the dataset in use.
        """
        users, items, ratings, sex, age, occupation = [], [], [], [], [], []
        negatives_by_user = dict(
            zip(self.negatives["user_id"], self.negatives["negative_samples"])
        )

        # A stable sort plus one pass replaces the per-user scan of the whole
        # frame while keeping the original row order inside each user.
        ordered = self.test_ratings.sort_values("user_id", kind="stable")
        grouped = itertools.groupby(
            ordered.itertuples(index=False), key=lambda row: row.user_id
        )
        n_users = self.test_ratings["user_id"].nunique()

        for user, rows in tqdm(grouped, total=n_users):
            features = None
            for row in rows:
                features = (int(row.sex_encoded),
                            int(row.age_group_encoded),
                            int(row.occupation))
                users.append(int(row.user_id))
                items.append(int(row.item_id))
                ratings.append(float(row.rating))
                sex.append(features[0])
                age.append(features[1])
                occupation.append(features[2])
            # Negatives carry the user's features explicitly instead of relying
            # on a loop variable leaking out of the loop above.
            for item_negative in negatives_by_user[user]:
                users.append(int(user))
                items.append(int(item_negative))
                ratings.append(float(0))
                sex.append(features[0])
                age.append(features[1])
                occupation.append(features[2])

        dataset = UserItemRatingDatasetUsers(user=users, item=items, rating=ratings, sex=sex,
                                             age_group=age, occupation=occupation)
        return DataLoader(
            dataset, batch_size=self.num_negatives_test+p, shuffle=False, num_workers=4
        )

In [57]:
from sklearn.preprocessing import LabelEncoder

In [58]:
# Display the ratings frame (columns: user_id, item_id, rating, timestamp).
# NOTE(review): despite the `ml_10m` name, the rendered output shows roughly 1M rows.
ml_10m

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968
3,1,3408,4.0,978300275
4,1,2355,5.0,978824291
...,...,...,...,...
1000204,6040,1091,1.0,956716541
1000205,6040,1094,5.0,956704887
1000206,6040,562,5.0,956704746
1000207,6040,1096,4.0,956715648


In [12]:
# Load the user table. The file uses the multi-character "::" delimiter,
# hence the python parser engine; the file has no header row, so column
# names are supplied explicitly.
users = pd.read_csv(
    "./ml-1m/users.dat",
    sep='::',
    engine='python',
    encoding="ISO-8859-1",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
)
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [13]:
# Integer-encode the categorical user attributes for use as embedding indices.
# The same encoder object is refit for each column, so after this cell `le`
# only holds the classes of `age_group`.
le = LabelEncoder()
users["sex_encoded"] = le.fit_transform(users["sex"])
users["age_group_encoded"] = le.fit_transform(users["age_group"])

In [14]:
# Display the user table to check the new `sex_encoded` / `age_group_encoded` columns.
users

Unnamed: 0,user_id,sex,age_group,occupation,zip_code,sex_encoded,age_group_encoded
0,1,F,1,10,48067,0,0
1,2,M,56,16,70072,1,6
2,3,M,25,15,55117,1,2
3,4,M,45,7,02460,1,4
4,5,M,25,20,55455,1,2
...,...,...,...,...,...,...,...
6035,6036,F,25,15,32603,0,2
6036,6037,F,45,1,76006,0,4
6037,6038,F,56,1,14706,0,6
6038,6039,F,45,0,01060,0,4


In [15]:
# Attach the user attributes (raw and encoded) to every rating row.
ml_10m_users = pd.merge(
    left=ml_10m,
    right=users,
    how="inner",
    on="user_id",
)
ml_10m_users

Unnamed: 0,user_id,item_id,rating,timestamp,sex,age_group,occupation,zip_code,sex_encoded,age_group_encoded
0,1,1193,5.0,978300760,F,1,10,48067,0,0
1,1,661,3.0,978302109,F,1,10,48067,0,0
2,1,914,3.0,978301968,F,1,10,48067,0,0
3,1,3408,4.0,978300275,F,1,10,48067,0,0
4,1,2355,5.0,978824291,F,1,10,48067,0,0
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1.0,956716541,M,25,6,11106,1,2
1000205,6040,1094,5.0,956704887,M,25,6,11106,1,2
1000206,6040,562,5.0,956704746,M,25,6,11106,1,2
1000207,6040,1096,4.0,956715648,M,25,6,11106,1,2


In [32]:
# Wrap the joined ratings + user-feature frame in the NCF data helper.
data = NCFDataUsers(
    ml_10m_users,
    num_negatives=4,
    num_negatives_test=100,
    batch_size=512,
)

In [33]:
class NeuMF(nn.Module):
    """Neural matrix factorisation model with user side-information.

    Two branches are combined:

    * an MLP branch fed with the concatenation of a user embedding, an item
      embedding and small embeddings of the user's sex, age group and
      occupation;
    * a matrix-factorisation branch, the element-wise product of a second
      pair of user / item embeddings.

    The MLP output and the MF product are concatenated, passed through a
    second dense stack (``fc_neumf``), projected to one unit and squashed
    with a sigmoid.

    Args:
        num_users: Largest user index; each user table has ``num_users + 1`` rows.
        num_items: Largest item index; each item table has ``num_items + 1`` rows.
        embedding_dim: Width of every user / item embedding.
        layers: Output widths of the MLP branch; indices 0..3 are read.
        layers_neumf: Output widths of the fusion stack; indices 0..2 are read.
    """

    def __init__(self,
                 num_users,
                 num_items,
                 embedding_dim,
                 layers,
                 layers_neumf):
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.layers = layers
        self.layers_neumf = layers_neumf

        def dense(in_features, out_features):
            # Linear -> BatchNorm -> ReLU, created in that order.
            return (
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.ReLU(),
            )

        # Separate embedding tables for the MLP branch ...
        self.embedding_user_mlp = nn.Embedding(self.num_users + 1, self.embedding_dim)
        self.embedding_item_mlp = nn.Embedding(self.num_items + 1, self.embedding_dim)

        # ... and for the matrix-factorisation branch.
        self.embedding_user_mf = nn.Embedding(self.num_users + 1, self.embedding_dim)
        self.embedding_item_mf = nn.Embedding(self.num_items + 1, self.embedding_dim)

        # Side-information embeddings (table size, vector width).
        self.embeddings_sex = nn.Embedding(3, 2)
        self.embeddings_occupation = nn.Embedding(22, 11)
        self.embeddings_age_group = nn.Embedding(8, 4)

        # MLP input = user + item embeddings plus the 2 + 11 + 4 side-info widths.
        mlp_in_features = self.embedding_dim * 2 + 2 + 11 + 4
        self.fc = nn.Sequential(
            *dense(mlp_in_features, self.layers[0]),
            *dense(self.layers[0], self.layers[1]),
            nn.Dropout(p=0.15),
            *dense(self.layers[1], self.layers[2]),
            nn.Dropout(p=0.15),
            *dense(self.layers[2], self.layers[3]),
        )

        # Fusion stack over [MLP output | MF element-wise product].
        self.fc_neumf = nn.Sequential(
            *dense(self.layers[-1] + self.embedding_dim, self.layers_neumf[0]),
            *dense(self.layers_neumf[0], self.layers_neumf[1]),
            *dense(self.layers_neumf[1], self.layers_neumf[2]),
        )

        self.affine_output = nn.Linear(self.layers_neumf[-1], 1)
        self.activate = nn.Sigmoid()

        # Xavier initialisation for the four user / item tables only.
        for table in (
            self.embedding_user_mlp,
            self.embedding_item_mlp,
            self.embedding_user_mf,
            self.embedding_item_mf,
        ):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user_indices, item_indices, sex_indices, age_group_indices, occupation_indices):
        """Score user/item pairs.

        All arguments are integer index tensors for the corresponding
        embedding tables (presumably 1-D with one entry per sample -- TODO
        confirm against the data loaders).

        Returns:
            The sigmoid of the final linear layer; the last dimension has size 1.
        """
        # User side-information, concatenated as [sex | age group | occupation].
        side_info = torch.cat(
            (
                self.embeddings_sex(sex_indices),
                self.embeddings_age_group(age_group_indices),
                self.embeddings_occupation(occupation_indices),
            ),
            dim=-1,
        )

        # MLP branch: [user | item | side info] through the dense stack.
        mlp_input = torch.cat(
            (
                self.embedding_user_mlp(user_indices),
                self.embedding_item_mlp(item_indices),
                side_info,
            ),
            dim=-1,
        )
        mlp_output = self.fc(mlp_input)

        # MF branch: element-wise product of the second embedding pair.
        mf_product = self.embedding_user_mf(user_indices) * self.embedding_item_mf(item_indices)

        fused = self.fc_neumf(torch.cat((mlp_output, mf_product), dim=-1))
        return self.activate(self.affine_output(fused))
        
        

In [34]:
@torch.no_grad()
def metrics_neumf(model, test_loader, criterion, top_k, device):
    """Evaluate a NeuMF model on the test loader, one ranking per batch.

    Each batch is treated as a single ranking problem: the model scores every
    row, the ``top_k`` highest-scoring items form the recommendation list, and
    the item in the batch's first row (``item[0]``) is used as the ground
    truth for all ranking metrics.
    NOTE(review): this relies on the loader placing the relevant item first
    in every batch -- confirm against ``get_test_instance``.

    The function does not switch the model to eval mode; the caller is
    expected to do that.

    Args:
        model: Callable taking ``(user, item, sex, age, occupation)`` tensors.
        test_loader: Iterable yielding
            ``(user, item, label, sex, age, occupation)`` batches.
        criterion: Loss applied to the flattened predictions and labels
            (both cast to float64).
        top_k: Length of the recommendation list. Clamped to the batch size
            so a batch shorter than ``top_k`` does not make ``torch.topk`` fail.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of means over all batches:
        ``(loss, HR, precision, recall, MRR, MAP, NDCG)``.
    """
    _hr, _precision, _recall, _mrr, _map, _ndcg = [], [], [], [], [], []

    test_loss = []
    for user, item, label, sex, age, occupation in test_loader:
        user = user.to(device)
        item = item.to(device)
        sex = sex.to(device)
        age = age.to(device)
        occupation = occupation.to(device)
        label = label.to(device)

        predictions = model(user, item, sex, age, occupation)
        predictions = predictions.view(-1)
        loss = criterion(predictions.to(torch.float64), 
                            label.to(torch.float64))

        # torch.topk raises if k exceeds the number of scored rows, which
        # happens for a short (e.g. final) batch.
        k = min(top_k, predictions.numel())
        _, indices = torch.topk(predictions, k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        y_true = item[0].item()
        _hr.append(hit(recommends, y_true))
        _precision.append(precision(recommends, y_true))
        _recall.append(recall(recommends, y_true))
        _mrr.append(mrr(recommends, y_true))
        _map.append(map_k(recommends, y_true))
        _ndcg.append(ndcg(recommends, y_true))
        test_loss.append(loss.item())
    
    return np.mean(test_loss), np.mean(_hr), np.mean(_precision), np.mean(_recall), np.mean(_mrr), np.mean(_map), np.mean(_ndcg)

In [49]:
def train_pipeline_neumf(model,
                   optimizer,
                   criterion,
                   data,
                   num_epoch,
                   max_lr=0.0001,
                   device=None):
    """Train a NeuMF model and evaluate it after every epoch.

    The test loader is built once. The training loader is rebuilt through
    ``data.get_train_instance()`` before every epoch after the first
    (presumably to re-draw the negative samples -- TODO confirm); it is not
    rebuilt after the final epoch because nothing would consume it.

    The learning rate is driven by a ``OneCycleLR`` schedule stepped once per
    batch. The schedule length is fixed from the first training loader, so
    later loaders are assumed to have the same number of batches.

    Args:
        model: Model called as ``model(user, item, sex, age, occupation)``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss applied to flattened predictions and labels
            (both cast to float64).
        data: Object providing ``get_train_instance()`` and
            ``get_test_instance()``.
        num_epoch: Number of training epochs.
        max_lr: Peak learning rate of the one-cycle schedule.
        device: Device for the batches; defaults to the notebook-level
            ``DEVICE`` when ``None``.

    Returns:
        ``(loss_history_epoch, metrics_history)``: the mean training loss per
        epoch, and a dict of per-epoch test loss and ranking metrics at 10.
    """
    if device is None:
        device = DEVICE

    loss_history_epoch = []
    metrics_history = {"Test_loss": [], "HR@10": [], "Precision@10": [], "Recall@10": [],
                      "MRR@10": [], "MAP@10": [], "NDCG@10": []}
    test_loader = data.get_test_instance()
    train_loader = data.get_train_instance()

    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr,
                                                    steps_per_epoch=len(train_loader), epochs=num_epoch)

    for epoch in trange(num_epoch):
        loss_history = []
        model.train()

        for user, item, label, sex, age, occupation in train_loader:
            user = user.to(device)
            item = item.to(device)
            sex = sex.to(device)
            age = age.to(device)
            occupation = occupation.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            prediction = model(user, item, sex, age, occupation)
            
            loss = criterion(prediction.view(-1).to(torch.float64), 
                            label.to(torch.float64))
            loss.backward()
            optimizer.step()
            # One scheduler step per optimizer step (per batch).
            scheduler.step()

            loss_history.append(loss.item())

        # Prepare the loader for the next epoch; skipped after the last one.
        if epoch + 1 < num_epoch:
            train_loader = data.get_train_instance()

        model.eval()
        test_loss, hr_i, precision_i, recall_i, mrr_i, map_i, ndcg_i = metrics_neumf(model, test_loader, criterion, 10, device)
        metrics_history['Test_loss'].append(test_loss)
        metrics_history['HR@10'].append(hr_i)
        metrics_history['Precision@10'].append(precision_i)
        metrics_history['Recall@10'].append(recall_i)
        metrics_history['MRR@10'].append(mrr_i)
        metrics_history['MAP@10'].append(map_i)
        metrics_history['NDCG@10'].append(ndcg_i)
        loss_history_epoch.append(np.mean(loss_history))

        print(f"[Epoch {epoch}]| Loss train: {loss_history_epoch[-1]:.5f}\tLoss test: {test_loss}\n"\
              f"HR@10: {hr_i:.3f}\tPrecision@10: {precision_i:.3f}\tRecall@10: {recall_i:.3f}\t"\
             f"MRR@10: {mrr_i:.3f}\tMAP@10: {map_i:.3f}\tNDCG@10 {ndcg_i:.3f} |")

    return loss_history_epoch, metrics_history

In [50]:
# Build the model on the target device, with binary cross-entropy on the
# model's sigmoid output and an AMSGrad-variant Adam optimizer.
model = NeuMF(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=64,
    layers=[1024, 512, 256, 128],
    layers_neumf=[512, 256, 128],
).to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, amsgrad=True)

In [51]:
# Print the module tree to check the embedding table sizes and layer widths.
print(model)

NeuMF(
  (embedding_user_mlp): Embedding(6041, 64)
  (embedding_item_mlp): Embedding(3707, 64)
  (embedding_user_mf): Embedding(6041, 64)
  (embedding_item_mf): Embedding(3707, 64)
  (embeddings_sex): Embedding(3, 2)
  (embeddings_occupation): Embedding(22, 11)
  (embeddings_age_group): Embedding(8, 4)
  (fc): Sequential(
    (0): Linear(in_features=145, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.15, inplace=False)
    (7): Linear(in_features=512, out_features=256, bias=True)
    (8): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Dropout(p=0.15, inplace=False)
    (11): Linear(in_features=256, out_features=128, bias=True)
    (12): BatchNorm1d(128

In [53]:
# Train for 20 epochs, evaluating on the test loader after each one.
loss_history, metrics_history = train_pipeline_neumf(
    model,
    optimizer,
    criterion,
    data,
    num_epoch=20,
)

  0%|          | 0/6040 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Hand the training curves to `data2excel` under this run's name.
# NOTE(review): `data2excel` is defined elsewhere; presumably it writes a spreadsheet -- confirm.
neumf = data2excel("neumf_with_users_1m_1024_4neg_100negtest", loss_history, metrics_history)