In [67]:
import pandas as pd
from IPython.display import display, clear_output

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from copy import deepcopy
import json
import torch
from torch import nn
from torch.nn import functional as F

import torch
import os
from scipy.sparse import csr_matrix
import numpy as np
from torch import nn
import random
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import math
from sklearn.model_selection import train_test_split

In [68]:
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [69]:
DATASET_PATH = '/content/drive/My Drive/dataset'
# Constants
LR = 1e-3 # learning rate, controls the speed of the training
WEIGHT_DECAY = 0.01 # lambda for L2 reg. ()
NUM_EPOCHS = 200 # num training epochs (how many times each instance will be processed)
GAMMA = 0.9995 # learning rate scheduler parameter
BATCH_SIZE = 3000 # training batch size
EVAL_BATCH_SIZE = 3000 # evaluation batch size.
DEVICE = 'cuda' #'cuda' # device to make the calculations on

## Загрузка данных

In [70]:
interactions_df = pd.read_csv(f'{DATASET_PATH}/interactions_processed_kion.csv')
users_df = pd.read_csv(f'{DATASET_PATH}/users_processed_kion.csv')
items_df = pd.read_csv(f'{DATASET_PATH}/items_processed_kion.csv')

Делаем предобработку

In [71]:
interactions_df = interactions_df[interactions_df['last_watch_dt'] < '2021-04-01']

Оставим только тех пользователей, у которых есть хотя б 5 просмотров

In [72]:
users_interactions_count_df = interactions_df.groupby(['user_id', 'item_id']).size().groupby('user_id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['user_id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 86614
# users with at least 5 interactions: 14563


Оставим только те просмотры, которые есть у пользователей с хотя бы 5 просмотрами

In [73]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 263874
# of interactions from users with at least 5 interactions: 142670


Сглаживаем веса

In [74]:
def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['user_id', 'item_id'])['watched_pct'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 142670


Unnamed: 0,user_id,item_id,watched_pct
0,21,849,6.375039
1,21,4345,6.658211
2,21,10283,6.658211
3,21,12261,6.658211
4,21,15997,6.658211
5,32,952,6.044394
6,32,4382,4.954196
7,32,4807,6.658211
8,32,10436,6.658211
9,32,12132,6.658211


Делим на train и test подвыборки

In [75]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['user_id'],
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 114136
# interactions on Test set: 28534


In [76]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('user_id')
interactions_train_indexed_df = interactions_train_df.set_index('user_id')
interactions_test_indexed_df = interactions_test_df.set_index('user_id')

In [77]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['item_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

## Обучение

Для обучения необходимо преобразовать данные в матрицу

In [79]:
total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())
total_df['user_id'], users_keys = total_df.user_id.factorize()
total_df['item_id'], items_keys = total_df.item_id.factorize()

train_encoded = total_df.iloc[:len(interactions_train_df)].values
test_encoded = total_df.iloc[len(interactions_train_df):].values

  total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())


In [80]:
shape = [int(total_df['user_id'].max()+1), int(total_df['item_id'].max()+1)]
X_train = csr_matrix((train_encoded[:, 2], (train_encoded[:, 0], train_encoded[:, 1])), shape=shape).toarray()
X_test = csr_matrix((test_encoded[:, 2], (test_encoded[:, 0], test_encoded[:, 1])), shape=shape).toarray()

Инициализация объекта датасета и dataloaders

In [82]:
class UserOrientedDataset(Dataset):
    def __init__(self, X):
        super().__init__() # to initialize the parent class
        self.X = X.astype(np.float32)
        self.len = len(X)

    def __len__(self): # We use __func__ for implementing in-built python functions
        return self.len

    def __getitem__(self, index):
        return self.X[index]

In [83]:
train_dl = DataLoader(
    UserOrientedDataset(X_train),
    batch_size = BATCH_SIZE,
    shuffle = True
)

test_dl = DataLoader(
    UserOrientedDataset(X_test),
    batch_size = EVAL_BATCH_SIZE,
    shuffle = False
)

dls = {'train': train_dl, 'test': test_dl}

## Модель

В задании сказано изменить AE - я попыталась сделать VAE

In [84]:
class VAE(nn.Module):
    def __init__(self, x, h1, h2, z):
        super(VAE, self).__init__()
        self.fc1 = nn.Linear(x, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc_mean = nn.Linear(h2, z)
        self.fc_sd = nn.Linear(h2, z)
        # decoder
        self.fc4 = nn.Linear(z, h2)
        self.fc5 = nn.Linear(h2, h1)
        self.fc6 = nn.Linear(h1, x)

    def decoder(self, z):
        h1 = F.relu(self.fc4(z))
        h2 = F.relu(self.fc5(h1))
        return F.sigmoid(self.fc6(h2))

    def encoder(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        return self.fc_mean(h2), self.fc_sd(h2) # mu, log_var

    def sampling(self, mu, log_var):
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def forward(self, x):
        mu, log_var = self.encoder(x.view(-1, 8287))
        z = self.sampling(mu, log_var)
        return self.decoder(z), mu, log_var

Инициализурем новый лосс

In [85]:
def loss_function(reconstructed_x, x, mu, log_var):
    loss = -torch.mean(torch.sum(F.log_softmax(reconstructed_x, 1) * x, -1))
    regularized_term = -0.5 * torch.sum(1 + log_var - mu.pow(2) -
                      log_var.exp())

    return loss + regularized_term

In [86]:
model = VAE(x=8287, h1= 1024, h2=512, z=2)
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)


Начинаем обучение

In [87]:
# Train loop
for epoch in range(NUM_EPOCHS):
    for stage in ['train', 'test']:
        with torch.set_grad_enabled(stage == 'train'):
            if stage == 'train':
                model.train()
            else:
                model.eval()

            loss_at_stage = 0
            for batch in dls[stage]:
                batch = batch.to(DEVICE)
                latent, mu, log_var = model(batch)
                loss = loss_function(latent, batch, mu, log_var)
                if stage == "train":
                    loss.backward()
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                loss_at_stage += loss.item()
            loss_at_stage = (loss_at_stage / len(dls[stage].dataset))

    if (epoch == NUM_EPOCHS - 1) or epoch % 10 == 9:
        print(f"Current epoch: {epoch}, loss {loss_at_stage}")

Current epoch: 9, loss 0.02563776436525248
Current epoch: 19, loss 0.025617372512424467
Current epoch: 29, loss 0.025612449266176618
Current epoch: 39, loss 0.025611540318877867
Current epoch: 49, loss 0.025609590142324782
Current epoch: 59, loss 0.025609759096505674
Current epoch: 69, loss 0.02560828094388122
Current epoch: 79, loss 0.025606887923208376
Current epoch: 89, loss 0.02560782594634291
Current epoch: 99, loss 0.025606118068421335
Current epoch: 109, loss 0.025606432401781135
Current epoch: 119, loss 0.025606263447600242
Current epoch: 129, loss 0.02560655630151379
Current epoch: 139, loss 0.025606239872598257
Current epoch: 149, loss 0.025605677477828687
Current epoch: 159, loss 0.025605846693954045
Current epoch: 169, loss 0.025605020521106706
Current epoch: 179, loss 0.025604554259956336
Current epoch: 189, loss 0.0256053411411337
Current epoch: 199, loss 0.025604930674154695


In [96]:
with torch.no_grad():
    X_pred = model(torch.Tensor(X_test).to(DEVICE))[0].detach().cpu().numpy()
X_pred

array([[9.9999571e-01, 9.9998987e-01, 9.9999678e-01, ..., 3.9976035e-06,
        7.9386673e-06, 6.6845137e-06],
       [9.9999714e-01, 9.9999273e-01, 9.9999774e-01, ..., 2.6306498e-06,
        5.4030065e-06, 4.7245139e-06],
       [9.9999952e-01, 9.9999869e-01, 9.9999964e-01, ..., 3.7480351e-07,
        7.8271728e-07, 1.1289392e-06],
       ...,
       [9.9999392e-01, 9.9998605e-01, 9.9999571e-01, ..., 6.0406178e-06,
        1.0979017e-05, 8.9953182e-06],
       [9.9998331e-01, 9.9996436e-01, 9.9998677e-01, ..., 1.6018505e-05,
        2.7572454e-05, 2.4391717e-05],
       [9.9999154e-01, 9.9997818e-01, 9.9999297e-01, ..., 8.3833320e-06,
        1.4163673e-05, 1.3450983e-05]], dtype=float32)

Класс для оценки качества модели

In [98]:
class AERecommender:

    MODEL_NAME = 'Autoencoder'

    def __init__(self, X_preds, X_train_and_val, X_test):

        self.X_preds = X_preds
        self.X_train_and_val = X_train_and_val
        self.X_test = X_test

    def get_model_name(self):
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_select_idx, topn=10, verbose=False):
        user_preds = self.X_preds[user_id][items_to_select_idx]
        items_idx = items_to_select_idx[np.argsort(-user_preds)[:topn]]
        return items_idx

    def evaluate(self, size=100):

        X_total = self.X_train_and_val + self.X_test

        true_5 = []
        true_10 = []

        for user_id in range(len(X_test)):
            non_zero = np.argwhere(self.X_test[user_id] > 0).ravel()
            all_nonzero = np.argwhere(X_total[user_id] > 0).ravel()
            select_from = np.setdiff1d(np.arange(X_total.shape[1]), all_nonzero)

            for non_zero_idx in non_zero:
                random_non_interacted_100_items = np.random.choice(select_from, size=20, replace=False)
                preds = self.recommend_items(user_id, np.append(random_non_interacted_100_items, non_zero_idx), topn=10)
                true_5.append(non_zero_idx in preds[:5])
                true_10.append(non_zero_idx in preds)

        return {"recall@5": np.mean(true_5), "recall@10": np.mean(true_10)}

ae_recommender_model = AERecommender(X_pred, X_train, X_train)
ae_global_metrics = ae_recommender_model.evaluate()
ae_global_metrics

{'recall@5': 0.8620925039250295, 'recall@10': 0.9465415858094337}

Судя по метрикам, модель стала немного лучше, чем AE, который был в лекции

Делаем предсказания для оффлайн рекомендаций

In [99]:
test_data = total_df.values
shape = [int(total_df["user_id"].max() + 1), int(total_df["item_id"].max() + 1)]
X = csr_matrix(
    (test_data[:, 2], (test_data[:, 0], test_data[:, 1])), shape=shape
).toarray()

In [100]:
with torch.no_grad():
    X_pred = model(torch.Tensor(torch.Tensor(X).to(DEVICE)))[0].detach().cpu().numpy()
X_pred

array([[9.9995649e-01, 9.9991584e-01, 9.9996269e-01, ..., 3.8887032e-05,
        5.8243590e-05, 6.8062865e-05],
       [1.0000000e+00, 9.9999988e-01, 1.0000000e+00, ..., 1.8714410e-08,
        5.6659996e-08, 5.6516768e-08],
       [9.9999547e-01, 9.9998915e-01, 9.9999654e-01, ..., 4.2899264e-06,
        8.4529274e-06, 7.1942154e-06],
       ...,
       [9.9994528e-01, 9.9988997e-01, 9.9995482e-01, ..., 5.6402379e-05,
        7.9073696e-05, 7.4777956e-05],
       [9.9997616e-01, 9.9994648e-01, 9.9998045e-01, ..., 2.4371164e-05,
        3.6535075e-05, 3.4328150e-05],
       [9.9999917e-01, 9.9999690e-01, 9.9999928e-01, ..., 8.0697600e-07,
        1.7524738e-06, 1.8097214e-06]], dtype=float32)

In [101]:
def get_reco(user_id: int, num_reco: int=10):
    all_nonzero = np.argwhere(X[user_id] > 0).ravel()
    select_from = np.setdiff1d(np.arange(X.shape[1]), all_nonzero)
    random_non_interactive = np.random.choice(select_from, size=20, replace=False)
    return random_non_interactive[np.argsort(-X_pred[user_id][random_non_interactive])[:num_reco]]

In [102]:
user_recos = {}
users = interactions_full_indexed_df.index.unique().tolist()
for i, user_id in enumerate(users):
    recos_for_user = get_reco(i)
    user_recos.update({user_id: recos_for_user.tolist()})

In [103]:
with open("vae_recos.json", "w") as r:
    json.dump(user_recos, r)