In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [2]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp')

In [3]:
df = df.rename(columns={"userId": "userId:token", "movieId": "movieId:token", "timestamp_int": "timestamp:float"})


In [4]:
test_part = 0.2
train_len = int(df.shape[0] * (1 - test_part))
test_len = df.shape[0] - train_len 
df_train, df_test = df.drop(['timestamp'], axis=1).head(train_len), df.drop(['timestamp'], axis=1).tail(test_len)

In [5]:
df_train.head()

Unnamed: 0,userId:token,movieId:token,rating
4182421,28507,1176,4.0
18950979,131160,1079,3.0
18950936,131160,47,5.0
18950930,131160,21,3.0
12341178,85252,45,3.0


In [6]:
N_USERS = df['userId:token'].max() + 1
N_ITEMS = df['movieId:token'].max() + 1

In [7]:
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [8]:
from math import log2

def apk(pred, target, k):
    if len(pred) >= k:
        pred = pred[:k]

    ans, cnt = 0, 0
    tot = len(pred) 
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            cnt += 1
            ans += cnt / (i + 1)
            s.add(pred[i])
    return ans / tot


def mapk(pred, target, k):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += apk(cur_pred, cur_target, k)
    return sum_metric / len(pred) 


def mr(pred, target):
    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1 / (i + 1)
            s.add(pred[i])
            break
    return ans

def mrr(pred, target):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += mr(cur_pred, cur_target)
    return sum_metric / len(pred) 


def ndcgunique(pred, target):
    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1 / (log2(i + 2))
            s.add(pred[i])
    return ans / tot

def ndcg(pred, target):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += ndcgunique(cur_pred, cur_target)
    return sum_metric / len(pred) 


def precisionunique(pred, target, k):
    if len(pred) >= k:
        pred = pred[:k]

    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1
            s.add(pred[i])
    return ans / tot


def precision(pred, target, k):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += precisionunique(cur_pred, cur_target, k)
    return sum_metric / len(pred) 

In [9]:
class EmbeddingMethodsDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        return {
            'user' : torch.LongTensor([df['userId:token'].iloc[index]]).item(),
            'item' : torch.LongTensor([df['movieId:token'].iloc[index]]).item(),
            'rating' : torch.FloatTensor([df['rating'].iloc[index]]).item()
        }

    def __len__(self):
        return self.df.shape[0]

In [10]:
dataset = EmbeddingMethodsDataset(df)
loader = torch.utils.data.DataLoader(dataset, batch_size=64)

In [11]:
import torch.nn as nn 

class DSSM(nn.Module):
    def __init__(self, n_users, n_items, hidden_dim):
        super().__init__()

        self.embeddings_users = nn.Embedding(n_users, hidden_dim)
        self.embeddings_items = nn.Embedding(n_items, hidden_dim)

        self.head_users = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ) 

        self.head_items = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ) 

    def forward(self, users, items):
        users_hidden = self.head_users(self.embeddings_users(users))
        items_hidden = self.head_items(self.embeddings_items(items))
        return torch.sum(users_hidden * items_hidden, -1)

In [12]:
model = DSSM(N_USERS, N_ITEMS, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)


In [13]:
from tqdm.auto import trange, tqdm

epochs = 4

for _ in trange(epochs):
    for batch in tqdm(loader):
        optimizer.zero_grad()
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred = model(batch['user'], batch['item'])
        loss = torch.sum((5.0 - pred) * (5.0 - pred))
        loss.backward()
        optimizer.step()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

In [14]:
torch.save(model, 'dssm.pt')

In [15]:


class FastKNN:
    def __init__(self, num_projections = 10):
        self.num_projections = num_projections
        self.distributions = []
        self.projection_matrixes = []
        self.embeddings = None

    def binary_search(self, index_projection, find_value):
        left_index, right_index = 0, len(self.distributions[index_projection]) - 1
        while left_index < right_index - 1:
            middle_index = (left_index + right_index) // 2
            middle_value = self.distributions[index_projection][middle_index][0]
            if middle_value < find_value:
                left_index = middle_index
            else:
                right_index = middle_index

        return left_index

    def find_k_nearest(self, index_projection, find_value, k):
        i0 = self.binary_search(index_projection, find_value)
        left_index = max(0, i0 - k // 2)
        right_index = min(len(self.distributions[index_projection]), left_index + k )
        if right_index - left_index != k:
            left_index = right_index - k
        return [self.distributions[index_projection][i][1] for i in range(left_index, right_index)]

    def fit(self, vectors):
        for i in trange(self.num_projections):
            projection = torch.randn((vectors.shape[1], 1))
            values = (vectors @ projection).cpu().numpy().reshape(-1)
            self.distributions.append(sorted([(values[j], j) for j in range(len(values))]))
            self.projection_matrixes.append(projection.cpu())

    def predict(self, vector, k):
        k0 = 1 + (k // self.num_projections)
        values = [torch.sum(vector * proj.reshape(-1)).item() for proj in self.projection_matrixes]
        neighb = []
        for i, value in enumerate(values):
            cur_n = self.find_k_nearest(i, value, k0)
            neighb += cur_n
        neighb = list(set(neighb))
        return neighb[:k]

In [16]:
import numpy as np

vectors = model.head_items(model.embeddings_items.weight).cpu().detach()
knn = FastKNN()
knn.fit(vectors)

  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
g_test = [[] for i in range(df['userId:token'].max() + 1)]
for i in trange(df_test.shape[0]):
    usr, item = df_test['userId:token'].iloc[i], df_test['movieId:token'].iloc[i]
    g_test[usr].append(item)


  0%|          | 0/4000053 [00:00<?, ?it/s]

In [18]:
g_train = [[] for i in range(df_train['userId:token'].max() + 1)]
for i in trange(df_train.shape[0]):
    cur_user = df['userId:token'].iloc[i]
    cur_item = df['movieId:token'].iloc[i]
    cur_rating = df['rating'].iloc[i]
    g_train[cur_user].append((cur_rating, cur_item))

for i in range(len(g_train)):
    g_train[i] = sorted(g_train[i])
    g_train[i] = g_train[i][-5:]


  0%|          | 0/16000210 [00:00<?, ?it/s]

In [19]:
predictions = []
target = []
k = 40
for i in trange(df_train['userId:token'].max() + 1):
    if len(g_test[i]) == 0 or len(g_train[i]) == 0:
        continue
    out = []
    for l in g_train[i]:
        cur_item = l[1]
        neighb = knn.predict(vectors[cur_item], k)[:(k // 5)]
        for e in neighb:
            out.append(e)
    
    predictions.append(out)
    target.append(g_test[i])


  0%|          | 0/138494 [00:00<?, ?it/s]

In [20]:
for i in range(1, 20):
    print(f"precision@{i} : {precision(predictions, target, i)}")
print(f"ndcg: {ndcg(predictions, target)}")
print(f"mrr: {mrr(predictions, target)}")
for i in range(1, 20):
    print(f"mapk@{i} : {mapk(predictions, target, i)}")


precision@1 : 0.0
precision@2 : 0.05245436824384193
precision@3 : 0.03496957882922817
precision@4 : 0.026227184121920964
precision@5 : 0.020981747297536973
precision@6 : 0.017484789414614087
precision@7 : 0.01498696235538341
precision@8 : 0.013135743398901294
precision@9 : 0.011678677614350281
precision@10 : 0.010513024986709297
precision@11 : 0.009559309200457616
precision@12 : 0.008764546045247854
precision@13 : 0.008092054144685808
precision@14 : 0.007515632515632517
precision@15 : 0.0070160671037864525
precision@16 : 0.006578947368421052
precision@17 : 0.006196511033972346
precision@18 : 0.005856567625573549
precision@19 : 0.005552407733848242
ndcg: 0.0016952813022376829
mrr: 0.05245436824384193
mapk@1 : 0.0
mapk@2 : 0.026227184121920964
mapk@3 : 0.017484789414614087
mapk@4 : 0.013113592060960482
mapk@5 : 0.010490873648768486
mapk@6 : 0.008742394707307043
mapk@7 : 0.007493481177691705
mapk@8 : 0.006562333864965444
mapk@9 : 0.005834416287632738
mapk@10 : 0.005252082225766486
mapk@11