In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [2]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp')

In [3]:
df

Unnamed: 0,userId,movieId,rating,timestamp
4182421,28507,1176,4.0,1995-01-09 11:46:44
18950979,131160,1079,3.0,1995-01-09 11:46:49
18950936,131160,47,5.0,1995-01-09 11:46:49
18950930,131160,21,3.0,1995-01-09 11:46:49
12341178,85252,45,3.0,1996-01-29 00:00:00
...,...,...,...,...
7819902,53930,118706,3.5,2015-03-31 06:00:51
2508834,16978,2093,3.5,2015-03-31 06:03:17
12898546,89081,55232,3.5,2015-03-31 06:11:26
12898527,89081,52458,4.0,2015-03-31 06:11:28


In [4]:
df = df.rename(columns={"userId": "userId:token", "movieId": "movieId:token", "timestamp_int": "timestamp:float"})


In [5]:
test_part = 0.2
train_len = int(df.shape[0] * (1 - test_part))
test_len = df.shape[0] - train_len 
df_train, df_test = df.drop(['timestamp'], axis=1).head(train_len), df.drop(['timestamp'], axis=1).tail(test_len)

In [6]:
df_train.head()

Unnamed: 0,userId:token,movieId:token,rating
4182421,28507,1176,4.0
18950979,131160,1079,3.0
18950936,131160,47,5.0
18950930,131160,21,3.0
12341178,85252,45,3.0


In [7]:
N_USERS = df['userId:token'].max() + 1
N_ITEMS = df['movieId:token'].max() + 1

In [8]:
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [9]:
from math import log2

def apk(pred, target, k):
    if len(pred) >= k:
        pred = pred[:k]

    ans, cnt = 0, 0
    tot = len(pred) 
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            cnt += 1
            ans += cnt / (i + 1)
            s.add(pred[i])
    return ans / tot


def mapk(pred, target, k):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += apk(cur_pred, cur_target, k)
    return sum_metric / len(pred) 


def mr(pred, target):
    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1 / (i + 1)
            s.add(pred[i])
            break
    return ans

def mrr(pred, target):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += mr(cur_pred, cur_target)
    return sum_metric / len(pred) 


def ndcgunique(pred, target):
    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1 / (log2(i + 2))
            s.add(pred[i])
    return ans / tot

def ndcg(pred, target):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += ndcgunique(cur_pred, cur_target)
    return sum_metric / len(pred) 


def precisionunique(pred, target, k):
    if len(pred) >= k:
        pred = pred[:k]

    ans, cnt = 0, 0
    tot = len(pred)
    s = set()
    for i in range(len(pred)):
        if pred[i] in target and pred[i] not in s:
            ans += 1
            s.add(pred[i])
    return ans / tot


def precision(pred, target, k):
    assert len(pred) == len(target)
    sum_metric = 0
    for cur_pred, cur_target in zip(pred, target):
        sum_metric += precisionunique(cur_pred, cur_target, k)
    return sum_metric / len(pred) 

In [10]:
class EmbeddingMethodsDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        return {
            'user' : torch.LongTensor([df['userId:token'].iloc[index]]).item(),
            'item' : torch.LongTensor([df['movieId:token'].iloc[index]]).item(),
            'rating' : torch.FloatTensor([df['rating'].iloc[index]]).item()
        }

    def __len__(self):
        return self.df.shape[0]

In [11]:
dataset = EmbeddingMethodsDataset(df)
loader = torch.utils.data.DataLoader(dataset, batch_size=64)

## SLIM

In [12]:
import torch.nn as nn 

class SLIM(nn.Module):
    def __init__(self, n_users, n_items, hidden_dim):
        super().__init__()

        self.embeddings_users = nn.Embedding(n_users, hidden_dim)
        self.embeddings_items = nn.Embedding(n_items, hidden_dim) 

    def forward(self, users, items):
        users_hidden = self.embeddings_users(users)
        items_hidden = self.embeddings_items(items)

        return torch.sum(users_hidden * items_hidden, -1), users_hidden, items_hidden

In [13]:
class SLIMLoss(nn.Module):
    def __init__(self, l, b):
        super().__init__()
        self.l = l 
        self.b = b
        self.mse = nn.MSELoss()
        self.mae = nn.L1Loss()

    def forward(self, user_embeddings, item_embeddings, pred, target):
        l2 = self.mse(user_embeddings.float(), user_embeddings.float()) + self.mse(item_embeddings.float(), item_embeddings.float())
        l1 = self.mae(user_embeddings.float(), user_embeddings.float()) + self.mae(item_embeddings.float(), item_embeddings.float())
        diff = self.mse(pred.float(), target.float())
        return diff + self.l * l1 + self.b * l2

In [14]:
model = SLIM(N_USERS, N_ITEMS, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = SLIMLoss(0.1, 0.1).to(device)

In [15]:
from tqdm.auto import trange, tqdm

epochs = 3

for _ in trange(epochs):
    for batch in tqdm(loader):
        optimizer.zero_grad()
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred, user_emb, item_emb = model(batch['user'], batch['item'])
        loss = criterion(user_emb, item_emb, pred, batch['rating']) + 0.0
        loss.backward()

        optimizer.step()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

In [16]:
torch.save(model, 'slim.pt')

In [17]:


class FastKNN:
    def __init__(self, num_projections = 10):
        self.num_projections = num_projections
        self.distributions = []
        self.projection_matrixes = []
        self.embeddings = None

    def binary_search(self, index_projection, find_value):
        left_index, right_index = 0, len(self.distributions[index_projection]) - 1
        while left_index < right_index - 1:
            middle_index = (left_index + right_index) // 2
            middle_value = self.distributions[index_projection][middle_index][0]
            if middle_value < find_value:
                left_index = middle_index
            else:
                right_index = middle_index

        return left_index

    def find_k_nearest(self, index_projection, find_value, k):
        i0 = self.binary_search(index_projection, find_value)
        left_index = max(0, i0 - k // 2)
        right_index = min(len(self.distributions[index_projection]), left_index + k )
        if right_index - left_index != k:
            left_index = right_index - k
        return [self.distributions[index_projection][i][1] for i in range(left_index, right_index)]

    def fit(self, vectors):
        for i in trange(self.num_projections):
            projection = torch.randn((vectors.shape[1], 1))
            values = (vectors @ projection).cpu().numpy().reshape(-1)
            self.distributions.append(sorted([(values[j], j) for j in range(len(values))]))
            self.projection_matrixes.append(projection.cpu())

    def predict(self, vector, k):
        k0 = 1 + (k // self.num_projections)
        values = [torch.sum(vector * proj.reshape(-1)).item() for proj in self.projection_matrixes]
        neighb = []
        for i, value in enumerate(values):
            cur_n = self.find_k_nearest(i, value, k0)
            neighb += cur_n
        neighb = list(set(neighb))
        return neighb[:k]

In [18]:
import numpy as np

vectors = model.embeddings_items.weight.cpu().detach()
knn = FastKNN()
knn.fit(vectors)

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
import random

len(knn.predict(vectors[64497], 15))

15

In [20]:
vectors.shape

torch.Size([131263, 64])

In [21]:
g_test = [[] for i in range(df['userId:token'].max() + 1)]
for i in trange(df_test.shape[0]):
    usr, item = df_test['userId:token'].iloc[i], df_test['movieId:token'].iloc[i]
    g_test[usr].append(item)


  0%|          | 0/4000053 [00:00<?, ?it/s]

In [22]:
g_train = [[] for i in range(df_train['userId:token'].max() + 1)]
for i in trange(df_train.shape[0]):
    cur_user = df['userId:token'].iloc[i]
    cur_item = df['movieId:token'].iloc[i]
    cur_rating = df['rating'].iloc[i]
    g_train[cur_user].append((cur_rating, cur_item))

for i in range(len(g_train)):
    g_train[i] = sorted(g_train[i])
    g_train[i] = g_train[i][-5:]


  0%|          | 0/16000210 [00:00<?, ?it/s]

In [23]:
predictions = []
target = []
k = 40
for i in trange(df_train['userId:token'].max() + 1):
    if len(g_test[i]) == 0 or len(g_train[i]) == 0:
        continue
    out = []
    for l in g_train[i]:
        cur_item = l[1]
        neighb = knn.predict(vectors[cur_item], k)[:(k // 5)]
        for e in neighb:
            out.append(e)
    
    predictions.append(out)
    target.append(g_test[i])


  0%|          | 0/138494 [00:00<?, ?it/s]

In [24]:
for i in range(1, 20):
    print(f"precision@{i} : {precision(predictions, target, i)}")
print(f"ndcg: {ndcg(predictions, target)}")
print(f"mrr: {mrr(predictions, target)}")
for i in range(1, 20):
    print(f"mapk@{i} : {mapk(predictions, target, i)}")


precision@1 : 0.005139110402268297
precision@2 : 0.006290979975190502
precision@3 : 0.005907023450883094
precision@4 : 0.005626439836966153
precision@5 : 0.005812511075668958
precision@6 : 0.0057593478646110335
precision@7 : 0.0053416369205842775
precision@8 : 0.0052720184299131665
precision@9 : 0.005080040167759451
precision@10 : 0.005387205387205412
precision@11 : 0.0054774217453643186
precision@12 : 0.005508299367948471
precision@13 : 0.005602584549952955
precision@14 : 0.00549353180932132
precision@15 : 0.005493531809321268
precision@16 : 0.005416002126528443
precision@17 : 0.005257685211245566
precision@18 : 0.00556983086222857
precision@19 : 0.005587966460542684
ndcg: 0.001474809113499101
mrr: 0.020730975743407275
mapk@1 : 0.005139110402268297
mapk@2 : 0.004474570264043948
mapk@3 : 0.0035934392659538843
mapk@4 : 0.0030162738496071835
mapk@5 : 0.0026894677771870704
mapk@6 : 0.002408588812097584
mapk@7 : 0.0021223694156024967
mapk@8 : 0.0019318340042024265
mapk@9 : 0.00176094151077

## ALS

In [25]:
model = SLIM(N_USERS, N_ITEMS, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = SLIMLoss(0.1, 0.0).to(device)

In [26]:
from tqdm.auto import trange, tqdm

epochs = 3

for _ in trange(epochs):
    for batch in tqdm(loader):
        optimizer.zero_grad()
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred, user_emb, item_emb = model(batch['user'], batch['item'])
        loss = criterion(user_emb, item_emb, pred, batch['rating']) + 0.0
        loss.backward()

        optimizer.step()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

In [27]:
import numpy as np

vectors = model.embeddings_items.weight.cpu().detach()
knn = FastKNN()
knn.fit(vectors)

predictions = []
target = []
k = 40
for i in trange(df_train['userId:token'].max() + 1):
    if len(g_test[i]) == 0 or len(g_train[i]) == 0:
        continue
    out = []
    for l in g_train[i]:
        cur_item = l[1]
        neighb = knn.predict(vectors[cur_item], k)[:(k // 5)]
        for e in neighb:
            out.append(e)
    
    predictions.append(out)
    target.append(g_test[i])

for i in range(1, 20):
    print(f"precision@{i} : {precision(predictions, target, i)}")
print(f"ndcg: {ndcg(predictions, target)}")
print(f"mrr: {mrr(predictions, target)}")
for i in range(1, 20):
    print(f"mapk@{i} : {mapk(predictions, target, i)}")


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/138494 [00:00<?, ?it/s]

precision@1 : 0.004075846181109339
precision@2 : 0.00354421407052986
precision@3 : 0.004548408057179982
precision@4 : 0.0052720184299131665
precision@5 : 0.005245436824384181
precision@6 : 0.0056412073955933745
precision@7 : 0.005468215994531771
precision@8 : 0.005427077795498848
precision@9 : 0.005276940949455554
precision@10 : 0.005192273613326268
precision@11 : 0.005139110402268286
precision@12 : 0.005404926457558012
precision@13 : 0.005316321105794775
precision@14 : 0.005544163438900321
precision@15 : 0.005410833481008905
precision@16 : 0.005338472443735602
precision@17 : 0.00530785147658211
precision@18 : 0.005290477878197222
precision@19 : 0.005228298683977394
ndcg: 0.0014307231625157065
mrr: 0.018885980910737354
mapk@1 : 0.004075846181109339
mapk@2 : 0.002835371256423888
mapk@3 : 0.0026581605528973942
mapk@4 : 0.002469874180400498
mapk@5 : 0.002181463760411126
mapk@6 : 0.0020443223659597914
mapk@7 : 0.0018463064828979597
mapk@8 : 0.0017041235242989641
mapk@9 : 0.0015782222734789

In [28]:
torch.save(model, 'als.pt')

## Implicit ALS

In [29]:
class ImplicitALSLoss(nn.Module):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.user_degree = df['userId:token'].value_counts()
        self.users_set = set(self.user_degree.index)

    def forward(self, user_ids, pred, target):
        weights = torch.tensor([self.user_degree[x.item()] if x.item() in self.users_set else 1 for x in user_ids]).to(device)
        difference = (pred - target) * (pred - target)
        return torch.sum(weights * difference)

In [30]:
model = SLIM(N_USERS, N_ITEMS, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = ImplicitALSLoss(df_train).to(device)

In [31]:
from tqdm.auto import trange, tqdm

epochs = 3

for _ in trange(epochs):
    for batch in tqdm(loader):
        optimizer.zero_grad()
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred, user_emb, item_emb = model(batch['user'], batch['item'])
        loss = criterion(batch['user'], pred, batch['rating']) + 0.0
        loss.backward()

        optimizer.step()  

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

In [32]:
import numpy as np

vectors = model.embeddings_items.weight.cpu().detach()
knn = FastKNN()
knn.fit(vectors)

predictions = []
target = []
k = 40
for i in trange(df_train['userId:token'].max() + 1):
    if len(g_test[i]) == 0 or len(g_train[i]) == 0:
        continue
    out = []
    for l in g_train[i]:
        cur_item = l[1]
        neighb = knn.predict(vectors[cur_item], k)[:(k // 5)]
        for e in neighb:
            out.append(e)
    
    predictions.append(out)
    target.append(g_test[i])

for i in range(1, 20):
    print(f"precision@{i} : {precision(predictions, target, i)}")
print(f"ndcg: {ndcg(predictions, target)}")
print(f"mrr: {mrr(predictions, target)}")
for i in range(1, 20):
    print(f"mapk@{i} : {mapk(predictions, target, i)}")


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/138494 [00:00<?, ?it/s]

precision@1 : 0.003012581959950381
precision@2 : 0.0034556087187666137
precision@3 : 0.0032488628979857053
precision@4 : 0.0033227006911217436
precision@5 : 0.003437887648413958
precision@6 : 0.0034260736015121985
precision@7 : 0.0036454773296878498
precision@8 : 0.0036328194222931065
precision@9 : 0.003563904148699461
precision@10 : 0.0035796562112351596
precision@11 : 0.00352810400657291
precision@12 : 0.0036032843050386896
precision@13 : 0.003626003626003617
precision@14 : 0.003594845700108866
precision@15 : 0.0037450528678598764
precision@16 : 0.0037435761119971648
precision@17 : 0.0037344549728450544
precision@18 : 0.0037263472935987656
precision@19 : 0.0037004392960071717
ndcg: 0.0010193366822890942
mrr: 0.013929539240239127
mapk@1 : 0.003012581959950381
mapk@2 : 0.002480949849370902
mapk@3 : 0.001969007816961033
mapk@4 : 0.001698269242128891
mapk@5 : 0.001528737669088545
mapk@6 : 0.001367475928879437
mapk@7 : 0.0012733854839117984
mapk@8 : 0.001169590643274855
mapk@9 : 0.0010790

In [33]:
torch.save(model, 'implicitals.pt')

## SVD++

In [34]:
import torch.nn as nn 

class SVDPlusPlus(nn.Module):
    def __init__(self, n_users, n_items, hidden_dim):
        super().__init__()

        self.b_embeddings_users = nn.Embedding(n_users, 1)
        self.b_embeddings_items = nn.Embedding(n_items, 1)

        self.embeddings_users = nn.Embedding(n_users, hidden_dim)
        self.embeddings_items = nn.Embedding(n_items, hidden_dim) 

    def forward(self, users, items):
        users_b = self.b_embeddings_users(users).squeeze(1)
        items_b = self.b_embeddings_items(items).squeeze(1)

        users_hidden = self.embeddings_users(users)
        items_hidden = self.embeddings_items(items)

        return torch.sum(users_hidden * items_hidden, -1) + users_b + items_b, users_hidden, items_hidden

In [35]:
model = SVDPlusPlus(N_USERS, N_ITEMS, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = SLIMLoss(0.1, 0.0).to(device)

In [36]:
from tqdm.auto import trange, tqdm

epochs = 3

for _ in trange(epochs):
    for batch in tqdm(loader):
        optimizer.zero_grad()
        for key in batch.keys():
            batch[key] = batch[key].to(device)
        pred, user_emb, item_emb = model(batch['user'], batch['item'])
        loss = criterion(user_emb, item_emb, pred, batch['rating']) + 0.0
        loss.backward()

        optimizer.step()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

  0%|          | 0/312505 [00:00<?, ?it/s]

In [37]:
import numpy as np

vectors = model.embeddings_items.weight.cpu().detach()
knn = FastKNN()
knn.fit(vectors)

predictions = []
target = []
k = 40
for i in trange(df_train['userId:token'].max() + 1):
    if len(g_test[i]) == 0 or len(g_train[i]) == 0:
        continue
    out = []
    for l in g_train[i]:
        cur_item = l[1]
        neighb = knn.predict(vectors[cur_item], k)[:(k // 5)]
        for e in neighb:
            out.append(e)
    
    predictions.append(out)
    target.append(g_test[i])

for i in range(1, 20):
    print(f"precision@{i} : {precision(predictions, target, i)}")
print(f"ndcg: {ndcg(predictions, target)}")
print(f"mrr: {mrr(predictions, target)}")
for i in range(1, 20):
    print(f"mapk@{i} : {mapk(predictions, target, i)}")


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/138494 [00:00<?, ?it/s]

precision@1 : 0.006379585326953748
precision@2 : 0.005936558568137515
precision@3 : 0.005788882981865431
precision@4 : 0.006645401382243487
precision@5 : 0.006592238171185537
precision@6 : 0.00747238466536711
precision@7 : 0.007392217918533727
precision@8 : 0.007066276803118908
precision@9 : 0.007147498375568568
precision@10 : 0.007159312422470361
precision@11 : 0.006959547629404103
precision@12 : 0.007339476637722264
precision@13 : 0.007361059992638983
precision@14 : 0.007506139085086453
precision@15 : 0.0074428495481127606
precision@16 : 0.007265638844586213
precision@17 : 0.007143154975972357
precision@18 : 0.007053970504262983
precision@19 : 0.006974173871680872
ndcg: 0.0018744111528039926
mrr: 0.025350579771252026
mapk@1 : 0.006379585326953748
mapk@2 : 0.004607478291688818
mapk@3 : 0.0037214247740563525
mapk@4 : 0.003378079035973773
mapk@5 : 0.0029930887825624654
mapk@6 : 0.002824049461476362
mapk@7 : 0.0025833583477693767
mapk@8 : 0.0023518128233040517
mapk@9 : 0.0022020773970091

In [38]:
torch.save(model, 'svdplusplus.pt')