In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM
import multiprocessing as mp

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import functional as F
import random

# Dataset

Использую датасет movielens-1m с первого дз. Датасет популярный, с ним был уже опыт работы, а также -- его используют в статье про NCF

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

In [3]:
ratings = ratings.loc[(ratings['rating'] >= 4)]
users = ratings["user_id"]
movies = ratings["movie_id"]
user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
user_item_csr = user_item.tocsr()

In [4]:
unique_users = np.unique(users)
unique_movies = set(np.unique(movies))

grouped_interactions = ratings.groupby('user_id')['movie_id'].apply(list)

train_dataset = {}
test_dataset = {}
negative_dataset = {}

for user_id, user_movies in grouped_interactions.iteritems():
    if len(user_movies) < 2:
        continue

    train_dataset[user_id] = user_movies[:-1]
    test_dataset[user_id] = user_movies[-1]
    negative_dataset[user_id] =  list(unique_movies - set(user_movies))
    
total_users = list(test_dataset.keys())

In [5]:
def extract_csr_data(interactions):
    coo_users = []
    for user_id in interactions:
        coo_users.append(np.full(len(interactions[user_id]), user_id))
    
    coo_users = np.hstack(coo_users)
    coo_movies = []
    for user_id in interactions:
        coo_movies.append(np.array(interactions[user_id]))
    
    coo_movies = np.hstack(coo_movies)
    
    user_item = sp.coo_matrix((np.ones_like(coo_users), (coo_users, coo_movies)))
    return user_item.tocsr()

train_data = extract_csr_data(train_dataset)

# Evaluation 

Аналогично статье из NCF будем сравнивать все модели по метрикам Hit rate(HR@K) и NDCG@K. K = 10
Помимо одного позитива, также добавим 99 случайных негативных фильмов для пользователя, тем самым будем оценивать эти метрики относительно ранжирования этих 1 + 99 фильмов.

In [6]:
def evaluate_metrics_for_user(args):
    k = 10
    model, user_id = args
    if user_id not in negative_dataset[user_id]:
        return None
    
    last_user_movie = test_dataset[user_id]
    np.random.shuffle(negative_dataset[user_id])
    random_negative_movies = negative_dataset[user_id][:99]

    input_movies = np.array([last_user_movie] + list(random_negative_movies))
    input_user = np.full(len(input_movies), user_id)
    
    pred = model.predict(input_user, input_movies)

    top_movies = input_movies[np.argsort(pred)[-k:]]

    hit_rate = 1 if last_user_movie in top_movies else 0

    ndcg = 0
    for position, movie in enumerate(top_movies):
        if movie == last_user_movie:
            ndcg = np.log(2) / np.log(position + 2)
            break
    return hit_rate, ndcg

def evaluate_model(model):
    with mp.Pool(mp.cpu_count()) as pool:
        users_len = len(total_users)
        metrics = pool.map(evaluate_metrics_for_user, zip([model] * users_len, total_users))
        hrs = [metric[0] for metric in metrics if metric is not None]
        ndcgs = [metric[1] for metric in metrics if metric is not None]

    print('Mean HR', np.mean(hrs))
    print('Mean NDCG', np.mean(ndcgs))

Аналогично 1 дз будем смотреть на симилары истории игрушек и рекоммендации для 4 пользователя

In [7]:
class Recommender:
    
    def __init__(self, model, user_emb, item_emb, bias_u=None, bias_i=None):
        self.model = model
        self.user_emb = user_emb
        self.user_bias = bias_u
        self.item_emb = item_emb
        self.item_bias = bias_i
        
    def predict(self, users, movies):
        return self.model.predict(users, movies)
    
    def similars(self, toy_movie_id=1, top=10):
        input_vector = self.item_emb[toy_movie_id]

        data = []
        for item_idx, column in enumerate(self.item_emb):
            dst = np.linalg.norm(column - input_vector)
            data.append((item_idx, dst))

        sorted_by_dst = list(sorted(data, key=lambda val: val[1]))

        similars = []
        for item in sorted_by_dst:
            search = movie_info[movie_info["movie_id"] == item[0]]
            movie_name = search["name"].to_string()
            if len(search) > 0:
                similars.append((item[0], movie_name))

        return similars[:top]

    def recommend(self, user_id=4, top=10):
        new_movie_ids = negative_dataset[user_id]

        data = []
        for movie_id in new_movie_ids:
            bias_w = self.user_bias[user_id] if self.user_bias is not None else 0
            bias_h = self.item_bias[movie_id] if self.item_bias is not None else 0

            dot = np.dot(self.user_emb[user_id], self.item_emb[movie_id])
            data.append((movie_id, dot + bias_w + bias_h))

        data = list(sorted(data, key=lambda val: val[1], reverse=True))
        recommendations = [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() for x in data]
        return recommendations[:top]

# Baseline MF model: LightFM warp

In [15]:
baseline = LightFM(
    no_components=64, 
    learning_rate=0.01,
    loss='warp',
    max_sampled=200
)

baseline.fit(train_data, epochs=40, num_threads=mp.cpu_count())

<lightfm.lightfm.LightFM at 0x7fe50e238290>

In [16]:
baseline_recommender = Recommender(
    baseline, 
    baseline.user_embeddings, 
    baseline.item_embeddings,
    baseline.user_biases,
    baseline.item_biases
)

In [17]:
evaluate_model(baseline_recommender)

Mean HR 0.6832171893147503
Mean NDCG 0.2570064577984302


In [18]:
baseline_recommender.similars()

[(1, '0    Toy Story (1995)'),
 (588, '584    Aladdin (1992)'),
 (3114, '3045    Toy Story 2 (1999)'),
 (2355, "2286    Bug's Life, A (1998)"),
 (1197, '1179    Princess Bride, The (1987)'),
 (1265, '1245    Groundhog Day (1993)'),
 (364, '360    Lion King, The (1994)'),
 (595, '591    Beauty and the Beast (1991)'),
 (1073, '1058    Willy Wonka and the Chocolate Factory (1971)'),
 (2321, '2252    Pleasantville (1998)')]

In [19]:
baseline_recommender.recommend()

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '847    Godfather, The (1972)',
 '585    Terminator 2: Judgment Day (1991)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '1182    Aliens (1986)',
 '1203    Godfather: Part II, The (1974)',
 '108    Braveheart (1995)',
 '2502    Matrix, The (1999)',
 '453    Fugitive, The (1993)',
 '537    Blade Runner (1982)']

# NCF

In [8]:
class NCF(nn.Module):
    def __init__(self, total_users, total_items, latent_size=64):
        super().__init__()
        
        # MLP
        self.user_embedding = nn.Embedding(total_users, latent_size)
        self.item_embedding = nn.Embedding(total_items, latent_size)
        
        self.mlp_net = nn.Sequential(
            nn.Linear(2 * latent_size, latent_size), 
            nn.ReLU(), 
            nn.Linear(latent_size, latent_size),
            nn.ReLU(), 
            nn.Linear(latent_size, latent_size),
        )
        
        # GMF
        self.user_gmf_embedding = nn.Embedding(total_users, latent_size)
        self.item_gmf_embedding = nn.Embedding(total_items, latent_size)
        
        self.rating_layer = nn.Sequential(
            nn.Linear(2 * latent_size, 1),
            nn.Sigmoid()
        )

    def forward(self, input_user, input_item):
        mlp_user_emb = self.user_embedding(input_user)
        mlp_item_emb = self.item_embedding(input_item)
        mlp_output   = self.mlp_net(torch.cat([mlp_user_emb, mlp_item_emb], dim=-1))
        
        gmf_user_emb = self.user_gmf_embedding(input_user)
        gmf_item_emb = self.item_gmf_embedding(input_item)
        gmf_output   = torch.mul(gmf_user_emb, gmf_item_emb)
        
        ncf_output   = torch.cat([mlp_output, gmf_output], dim=-1)
        return self.rating_layer(ncf_output)
    
    def predict(self, users, movies):
        return self(users, movies)

In [9]:
class NCF_Dataset(Dataset):
    def __init__(self, train_dataset, negative_dataset):        
        coo_users = []
        for user_id in train_dataset:
            coo_users.append(np.full(len(train_dataset[user_id]), user_id))

        coo_users = np.hstack(coo_users)
        coo_movies = []
        for user_id in train_dataset:
            coo_movies.append(np.array(train_dataset[user_id]))

        coo_movies = np.hstack(coo_movies)
        self.coo_users = coo_users
        self.coo_movies = coo_movies
        
        self.negative_data = negative_dataset
    
    def __getitem__(self, index):
        user_id = self.coo_users[index]
        item_id = self.coo_movies[index]
        negative_item_id = random.choice(self.negative_data[user_id])

        return user_id, item_id, negative_item_id
    
    def __len__(self):
        return len(self.coo_users)
    
dataset = NCF_Dataset(train_dataset, negative_dataset)

In [76]:
NCF_model = NCF(max(total_users) + 1, max(movies) + 1)
device = torch.device("cuda")
NCF_model.to(device)

NCF(
  (user_embedding): Embedding(6041, 64)
  (item_embedding): Embedding(3953, 64)
  (mlp_net): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=64, bias=True)
  )
  (user_gmf_embedding): Embedding(6041, 64)
  (item_gmf_embedding): Embedding(3953, 64)
  (rating_layer): Sequential(
    (0): Linear(in_features=128, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [77]:
optimizer = Adam(NCF_model.parameters(), lr=1e-3)

for epoch in range(100):
    total_loss = 0.0
    total_batches = 0
    trainloader = DataLoader(dataset, batch_size=2048, shuffle=True, num_workers=mp.cpu_count())
    for (batch_users, batch_movies, batch_negative_movies) in trainloader:
        batch_users = batch_users.to(device)
        batch_movies = batch_movies.to(device)
        batch_negative_movies = batch_negative_movies.to(device)

        ncf_positive_output = NCF_model(batch_users, batch_movies)
        positive_loss = F.binary_cross_entropy(ncf_positive_output, torch.ones_like(ncf_positive_output).float())
        
        ncf_negative_output = NCF_model(batch_users, batch_negative_movies)
        negative_loss = F.binary_cross_entropy(ncf_negative_output, torch.zeros_like(ncf_negative_output).float())
        
        loss = (positive_loss + negative_loss) / 2
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_batches += 1
    
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f'epoch {epoch + 1}: loss {total_loss / total_batches}')

epoch 1: loss 0.5533991315596395
epoch 10: loss 0.4348882124792758
epoch 20: loss 0.3962825050028108
epoch 30: loss 0.35017433976955553
epoch 40: loss 0.31408113745047894
epoch 50: loss 0.2816089452813855
epoch 60: loss 0.25491754898278834
epoch 70: loss 0.23293632654834995
epoch 80: loss 0.2143845393610515
epoch 90: loss 0.20199512818948828
epoch 100: loss 0.19071783350526


In [23]:
ncf_item_emb = NCF_model.item_embedding.weight.detach().cpu().numpy()
ncf_user_emb = NCF_model.user_embedding.weight.detach().cpu().numpy()

In [24]:
ncf_recommender = Recommender(
    NCF_model, 
    ncf_item_emb, 
    ncf_user_emb,
)

In [17]:
def evaluate_torch_model(model, k=10):
    hrs = []
    ndcgs = []
    for user_id in negative_dataset.keys():
        last_user_movie = test_dataset[user_id]
        np.random.shuffle(negative_dataset[user_id])
        random_negative_movies = negative_dataset[user_id][:99]

        input_movies = torch.LongTensor([last_user_movie] + list(random_negative_movies))
        input_user = torch.LongTensor(np.full(len(input_movies), user_id))
        
        pred = model(input_user, input_movies).view(-1).detach().numpy()
        top_movies = input_movies[np.argsort(pred)[-k:]]
        hit_rate = 1 if last_user_movie in top_movies else 0

        ndcg = 0
        for position, movie in enumerate(top_movies):
            if movie == last_user_movie:
                ndcg = np.log(2) / np.log(position + 2)
                break
        
        hrs.append(hit_rate)
        ndcgs.append(ndcg)
        
    print('Mean HR', np.mean(hrs))
    print('Mean NDCG', np.mean(ndcgs))

In [45]:
evaluate_torch_model(NCF_model)

Mean HR 0.67318204406162
Mean NDCG 0.24632286384843266


In [72]:
ncf_recommender.similars()

[(1, '0    Toy Story (1995)'),
 (2277, '2208    Somewhere in the City (1997)'),
 (967, '955    Outlaw, The (1943)'),
 (213, '211    Burnt By the Sun (Utomlyonnye solntsem) (1994)'),
 (3565, '3496    Where the Heart Is (2000)'),
 (1114, '1098    Funeral, The (1996)'),
 (2289, '2220    Player, The (1992)'),
 (2170, '2101    Wrongfully Accused (1998)'),
 (3334, '3265    Key Largo (1948)'),
 (3621, '3552    Possession (1981)')]

In [11]:
def torch_recommend(torch_model, user_id=4, top=10):
    new_movie_ids = torch.LongTensor(negative_dataset[user_id])
    users = torch.LongTensor(np.full(len(new_movie_ids), user_id))
    data = torch_model.predict(users, new_movie_ids).view(-1).detach().numpy()
    data = list(zip(new_movie_ids.view(-1).detach().numpy(), data))
    data = list(sorted(data, key=lambda val: val[1], reverse=True))
    recommendations = [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() for x in data]
    return recommendations[:top]

In [59]:
torch_recommend(NCF_model)

['1203    Godfather: Part II, The (1974)',
 '847    Godfather, The (1972)',
 '1250    Back to the Future (1985)',
 '1539    Men in Black (1997)',
 '1575    L.A. Confidential (1997)',
 '585    Terminator 2: Judgment Day (1991)',
 '1284    Butch Cassidy and the Sundance Kid (1969)',
 '900    Casablanca (1942)',
 '2693    Sixth Sense, The (1999)',
 '957    African Queen, The (1951)']

Кажется, что рекоммендации ок, т.к. сделаны через предикт, а не через веса эмбеддингов, 
но с симиларами плохо скорее из-за того, что нужно несколько по-другому считать похожесть, хотя косинусное расстояние работает также плохо.

Формально метрики вышли примерно такими же(хуже на 1-2%) как у WARP модели

# Self Attention

In [12]:
class SA_Dataset(Dataset):
    def __init__(self, train_dataset, negative_dataset, fixed_len=10):
        self.train_dataset = train_dataset
        self.negative_dataset = negative_dataset
        self.fixed_len = fixed_len
        
    def __getitem__(self, index):
        user_id_index = index % len(train_dataset.keys())
        user_id = list(train_dataset.keys())[user_id_index]
        total_history = train_dataset[user_id]
        sampled_history = torch.LongTensor(random.choices(total_history, k=self.fixed_len))
        rest_targets = list(set(total_history) - set(sampled_history))
        target_movie = random.choice(rest_targets)
        negaitive_movie = random.choice(self.negative_dataset[user_id])
        
        return sampled_history, user_id, target_movie, negaitive_movie
    
    def __len__(self):
        return len(train_dataset) * 5
    
sa_dataset = SA_Dataset(train_dataset, negative_dataset)

In [13]:
class SelfAttention(nn.Module):
    def __init__(self, total_users, total_items, atten_heads=5, latent_size=64):
        super().__init__()
        self.user_embedding = nn.Embedding(total_users, latent_size)
        self.item_embedding = nn.Embedding(total_items, latent_size)
        self.heads = atten_heads

        self.attention = nn.MultiheadAttention(
            latent_size * self.heads, 
            self.heads, 
            kdim=latent_size, 
            vdim=latent_size
        )
        
        self.linear = nn.Sequential(
            nn.Linear(2 * latent_size + 2 * self.heads, latent_size),
            nn.ReLU(),
            nn.Linear(latent_size, latent_size),
            nn.ReLU(),
            nn.Linear(latent_size, 1),
            nn.Sigmoid()
        )
        
    def forward(self, user_history, user, item):
        history_embedding = self.item_embedding(user_history)
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        
        _, attention = self.attention(
            item_embedding.repeat(1, 1, self.heads),            
            history_embedding, 
            history_embedding
        )
        
        attention = torch.squeeze(attention)
        linear_output = self.linear(torch.cat([user_embedding, item_embedding, attention], dim=-1))
        return linear_output

In [20]:
sa_model = SelfAttention(max(total_users) + 1, max(movies) + 1)
device = torch.device("cuda")
sa_model.to(device)
optimizer = Adam(sa_model.parameters(), lr=1e-4)

for epoch in range(500):
    total_loss = 0.0
    total_batches = 0
    trainloader = DataLoader(sa_dataset, batch_size=2048, shuffle=True, num_workers=mp.cpu_count())
    for (batch_history, batch_users, batch_movies, batch_negative_movies) in trainloader:
        batch_history = batch_history.to(device)
        batch_users = batch_users.to(device)
        batch_movies = batch_movies.to(device)
        batch_negative_movies = batch_negative_movies.to(device)
        sa_positive_output = sa_model(batch_history, batch_users, batch_movies)
        positive_loss = F.binary_cross_entropy(sa_positive_output, torch.ones_like(sa_positive_output).float())
        
        sa_negative_output = sa_model(batch_history, batch_users, batch_negative_movies)
        negative_loss = F.binary_cross_entropy(sa_negative_output, torch.zeros_like(sa_negative_output).float())
        
        loss = (positive_loss + negative_loss) / 2
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_batches += 1
    
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f'epoch {epoch + 1}: loss {total_loss / total_batches}')

epoch 1: loss 0.6935726245244344
epoch 10: loss 0.68640216588974
epoch 20: loss 0.6628908356030782
epoch 30: loss 0.6310411810874939
epoch 40: loss 0.5986151019732158
epoch 50: loss 0.5667280276616414
epoch 60: loss 0.5422204573949178
epoch 70: loss 0.5185796181360881
epoch 80: loss 0.49953468640645343
epoch 90: loss 0.48903797268867494
epoch 100: loss 0.4778030614058177
epoch 110: loss 0.46613449454307554
epoch 120: loss 0.46434036691983543
epoch 130: loss 0.45592422485351564
epoch 140: loss 0.452245432138443
epoch 150: loss 0.451547102133433
epoch 160: loss 0.4476594130198161
epoch 170: loss 0.4446143269538879
epoch 180: loss 0.4465136110782623
epoch 190: loss 0.44463562965393066
epoch 200: loss 0.4385597387949626
epoch 210: loss 0.435043728351593
epoch 220: loss 0.4365427533785502
epoch 230: loss 0.4395876208941142
epoch 240: loss 0.43452355861663816
epoch 250: loss 0.4353606879711151
epoch 260: loss 0.4355134646097819
epoch 270: loss 0.43369463880856834
epoch 280: loss 0.4333026746

In [24]:
sa_recommender = Recommender(
    sa_model.cpu(), 
    sa_model.user_embedding.weight.detach().cpu().numpy(),
    sa_model.item_embedding.weight.detach().cpu().numpy()
)

In [51]:
k=10
hrs = []
ndcgs = []
for user_id in negative_dataset.keys():
    last_user_movie = test_dataset[user_id]
    
    np.random.shuffle(negative_dataset[user_id])
    random_negative_movies = negative_dataset[user_id][:99]
    
    
    input_movies = torch.LongTensor([last_user_movie] + list(random_negative_movies))
    input_user = torch.LongTensor(np.full(len(input_movies), user_id))

    base_history = train_dataset[user_id][-10:]
    if len(base_history) < 10:
        base_history = random.choices(base_history, k=10)
    user_history = torch.LongTensor(base_history).repeat(len(input_movies), 1)

    pred = sa_model(user_history, input_user, input_movies).view(-1).detach().numpy()
    top_movies = input_movies[np.argsort(pred)[-k:]]
    hit_rate = 1 if last_user_movie in top_movies else 0

    ndcg = 0
    for position, movie in enumerate(top_movies):
        if movie == last_user_movie:
            ndcg = np.log(2) / np.log(position + 2)
            break

    hrs.append(hit_rate)
    ndcgs.append(ndcg)

print('Mean HR', np.mean(hrs))
print('Mean NDCG', np.mean(ndcgs))

Mean HR 0.5234387941030313
Mean NDCG 0.20840324458599013


In [52]:
sa_recommender.similars()

[(1, '0    Toy Story (1995)'),
 (3015, '2946    Coma (1978)'),
 (2235, "2166    One Man's Hero (1999)"),
 (1134, '1118    Johnny 100 Pesos (1993)'),
 (707, '698    Mulholland Falls (1996)'),
 (2316, '2247    Practical Magic (1998)'),
 (543, '539    So I Married an Axe Murderer (1993)'),
 (2792, '2723    Airplane II: The Sequel (1982)'),
 (1727, '1678    Horse Whisperer, The (1998)'),
 (114, "112    Margaret's Museum (1995)")]

In [62]:
def sa_recommend(sa_model, user_id=4, top=10):
    new_movie_ids = torch.LongTensor(negative_dataset[user_id])
    users = torch.LongTensor(np.full(len(new_movie_ids), user_id))
    user_history = torch.LongTensor(random.choices(train_dataset[user_id], k=10)).repeat(len(new_movie_ids), 1)
    data = sa_model(user_history, users, new_movie_ids).view(-1).detach().numpy()
    data = list(zip(new_movie_ids.view(-1).detach().numpy(), data))
    data = list(sorted(data, key=lambda val: val[1], reverse=True))
    recommendations = [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() for x in data]
    return recommendations[:top]

sa_recommend(sa_model)

['2789    American Beauty (1999)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '589    Silence of the Lambs, The (1991)',
 '2693    Sixth Sense, The (1999)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1250    Back to the Future (1985)',
 '585    Terminator 2: Judgment Day (1991)',
 '2327    Shakespeare in Love (1998)',
 "523    Schindler's List (1993)",
 '847    Godfather, The (1972)']

Здесь аналогично с симиларами и рекомендациями но с метриками уже похуже.