# Introduction

In this notebook, I build an algorithm that embeds movies using an autoencoder. For each user, I calculate the mean vector of their favourite movies and recommend the movies whose embeddings are most similar to that vector.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn.model_selection import train_test_split
from numpy import dot
from numpy.linalg import norm

In [2]:
def set_seed(seed):
    """Seed the global RNGs of `random`, NumPy and PyTorch with the same value."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

set_seed(42)

In [3]:
# Load the movie metadata table; the first CSV column becomes the DataFrame index.
movies = pd.read_csv('../data/interim/u.item', index_col=0)

In [4]:
# Preview the first rows of the movie table.
movies.head()

Unnamed: 0,movieId,movieTitle,releaseDate,URL,genre2,genre3,genre4,genre5,genre6,genre7,...,genre10,genre11,genre12,genre13,genre14,genre15,genre16,genre17,genre18,genre19
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0.0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0.0


In [5]:
# Character-level TF-IDF over 1- to 3-grams of every movie title,
# materialised as a dense (n_movies, n_ngrams) array.
titles = movies['movieTitle'].values

vect = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
title_vectors = vect.fit_transform(titles).toarray()

In [6]:
# Genre columns genre2..genre19 as a 2-D array with one row per movie.
genres = movies[[f'genre{i}' for i in range(2, 20)]].to_numpy()

In [7]:
# Per-movie feature matrix: title TF-IDF columns followed by the genre columns.
X = np.hstack((title_vectors, genres))

In [8]:
# The first 80% of rows (in file order) are used for training, the rest are held out.
split_idx = int(0.8 * len(movies))
train_data, test_data = (
    torch.from_numpy(part).float()
    for part in (X[:split_idx], X[split_idx:])
)

In [9]:
class DenseAutoencoder(nn.Module):
    """Fully connected autoencoder with one 256-unit hidden layer on each side.

    Args:
        input_size: Number of features in each input row.
        emb_size: Width of the bottleneck (embedding) layer.
    """

    def __init__(self, input_size, emb_size=32):
        super().__init__()
        hidden_size = 256

        # Layers are created in the same order and at the same Sequential
        # positions as before, so saved state_dict keys remain valid.
        encoder_layers = [
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, emb_size),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(emb_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.ReLU(),  # reconstructions are clamped to be non-negative
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` and return its reconstruction from the embedding."""
        return self.decoder(self.encoder(x))

    def encode(self, x):
        """Return only the bottleneck embedding of `x`."""
        return self.encoder(x)

In [10]:
# Autoencoder sized to the feature matrix, trained with MSE reconstruction loss and Adam.
model = DenseAutoencoder(input_size=X.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [11]:
best_val_loss = float('inf')
num_epochs = 1000

bar = tqdm(range(num_epochs))
for epoch in bar:
    # Training step on the full training matrix: reconstruction loss plus a
    # penalty on the L2 norm of every encoder parameter whose name contains 'weight'.
    model.train()
    output = model(train_data)
    reg = sum(
        torch.norm(param, 2)
        for name, param in model.encoder.named_parameters()
        if 'weight' in name
    )
    loss = criterion(output, train_data) + 0.001 * reg

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Score the held-out rows in eval mode without tracking gradients.
    model.eval()
    with torch.no_grad():
        test_output = model(test_data)
        val_loss = criterion(test_output, test_data).item()

    # Checkpoint the weights whenever the held-out loss improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), '../models/enc_model.pth')

    bar.set_postfix_str(f'Epoch {epoch + 1}, Train loss: {loss.item()}, Best validation loss: {best_val_loss}')

100%|██████████| 1000/1000 [03:25<00:00,  4.86it/s, Epoch 1000, Train loss: 0.0005312236025929451, Best validation loss: 0.00021392774942796677]


In [12]:
# Rebuild the architecture and restore the weights from the checkpoint file
# that the training loop saves on each held-out-loss improvement.
best_model = DenseAutoencoder(X.shape[1])
best_model.load_state_dict(torch.load('../models/enc_model.pth'))

<All keys matched successfully>

In [13]:
# Encode every movie in one forward pass and map title -> embedding vector.
#
# Rows of `title_vectors` / `genres` line up with the rows of `movies` by
# position, so the titles are paired with the embeddings positionally rather
# than through the DataFrame's index labels. The previous per-row loop indexed
# the arrays with the index label and silently skipped IndexError, which could
# drop or misalign movies without any signal.
best_model.eval()
with torch.no_grad():
    features = torch.from_numpy(np.hstack((title_vectors, genres))).float()
    embedding_matrix = best_model.encode(features).numpy()

if len(embedding_matrix) != len(movies):
    raise ValueError(
        f'Feature rows ({len(embedding_matrix)}) do not match movies ({len(movies)})'
    )

# If a title occurs more than once, the last occurrence wins (same as before).
movie_embeddings = dict(zip(movies['movieTitle'], embedding_matrix))

In [14]:
# Turn the title -> vector mapping into a DataFrame (one column per title)
# and persist it to the interim data folder.
movie_embeddings = pd.DataFrame(data=movie_embeddings)
movie_embeddings.to_csv('../data/interim/movie_embeddings.csv')

# Test embedding


In [15]:
# Load the ratings table; the first CSV column becomes the DataFrame index.
ratings = pd.read_csv('../data/interim/u.data', index_col=0)

In [16]:
# Hold out 20% of the ratings, stratified by user so each user's ratings are
# split in the same proportion. The explicit random_state makes the split
# identical on every run of this cell, independent of how much of the global
# NumPy RNG state earlier cells have consumed.
train, test = train_test_split(ratings, test_size=0.2, stratify=ratings['user'], random_state=42)

In [17]:
# Reload the saved embeddings from disk; the first CSV column becomes the index.
movie_embeddings = pd.read_csv('../data/interim/movie_embeddings.csv', index_col=0)

In [18]:
def get_movie_embedding(movie_id):
    """Return the stored embedding of the movie with the given `movieId`.

    Args:
        movie_id: Value to look up in the `movieId` column of `movies`.

    Returns:
        A 2-D array with one column per matching title, i.e. shape
        (emb_dim, 1) for a unique id.

    Raises:
        KeyError: If `movie_id` is not present in `movies`, or if the movie's
            title has no column in `movie_embeddings`.
    """
    movie_title = movies.loc[movies['movieId'] == movie_id, 'movieTitle']
    if movie_title.empty:
        # Previously this fell through and returned an empty (emb_dim, 0)
        # array, which broadcasts against the caller's accumulator and
        # silently empties it. Raise the error type callers already handle.
        raise KeyError(f'Unknown movieId: {movie_id}')
    return movie_embeddings[movie_title].values

In [19]:
# Maps user id -> embedding data built from that user's training ratings.
user_embeddings = {}

In [20]:
# Gather, per user, the embeddings of every movie they rated in the training split.
#
# Columns are read by name: positional `row[1]` on a label-indexed Series is
# deprecated in pandas, and the KeyError it would eventually raise would be
# swallowed by the handler below, silently leaving every user without data.
vectors_by_user = {}
for user_id, movie_id in zip(train['user'], train['item']):
    try:
        movie_vector = get_movie_embedding(movie_id)
    except KeyError:
        # The lookup found no embedding for this movie; skip the rating.
        continue
    if movie_vector.size == 0:
        # Empty lookup result (unknown movie id): nothing to add.
        continue
    vectors_by_user.setdefault(user_id, []).append(movie_vector)

# Stack each user's vectors to (n_rated, emb_dim, 1). Averaging over axis 0
# (done in the next cell) is then a true mean instead of a no-op over the
# size-1 leading axis that the former running sum produced.
for user_id, vectors in vectors_by_user.items():
    user_embeddings[user_id] = np.stack(vectors)

In [21]:
# Collapse each user's accumulated array: average over its first axis, then flatten to 1-D.
user_embeddings.update({
    user_id: np.mean(stacked, axis=0).reshape(-1)
    for user_id, stacked in user_embeddings.items()
})

In [22]:
def cosine_sim(v1, v2):
    """Cosine similarity between two 1-D vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as `v1`.

    Returns:
        dot(v1, v2) / (|v1| * |v2|), or 0.0 when either vector has zero norm
        (the similarity is undefined there; this avoids returning NaN).
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

In [23]:
def get_recs(user_id):
    """Rank all movies for a user by cosine similarity to the user's embedding.

    Args:
        user_id: Key into `user_embeddings` / value of the `user` column in `train`.

    Returns:
        Movie titles sorted from most to least similar, excluding titles the
        user already rated in `train`. An empty list if the user has no
        embedding (previously this crashed inside `dot`).
    """
    user_vector = user_embeddings.get(user_id)
    if user_vector is None:
        return []
    user_vector = np.asarray(user_vector, dtype=float).reshape(-1)

    # Titles the user has already rated in the training split; a set gives
    # O(1) membership checks in the final filter.
    seen_titles = set(
        train.loc[train['user'] == user_id]
        .merge(movies, left_on='item', right_on='movieId')
        .movieTitle
    )

    # `movie_embeddings` holds one column per title, so the underlying array
    # is (emb_dim, n_movies) and all similarities come from one matrix product.
    titles = movie_embeddings.columns.to_numpy()
    item_matrix = movie_embeddings.to_numpy(dtype=float)

    denominators = norm(item_matrix, axis=0) * norm(user_vector)
    # Zero-norm vectors have undefined similarity; score them 0.0 instead of
    # NaN, which would otherwise float to the top of a descending argsort.
    scores = np.divide(
        user_vector @ item_matrix,
        denominators,
        out=np.zeros_like(denominators),
        where=denominators > 0,
    )

    ranking = np.argsort(-scores, kind='stable')
    return [title for title in titles[ranking] if title not in seen_titles]

In [24]:
def precision_at_k(actual, predicted, k=10):
    """Fraction of the top-`k` predictions that are relevant.

    Args:
        actual: Iterable of relevant items.
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off rank.

    Returns:
        |actual ∩ top-k| / min(k, number of distinct top-k items), or 0.0 when
        there are no predictions to score (empty `predicted` or `k` <= 0),
        instead of raising ZeroDivisionError.
    """
    if k <= 0:
        return 0.0

    pred_set = set(predicted[:k])
    if not pred_set:
        return 0.0

    hits = len(set(actual) & pred_set)
    return hits / min(k, len(pred_set))

def recall_at_k(actual, predicted, k=10):
    """Fraction of the relevant items that appear in the top-`k` predictions.

    Args:
        actual: Iterable of relevant items.
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off rank.

    Returns:
        |actual ∩ top-k| / |actual|, or 0.0 when `actual` is empty (instead of
        raising ZeroDivisionError).
    """
    act_set = set(actual)
    if not act_set:
        return 0.0

    hits = len(act_set & set(predicted[:k]))
    return hits / len(act_set)


def evaluate(test, k=5):
    """Average precision@k and recall@k over users in a held-out ratings frame.

    Args:
        test: DataFrame with at least `user` and `item` columns.
        k: Cut-off rank for both metrics. Users with fewer than `k` held-out
            titles are skipped.

    Returns:
        A Series with the mean `P@k` and `R@k` over the scored users.
    """
    metrics = []
    items_by_user = test.groupby(by='user')['item'].apply(np.array)
    for user_id, user_movies in items_by_user.items():
        # Boolean-mask lookup instead of building a query string from
        # list(np.array): under NumPy >= 2 the elements render as
        # `np.int64(..)`, which DataFrame.query cannot parse.
        actual = movies.loc[movies['movieId'].isin(user_movies), 'movieTitle'].values
        if actual.shape[0] < k:
            continue

        # Only rank movies for users that are actually scored.
        predicted = get_recs(user_id)
        metrics.append([
            precision_at_k(actual, predicted, k=k),
            recall_at_k(actual, predicted, k=k),
        ])

    metrics = pd.DataFrame(metrics, columns=[f'P@{k}', f'R@{k}'])

    return metrics.mean()

In [25]:
# Mean precision@5 / recall@5 of the embedding-based recommender on the held-out ratings.
evaluate(test)

P@5    0.013441
R@5    0.002508
dtype: float64

The result is worse than the baseline solution, so this approach does not work.