In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
import random

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed(42)

In [3]:
movies = pd.read_csv('../data/interim/u.item', index_col=0)

In [4]:
movies.head()

Unnamed: 0,movieId,movieTitle,releaseDate,URL,genre1,genre2,genre3,genre4,genre5,genre6,...,genre10,genre11,genre12,genre13,genre14,genre15,genre16,genre17,genre18,genre19
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0.0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0


In [5]:
titles = movies['movieTitle'].values

vect = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
title_vectors = vect.fit_transform(titles).toarray()

In [6]:
genres = np.array([x for x in movies[[f'genre{i}' for i in range(1, 20)]].to_numpy()])

In [7]:
X = np.hstack((title_vectors, genres))

In [8]:
split_idx = int(0.8 * len(movies))
train_data = torch.from_numpy(X[:split_idx]).float()
test_data = torch.from_numpy(X[split_idx:]).float()

In [9]:
class DenseAutoencoder(nn.Module):
    def __init__(self, input_size, emb_size=32):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, emb_size)
        )

        self.decoder = nn.Sequential(
            nn.Linear(emb_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        embeddings = self.encoder(x)
        reconstructions = self.decoder(embeddings)
        return reconstructions

    def encode(self, x):
        return self.encoder(x)

In [10]:
model = DenseAutoencoder(X.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [11]:
best_val_loss = float('inf')

num_epochs = 700
bar = tqdm(range(num_epochs))
for epoch in bar:
    model.train()
    output = model(train_data)
    loss = criterion(output, train_data)

    reg = 0
    for name, param in model.encoder.named_parameters():
        if 'weight' in name:
            reg += torch.norm(param, 2)
    loss += 0.001 * reg
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_output = model(test_data)
        val_loss = criterion(test_output, test_data).item()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), '../models/enc_model.pth')

    bar.set_postfix_str(f'Epoch {epoch + 1}, Train loss: {loss.item()}, Best validation loss: {best_val_loss}')

100%|██████████| 700/700 [02:40<00:00,  4.36it/s, Epoch 700, Train loss: 0.005276366136968136, Best validation loss: 0.00024008699983824044] 


In [12]:
best_model = DenseAutoencoder(X.shape[1])
best_model.load_state_dict(torch.load('../models/enc_model.pth'))

Test loss: 0.0004304506292100996


In [13]:
movie_embeddings = {}

with torch.no_grad():
    best_model.eval()
    for i, movie in movies.iterrows():
        title_vector = title_vectors[i].reshape(1, -1)
        genre = genres[i].reshape(1, -1)
        movie_embeddings[movie.movieTitle] = best_model.encode(
            torch.from_numpy(np.hstack((title_vector, genre))).float()).detach().numpy().reshape(-1)

In [14]:
movie_embeddings = pd.DataFrame(data=movie_embeddings)
movie_embeddings.to_csv('../data/interim/movie_embeddings.csv')

# Test embedding

I'm creating an embedding for one movie and trying to find the most similar movie using embeddings and cosine similarity.

In [21]:
from numpy import dot
from numpy.linalg import norm


def cosine_sim(v1, v2):
    return dot(v1, v2) / (norm(v1) * norm(v2))

In [22]:
target = movies.iloc[900].movieTitle
target_emb = movie_embeddings[target]

In [23]:
target

'Big Lebowski, The (1998)'

In [26]:
movie_list = []
rating_list = []

for k, v in movie_embeddings.items():
    movie_list.append(k)
    rating_list.append(cosine_sim(target_emb, v))

In [27]:
index_sorted = np.argsort(rating_list)
rating_list = np.array(rating_list)[index_sorted]
movie_list = np.array(movie_list)[index_sorted]

In [31]:
rating_list[-5:], movie_list[-5:]

(array([0.9993614 , 0.9993798 , 0.99940467, 0.9997516 , 1.0000001 ],
       dtype=float32),
 array(['North by Northwest (1959)', 'Head Above Water (1996)',
        'Another Stakeout (1993)', 'Arsenic and Old Lace (1944)',
        'Big Lebowski, The (1998)'], dtype='<U81'))

In [32]:
movies.query('movieTitle in ("Big Lebowski, The (1998)", "Arsenic and Old Lace (1944)", "Another Stakeout (1993)")')

Unnamed: 0,movieId,movieTitle,releaseDate,URL,genre1,genre2,genre3,genre4,genre5,genre6,...,genre10,genre11,genre12,genre13,genre14,genre15,genre16,genre17,genre18,genre19
569,571,Another Stakeout (1993),01-Jan-1993,http://us.imdb.com/M/title-exact?Another%20Sta...,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0.0
657,659,Arsenic and Old Lace (1944),01-Jan-1944,http://us.imdb.com/M/title-exact?Arsenic%20and...,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0.0
900,902,"Big Lebowski, The (1998)",26-Dec-1997,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0.0
