In [66]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
movies = pd.read_csv('../data/interim/u.item', index_col=0)

In [85]:
movies.head()

Unnamed: 0,movieId,movieTitle,releaseDate,URL,genre1,genre2,genre3,genre4,genre5,genre6,...,genre10,genre11,genre12,genre13,genre14,genre15,genre16,genre17,genre18,genre19
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0.0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0


In [69]:
titles = movies['movieTitle'].values

vect = CountVectorizer(ngram_range=(1, 3), analyzer='char')
title_vectors = vect.fit_transform(titles).toarray()

In [70]:
genres = np.array([x for x in movies[[f'genre{i}' for i in range(1, 20)]].to_numpy()])

In [71]:
X = np.hstack((title_vectors, genres))

In [72]:
split_idx = int(0.8 * len(movies))
train_data = torch.from_numpy(X[:split_idx]).type('torch.FloatTensor')
test_data = torch.from_numpy(X[split_idx:]).type('torch.FloatTensor')

In [73]:
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, int(hidden_size / 2)),
        )
        self.decoder = nn.Sequential(
            nn.Linear(int(hidden_size / 2), hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encode(x)
        decoded = self.decoder(encoded)
        return decoded
    
    def encode(self, x):
        return self.encoder(x)

In [74]:
model = Autoencoder(X.shape[1], 64)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
bar = tqdm(range(num_epochs))
for epoch in bar:
    output = model(train_data)
    loss = criterion(train_data, output)
    loss.backward()
    optimizer.step()
    bar.set_postfix_str(f'Loss: {loss.item()}')

100%|██████████| 100/100 [00:11<00:00,  8.87it/s, Loss: 0.013170290738344193]


In [75]:
test_output = model(test_data)
test_loss = criterion(test_output, test_data)

print('Test loss:', test_loss.item())

Test loss: 0.013187039643526077


In [78]:
movie_embeddings = {}

for i, movie in movies.iterrows():
    title_vector = title_vectors[i]
    genre = genres[i]
    movie_embeddings[movie.movieTitle] = model.encode(torch.from_numpy(np.hstack((title_vector, genre))).float()).detach().numpy()

In [84]:
movie_embeddings = pd.DataFrame(data=movie_embeddings)
movie_embeddings.to_csv('../data/interim/movie_embeddings.csv')