## Import Statements

In [17]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [39]:
# Load in data from csv files
movies_df = pd.read_csv("./Data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("./Data/ml-latest-small/ratings.csv")

print(f"Movie dataframe dimensions: {movies_df.shape}")
print(f"Ratings dataframe dimensions: {ratings_df.shape}")

# get number of unique users and movies
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

Movie dataframe dimensions: (9742, 3)
Ratings dataframe dimensions: (100836, 4)


## Test

In [23]:
class Model(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user and item embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.movie_factors = torch.nn.Embedding(n_items, n_factors)
        # fills weights with values from a uniform distribution [0, 0.5]
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.movie_factors.weight.data.uniform_(0, 0.05)
    
    def forward(self, data):
        # matrix multiplication between user and item factors, and then concatenates them to one column
        return (self.user_factors(data[:,0])*self.movie_factors(data[:,1])).sum(1)

In [24]:
class MovieDataset(Dataset):
    def __init__(self, ratings):
        self.ratings = ratings

        le = preprocessing.LabelEncoder()
        le.fit(self.ratings.movieId.values)
        self.lookup = dict(zip(le.transform(self.ratings.movieId.values), self.ratings.movieId.values))

        self.ratings.userId = preprocessing.LabelEncoder().fit_transform(self.ratings.userId.values)
        self.ratings.movieId = preprocessing.LabelEncoder().fit_transform(self.ratings.movieId.values)

        self.x = torch.tensor(self.ratings.drop(['rating', 'timestamp'], axis=1).values)
        self.y = torch.tensor(self.ratings['rating'].values)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, item):
        return (self.x[item], self.y[item])

Train Model

In [40]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = Model(n_users, n_items, n_factors=8)

for name, param in model.named_parameters():
    # prints the parameters who's changes will be recorded
    if param.requires_grad:
        print(name, param.data)

# enable GPU if you have a GPU
if cuda:
    model = model.cuda()

# MSE loss function
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = MovieDataset(ratings_df)
train_loader = DataLoader(train_set, 128, shuffle=True)

for it in range(num_epochs):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Is running on GPU: False
user_factors.weight tensor([[0.0487, 0.0242, 0.0120,  ..., 0.0225, 0.0057, 0.0103],
        [0.0098, 0.0423, 0.0359,  ..., 0.0166, 0.0179, 0.0354],
        [0.0164, 0.0362, 0.0447,  ..., 0.0002, 0.0369, 0.0400],
        ...,
        [0.0172, 0.0274, 0.0296,  ..., 0.0009, 0.0210, 0.0441],
        [0.0109, 0.0359, 0.0260,  ..., 0.0358, 0.0104, 0.0041],
        [0.0108, 0.0115, 0.0242,  ..., 0.0013, 0.0258, 0.0490]])
movie_factors.weight tensor([[0.0337, 0.0140, 0.0063,  ..., 0.0495, 0.0277, 0.0174],
        [0.0023, 0.0182, 0.0381,  ..., 0.0175, 0.0248, 0.0114],
        [0.0280, 0.0083, 0.0405,  ..., 0.0498, 0.0039, 0.0478],
        ...,
        [0.0043, 0.0435, 0.0383,  ..., 0.0166, 0.0294, 0.0338],
        [0.0330, 0.0232, 0.0454,  ..., 0.0351, 0.0314, 0.0309],
        [0.0171, 0.0410, 0.0056,  ..., 0.0157, 0.0431, 0.0095]])
iter #0 Loss: 11.062081646798227
iter #1 Loss: 4.745890641272975
iter #2 Loss: 2.473601130815932
iter #3 Loss: 1.7215600473626615
iter #4 

In [41]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

user_factors.weight tensor([[ 1.2726e+00,  1.8067e+00,  7.1121e-01,  ...,  6.9756e-01,
          1.4878e+00,  1.6245e+00],
        [ 1.1304e+00,  5.0090e-01,  1.2233e+00,  ...,  1.0290e+00,
          2.9691e-01,  9.3482e-01],
        [ 5.5068e-04, -5.4346e-01,  3.2296e-01,  ..., -1.0895e+00,
          1.5022e+00, -1.4924e+00],
        ...,
        [-2.4351e-01,  4.0455e-01,  2.7986e+00,  ...,  7.8791e-01,
          1.6341e+00,  1.4713e+00],
        [ 1.0527e+00,  1.4847e+00,  1.4288e+00,  ...,  2.2886e-01,
          5.8942e-01,  8.6957e-01],
        [ 9.3817e-01,  7.0492e-01,  1.4768e+00,  ...,  1.4043e+00,
          1.4756e+00, -6.3685e-02]])
movie_factors.weight tensor([[0.8128, 0.4463, 0.2855,  ..., 0.5901, 0.4254, 0.3448],
        [0.5649, 0.3949, 0.2103,  ..., 0.5250, 0.2626, 0.6367],
        [0.6031, 0.1798, 0.3586,  ..., 0.6429, 0.1892, 0.6468],
        ...,
        [0.3847, 0.4246, 0.4158,  ..., 0.3955, 0.4096, 0.4150],
        [0.4275, 0.4190, 0.4432,  ..., 0.4302, 0.4254, 0.4

In [42]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
trained_movie_embeddings = model.movie_factors.weight.data.cpu().numpy()
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.lookup[movidx]
    # print(ratings_df.loc[ratings_df['movieId']==movid].count())
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()["userId"]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Miracle on 34th Street (1994)
	 Go Fish (1994)
	 Some Like It Hot (1959)
	 Piano, The (1993)
	 What's Eating Gilbert Grape (1993)
	 Philadelphia (1993)
	 Queen Margot (Reine Margot, La) (1994)
	 Gigi (1958)
	 Real Genius (1985)
	 Shall We Dance (1937)
Cluster #1
	 Poetic Justice (1993)
	 Bread and Chocolate (Pane e cioccolata) (1973)
	 Kissed (1996)
	 Super Mario Bros. (1993)
	 Unlawful Entry (1992)
	 NeverEnding Story III, The (1994)
	 Jefferson in Paris (1995)
	 Body Shots (1999)
	 Everybody's Famous! (Iedereen beroemd!) (2000)
	 Sunset Park (1996)
Cluster #2
	 How to Make an American Quilt (1995)
	 North (1994)
	 Down to Earth (2001)
	 Three Colors: Blue (Trois couleurs: Bleu) (1993)
	 I'll Do Anything (1994)
	 Speechless (1994)
	 Mary Reilly (1996)
	 Babe, The (1992)
	 Top Hat (1935)
	 Jungle2Jungle (a.k.a. Jungle 2 Jungle) (1997)
Cluster #3
	 Singin' in the Rain (1952)
	 Dangerous Minds (1995)
	 Run Silent Run Deep (1958)
	 Say Anything... (1989)
	 Sleeping Beauty (19