In [23]:
# import the dataset
import pandas as pd
# Load the movie catalogue and the user ratings from the working directory
movies_df, ratings_df = (
    pd.read_csv(csv_path) for csv_path in ('movies.csv', 'ratings.csv')
)

In [24]:
# Report (rows, columns) for each frame; the second line is preceded by a blank line
movies_shape, ratings_shape = movies_df.shape, ratings_df.shape
print('The dimensions of movies dataframe are:', movies_shape)
print('\nThe dimensions of ratings dataframe are:', ratings_shape)

The dimensions of movies dataframe are: (62423, 3)

The dimensions of ratings dataframe are: (2721202, 4)


In [25]:
# Lookup table from movie ID to movie title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Dimensions of the (user x movie) rating matrix
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size
total_cells = n_users * n_items

# Report the counts
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", total_cells, 'elements.')

# Share of the rating matrix that actually holds a rating, as a percentage
n_ratings = len(ratings_df)
print("Number of ratings:", n_ratings)
sparsity_percentage = n_ratings / total_cells * 100
print("Therefore: ", sparsity_percentage, '% of the matrix is filled.')

# Commentary on the result, ending with the case for matrix factorization
for remark in (
    "We have an incredibly sparse matrix to work with here.",
    "As the number of users and products grow, the matrix elements will increase exponentially.",
    "Storing a full matrix in memory at a global scale would be a challenge.",
    "One advantage here is that matrix factorization can implicitly represent the rating matrix, mitigating the need for all data.",
):
    print(remark)


Number of unique users: 18075
Number of unique movies: 30663
The full rating matrix will have: 554233725 elements.
Number of ratings: 2721202
Therefore:  0.49098455710178945 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
As the number of users and products grow, the matrix elements will increase exponentially.
Storing a full matrix in memory at a global scale would be a challenge.
One advantage here is that matrix factorization can implicitly represent the rating matrix, mitigating the need for all data.


In [26]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model: a rating is the dot product of two embeddings.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each latent-factor vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # One learnable latent-factor vector per user and per item
        self.user_factors = torch.nn.Embedding(n_users, n_factors)  # User embedding lookup table
        self.item_factors = torch.nn.Embedding(n_items, n_factors)  # Item embedding lookup table
        # Start from small positive values drawn uniformly from [0, 0.05)
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: Integer tensor whose column 0 holds user indices and whose
                column 1 holds item indices.

        Returns:
            A 1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Element-wise product summed over the factor axis == row-wise dot product
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict the rating for a single (user index, item index) pair.

        Args:
            user: Row index into the user embedding table.
            item: Row index into the item embedding table.

        Returns:
            The predicted rating as a Python float.
        """
        # forward() takes ONE (batch, 2) tensor, so pack the pair into a single
        # row, and build it on the same device as the embedding weights.
        pair = torch.tensor(
            [[user, item]],
            dtype=torch.long,
            device=self.user_factors.weight.device,
        )
        # Inference only: no autograd graph is needed for a single lookup
        with torch.no_grad():
            return self.forward(pair).item()


In [27]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Dataset of (user index, movie index) -> rating samples.

    Original ``userId`` / ``movieId`` values are remapped to contiguous
    zero-based indices (in order of first appearance) so they can be used
    directly as embedding-table rows.

    Args:
        ratings: DataFrame with at least ``userId``, ``movieId`` and
            ``rating`` columns. Defaults to the module-level ``ratings_df``
            when omitted. The frame passed in is not modified.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, ratings=None):
        # Fall back to the notebook-level frame to stay compatible with Loader()
        if ratings is None:
            ratings = ratings_df
        self.ratings = ratings.copy()

        # Unique original IDs, in order of first appearance
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # Original ID -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> original ID
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace original IDs with contiguous indices (dict-based map, no lambda)
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns explicitly: column 0 must be the user and
        # column 1 the movie, regardless of the column order in the source
        # frame or of which extra columns (e.g. timestamp) it carries.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [28]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Factorization model with 8 latent factors per user / movie
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the freshly initialised value of every trainable parameter
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model onto the GPU when one is available
model = model.cuda() if cuda else model

# Mean-squared-error objective, optimised with Adam
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(18075, 8)
  (item_factors): Embedding(30663, 8)
)
user_factors.weight tensor([[0.0487, 0.0007, 0.0353,  ..., 0.0188, 0.0205, 0.0047],
        [0.0366, 0.0388, 0.0212,  ..., 0.0157, 0.0060, 0.0118],
        [0.0295, 0.0235, 0.0399,  ..., 0.0394, 0.0193, 0.0289],
        ...,
        [0.0024, 0.0256, 0.0193,  ..., 0.0357, 0.0129, 0.0151],
        [0.0114, 0.0156, 0.0399,  ..., 0.0174, 0.0380, 0.0179],
        [0.0459, 0.0192, 0.0139,  ..., 0.0232, 0.0096, 0.0270]])
item_factors.weight tensor([[0.0328, 0.0331, 0.0295,  ..., 0.0388, 0.0496, 0.0272],
        [0.0247, 0.0373, 0.0181,  ..., 0.0036, 0.0322, 0.0260],
        [0.0174, 0.0071, 0.0104,  ..., 0.0174, 0.0241, 0.0266],
        ...,
        [0.0125, 0.0188, 0.0281,  ..., 0.0259, 0.0002, 0.0097],
        [0.0048, 0.0430, 0.0223,  ..., 0.0498, 0.0157, 0.0479],
        [0.0081, 0.0275, 0.0321,  ..., 0.0496, 0.0291, 0.0370]])


In [29]:
# Train for num_epochs passes over the data, reporting the mean batch loss per epoch
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA; the optimisation step
        # below must run on CPU-only machines as well.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Targets are cast to float32 before being compared with the model output
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 2.803487405506149
iter #1 Loss: 0.9263335081393833
iter #2 Loss: 0.8551078334927334
iter #3 Loss: 0.8264788632152335
iter #4 Loss: 0.8061802979330913
iter #5 Loss: 0.7766145468332918
iter #6 Loss: 0.744028598827118
iter #7 Loss: 0.7160166225661563
iter #8 Loss: 0.6883949904806863
iter #9 Loss: 0.6637531934654723
iter #10 Loss: 0.6440824070870035
iter #11 Loss: 0.6291176713112941
iter #12 Loss: 0.6176762880675095
iter #13 Loss: 0.6087525220281206
iter #14 Loss: 0.6017873790672335
iter #15 Loss: 0.5963226177478218
iter #16 Loss: 0.5919963951996219
iter #17 Loss: 0.5885604104004373
iter #18 Loss: 0.5857339702399407
iter #19 Loss: 0.5833248411442801
iter #20 Loss: 0.581538952633332
iter #21 Loss: 0.5798245703372211
iter #22 Loss: 0.5783745696451007
iter #23 Loss: 0.5772500965873923
iter #24 Loss: 0.5761000486302802
iter #25 Loss: 0.5751755471147688
iter #26 Loss: 0.5743872997397426
iter #27 Loss: 0.5734549330395006
iter #28 Loss: 0.5729341201701482
iter #29 Loss: 0.5723041168

In [30]:
# Training has tuned the latent factors for users and movies; collect them here.
user_factors = None
item_factors = None

for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    # The first trainable parameter is taken as the user factors;
    # any trainable parameter after it is stored as the item factors.
    if user_factors is not None:
        item_factors = param.data
        continue
    user_factors = param.data

user_factors.weight tensor([[ 0.0509,  0.0361,  0.5975,  ...,  1.8296,  1.0490,  3.5387],
        [ 1.1142,  1.8561, -1.0454,  ...,  1.1626,  3.3761,  2.5774],
        [ 1.6750,  1.3596,  0.8403,  ...,  2.2892,  2.6331,  2.6432],
        ...,
        [ 1.7657,  1.6105,  0.0097,  ...,  2.4129,  1.2036,  3.9554],
        [ 1.4458,  2.1596, -0.0466,  ...,  1.1596,  3.3012,  0.9978],
        [ 2.4562,  2.4345, -1.1092,  ...,  2.4210,  2.1933,  2.2407]],
       device='cuda:0')
item_factors.weight tensor([[ 0.2802, -0.3090, -0.0220,  ...,  0.5183,  0.2685,  0.6917],
        [ 0.1544, -0.0132,  0.6099,  ...,  0.2218,  0.4675,  0.4947],
        [ 0.0267, -0.0661,  0.6312,  ...,  0.1447,  0.4745,  0.4848],
        ...,
        [ 0.2413,  0.2478,  0.2567,  ...,  0.2547,  0.2291,  0.2387],
        [ 0.2967,  0.3355,  0.3150,  ...,  0.3421,  0.3078,  0.3401],
        [ 0.0628,  0.0821,  0.0868,  ...,  0.1041,  0.0837,  0.0915]],
       device='cuda:0')


In [31]:
# Copy the learned item-embedding table to host memory as a NumPy array
item_weights = model.item_factors.weight.data
trained_movie_embeddings = item_weights.cpu().numpy()

In [32]:
# One embedding row per unique movie
trained_movie_embeddings.shape[0]

30663

In [33]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters using their learned embedding vectors;
# the fixed random_state seeds the estimator.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)



In the output below, movies within the same cluster tend to share similar genres. Notably, the algorithm has no awareness of movie titles or genres: it derives these relationships solely from the numerical representations of how users rated the movies they selected.

In [34]:
# Number of ratings per movie, computed once up front instead of re-filtering
# the whole ratings frame for every single movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts()

# For each cluster, list its 10 most-rated movies
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Embedding row index -> original movieId
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts.loc[movid]))
    # sorted() is stable, so movies with equal counts keep their index order
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Broken Arrow (1996)
	 X2: X-Men United (2003)
	 Mummy, The (1999)
	 Nutty Professor, The (1996)
	 Mission: Impossible II (2000)
	 Charlie's Angels (2000)
	 Avengers, The (2012)
	 I, Robot (2004)
	 Starship Troopers (1997)
	 Judge Dredd (1995)
Cluster #1
	 Birdcage, The (1996)
	 Sense and Sensibility (1995)
	 Little Miss Sunshine (2006)
	 Juno (2007)
	 Big Fish (2003)
	 What's Eating Gilbert Grape (1993)
	 Dogma (1999)
	 The Butterfly Effect (2004)
	 English Patient, The (1996)
	 10 Things I Hate About You (1999)
Cluster #2
	 Pulp Fiction (1994)
	 Reservoir Dogs (1992)
	 Kill Bill: Vol. 1 (2003)
	 Kill Bill: Vol. 2 (2004)
	 Trainspotting (1996)
	 Donnie Darko (2001)
	 Inglourious Basterds (2009)
	 Requiem for a Dream (2000)
	 Sin City (2005)
	 Lost in Translation (2003)
Cluster #3
	 RoboCop (1987)
	 Searchers, The (1956)
	 Re-Animator (1985)
	 Godzilla (Gojira) (1954)
	 Manhunter (1986)
	 Dead Ringers (1988)
	 Tree of Life, The (2011)
	 Mummy, The (1932)
	 Runaway Train (19