In [1]:
import pandas as pd
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [2]:
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [3]:
movies_df.head()    

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Take a look at ratings_df
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [6]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}
        
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
       
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])
        
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Define the MatrixFactorization model
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_items, n_factors=8):
        super(MatrixFactorization, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        user_factors = self.user_factors(user)
        item_factors = self.item_factors(item)
        return (user_factors * item_factors).sum(1)

# Assuming n_users and n_items are defined elsewhere in your code
n_users = 1000  # Example number of users, replace with actual number
n_items = 1000  # Example number of items, replace with actual number

num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

if cuda:
    model = model.cuda()

loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        users, items, ratings = batch
        if cuda:
            users = users.cuda()
            items = items.cuda()
            ratings = ratings.cuda()

        optimizer.zero_grad()
        outputs = model(users, items)
        loss = loss_fn(outputs, ratings)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")


Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(1000, 8, sparse=True)
  (item_factors): Embedding(1000, 8, sparse=True)
)
user_factors.weight tensor([[-1.4160, -1.1159, -0.1101,  ..., -0.5719,  0.5045, -0.2853],
        [ 0.2546, -0.2217, -0.0050,  ..., -0.4236,  1.6753, -0.4730],
        [ 1.1924, -0.2979,  0.9626,  ...,  1.3267,  0.3650, -1.0674],
        ...,
        [-1.8009, -0.0312, -1.5954,  ..., -0.4758,  0.3751, -1.2936],
        [ 1.2020,  0.0313,  0.2394,  ..., -0.2082,  0.1170, -0.3914],
        [-3.4803, -0.3363,  0.3832,  ..., -0.7835, -0.8711,  0.0221]])
item_factors.weight tensor([[ 0.5070,  0.3652, -1.5157,  ...,  0.3250, -1.7843, -0.6380],
        [ 0.5755,  1.1181, -0.4449,  ...,  1.3206, -1.2557,  1.0866],
        [-0.5849, -0.5160,  1.5197,  ...,  0.4648,  1.0287,  0.0358],
        ...,
        [-1.7064, -0.4485,  0.3200,  ...,  1.3188, -0.9324,  1.8111],
        [-0.7862, -1.2211,  0.6928,  ..., -0.1242,  0.8121,  0.1035],
        [ 0.734

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
from tqdm import tqdm

for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

  0%|          | 0/128 [00:00<?, ?it/s]


TypeError: MatrixFactorization.forward() missing 1 required positional argument: 'item'

In [None]:
c = 0
uw = 0
iw = 0 
for name, yash in model.named_parameters():
    if yash.requires_grad:
        print(name, yash.data)
        if c == 0:
          uw = yash.data
          c +=1
        else:
          iw = yash.data

user_factors.weight tensor([[ 0.6566,  1.5865,  1.4255,  ...,  1.6873,  0.7072,  1.8206],
        [ 2.3735,  0.6803,  1.4607,  ...,  1.3688,  0.4064,  1.0991],
        [ 1.9848,  0.8896, -3.5604,  ...,  0.9796, -0.3901, -0.2799],
        ...,
        [ 0.0275,  1.6872,  1.4219,  ...,  1.6841,  1.5586, -1.1896],
        [ 1.0926,  0.8030,  1.4236,  ...,  1.0499,  1.4311,  0.8676],
        [ 1.8563,  0.9356,  0.3273,  ...,  1.0134,  1.6467,  1.3681]])
item_factors.weight tensor([[ 0.6710,  0.1434,  0.5828,  ...,  0.1939,  0.5384,  0.6981],
        [ 0.4802,  0.4188,  0.6303,  ...,  0.3990, -0.0062,  0.5881],
        [ 0.3190,  0.4571,  0.5744,  ...,  0.5104,  0.7044,  0.0860],
        ...,
        [ 0.3506,  0.3344,  0.3727,  ...,  0.3402,  0.3474,  0.3565],
        [ 0.4232,  0.4131,  0.4125,  ...,  0.4102,  0.4085,  0.4064],
        [ 0.4045,  0.4208,  0.4298,  ...,  0.4318,  0.3967,  0.4107]])


In [None]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:
len(trained_movie_embeddings) 

9724

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [None]:
import numpy as np


for cluster in range(10):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.nonzero(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        rat_count = ratings_df.loc[ratings_df['movieId']==movid].iloc[:, 0].count()
        movs.append((movie_names[movid], rat_count))
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])
     

Cluster #0
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Fight Club (1999)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Fugitive, The (1993)
Cluster #1
	 Pulp Fiction (1994)
	 American Beauty (1999)
	 Seven (a.k.a. Se7en) (1995)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Godfather, The (1972)
	 Ace Ventura: Pet Detective (1994)
	 Memento (2000)
	 Monty Python and the Holy Grail (1975)
	 Reservoir Dogs (1992)
	 Kill Bill: Vol. 1 (2003)
Cluster #2
	 Dances with Wolves (1990)
	 Stargate (1994)
	 Home Alone (1990)
	 Waterworld (1995)
	 Net, The (1995)
	 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
	 Back to the Future Part III (1990)
	 Unbreakable (2000)
	 Pirates of the Caribbean: Dead Man