In [7]:
from torch.utils.data import Dataset, DataLoader
import zipfile
import pandas as pd
import pdb;
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

class MovieDataset(Dataset):
  def __init__(self):
    ! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

    with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip:
      zip.extractall('data')

    movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
    ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

    users = ratings_df.userId.unique()
    movies = ratings_df.movieId.unique()

    self.d = defaultdict(LabelEncoder)

    for c in ['userId', 'movieId']:
      self.d[c].fit(ratings_df[c].unique())
      ratings_df[c] = self.d[c].transform(ratings_df[c])

    self.x = ratings_df.drop(['rating', 'timestamp'], axis=1).values
    self.y = ratings_df['rating'].values
    self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    self.n_users = len(users)
    self.n_items = len(movies)

  def __getitem__(self, index):
    return (self.x[index], self.y[index])

  def __len__(self):
    return len(self.x)

In [14]:
import torch
import numpy as np

class MatrixFactorization(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors=20):
    super().__init__()
    self.user_factors = torch.nn.Embedding(n_users, n_factors)
    self.item_factors = torch.nn.Embedding(n_items, n_factors)
    self.user_factors.weight.data.uniform_(0, 0.05)
    self.item_factors.weight.data.uniform_(0, 0.05)

  def forward(self, data):


    users, items = data[:,0], data[:, 1]
    ret =  (self.user_factors(users) * self.item_factors(items)).sum(dim=1)



    return ret

  def predict(self, user, item):
    data = torch.tensor([[user, item]], dtype=torch.long)
    return self.forward(data)

In [20]:
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split





train_set = MovieDataset()

# Split the dataset into training and testing sets
train_size = int(0.8 * len(train_set))
test_size = len(train_set) - train_size
train_dataset, test_dataset = random_split(train_set, [train_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Define the model, loss function, and optimizer
epochs = 128
loss_fn = torch.nn.MSELoss()
model = MatrixFactorization(train_set.n_users, train_set.n_items, n_factors=8)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)



# Training loop
for epoch in range(epochs):
    model.train()
    for x, y in train_loader:
        # Get input and target tensors from the batch


        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Print the training loss for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluation function
def evaluate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for x, y in loader:


            ratings = y.float()

            predictions = model(x)
            loss = loss_fn(predictions, ratings)
            total_loss += loss.item()

    return total_loss / len(loader)

# Evaluate on the test set
test_loss = evaluate(model, test_loader)
print(f"Test Loss: {test_loss}")




  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  3901k      0 --:--:-- --:--:-- --:--:-- 3915k
Epoch 1/128, Loss: 10.309615135192871
Epoch 2/128, Loss: 3.1159708499908447
Epoch 3/128, Loss: 2.0966362953186035
Epoch 4/128, Loss: 2.0945687294006348
Epoch 5/128, Loss: 0.6799866557121277
Epoch 6/128, Loss: 1.1428688764572144
Epoch 7/128, Loss: 1.008757472038269
Epoch 8/128, Loss: 1.245184302330017
Epoch 9/128, Loss: 0.9044726490974426
Epoch 10/128, Loss: 0.7922093272209167
Epoch 11/128, Loss: 1.0418025255203247
Epoch 12/128, Loss: 0.7116469144821167
Epoch 13/128, Loss: 0.6128061413764954
Epoch 14/128, Loss: 0.5410670042037964
Epoch 15/128, Loss: 0.45970234274864197
Epoch 16/128, Loss: 0.4757651388645172
Epoch 17/128, Loss: 0.7680797576904297
Epoch 18/128, Loss: 0.9019783735275269
Epoch 19/128, Loss: 0.7014150619506836
Epoch 20/128, Loss: 0.5203089714

In [5]:
cuda = torch.cuda.is_available()
print(cuda)

True


In [10]:
from tqdm import tqdm_notebook as tqdm

for it in tqdm(range(epochs)):
 # losses = []
  for x, y in train_loader:
    optimizer.zero_grad()
    outputs = model(x)
    loss = loss_fn(outputs, y.type(torch.float32))
    #losses.append(loss.item())


    loss.backward()
    optimizer.step()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

In [28]:



for movieId in [111, 172, 181, 208,223,255,260]:
  userId = 66
  movieIndex = train_set.d['movieId'].transform([movieId])[0]
  userIndex = train_set.d['userId'].transform([userId])[0]
  predicted_rating = model.predict(userId, movieIndex)
  print(f"Predicted rating for user {userId} and item {movieId}: {predicted_rating}")

Predicted rating for user 66 and item 111: tensor([3.0638], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 172: tensor([2.6197], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 181: tensor([2.1343], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 208: tensor([2.8477], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 223: tensor([3.7402], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 255: tensor([1.3894], grad_fn=<SumBackward1>)
Predicted rating for user 66 and item 260: tensor([4.4362], grad_fn=<SumBackward1>)
