In [1]:
import torch
import pandas
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import numpy
import platform
from pytorch_lightning.callbacks import TQDMProgressBar
from mf_lightning_models import MatrixLightningModel

In [2]:
class RateDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        x = torch.tensor(numpy.append(self.df.user[index] -  1, self.df.movie[index] - 1))
        y = torch.tensor(self.df.rating[index]).to(torch.float32)
        return x, y

    def __len__(self):
        return self.df.shape[0]
    
def get_loss(df, model):
    with torch.no_grad():
        criterion = torch.nn.MSELoss()
        preds = model(torch.tensor(df.user - 1), torch.tensor(df.movie - 1))
        return criterion(preds, torch.tensor(df.rating))

In [5]:
# input dataset like user, movie, rating and transform into matrix of dim (users, movies) for autoencoder
class MFDataset(Dataset):
    def __init__(self, df, nb_users, nb_movies):
        self.nb_users = nb_users
        self.nb_movies = nb_moview
        np_matrix = numpy.zeros(nb_users, nb_movies)
        for index, row in df.iterrows():
            np_matrix[row[0] - 1][row[1] - 1] = row[2]
        self.np_matrix = np_matrix

    def __getitem__(self, index):
        x = torch.tensor(numpy.append(self.df.user[index] -  1, self.df.movie[index] - 1))
        y = torch.tensor(self.df.rating[index]).to(torch.float32)
        return x, y

    def __len__(self):
        return self.nb_users

In [6]:
COLS = ['user', 'movie', 'rating', 'timestamp']
# df_train = pandas.read_csv("./data/ml-100k/u1.base", sep='\t', names=COLS).drop(columns=['timestamp']).astype(int)
# df_test = pandas.read_csv("./data/ml-100k/u1.test", sep='\t', names=COLS).drop(columns=['timestamp']).astype(int)
df_1m = pandas.read_csv("./data/ml-1m/ratings.dat", sep='::', names=COLS, engine='python').drop(columns=['timestamp']).astype(int)
nb_users = max(df_1m[:,0])
nb_movies = max(df_1m[:,1])

InvalidIndexError: (slice(None, None, None), 0)

In [4]:
df_train, df_test = train_test_split(df_1m, test_size=0.2, random_state=42, shuffle=True)
df_train = df_train.reset_index()
df_test = df_test.reset_index()
train_data = RateDataset(df_train)
test_data = RateDataset(df_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
validation_loader = DataLoader(test_data, batch_size=32, shuffle=True)
print(df_train.shape, df_test.shape)

(800167, 4) (200042, 4)


In [5]:
params = {'num_users': df_1m.user.max(), 'num_items': df_1m.movie.max(), 'global_mean': df_1m.rating.mean(), 'latent_dim': 0}
print (torch.__version__, torch.backends.mps.is_available(),platform.processor()) 
trainer = pl.Trainer(max_epochs=5, callbacks=[TQDMProgressBar(refresh_rate=1000)]) 
# , accelerator='mps', devices=1)
model = MatrixLightningModel(params)
trainer.fit(model, train_dataloaders=train_loader,val_dataloaders=validation_loader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name           | Type      | Params
---------------------------------------------
0 | loss_fn        | MSELoss   | 0     
1 | user_embedding | Embedding | 0     
2 | item_embedding | Embedding | 0     
3 | user_bias      | Embedding | 6.0 K 
4 | item_bias      | Embedding | 4.0 K 
---------------------------------------------
10.0 K    Trainable params
0         Non-trainable params
10.0 K    Total params
0.040     Total estimated model params size (MB)


1.13.1 True arm


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [None]:
loader_iterator = enumerate(train_loader)
first_batch = next(loader_iterator)
print(first_batch[1][0][:,0])
print(first_batch[1][0][:,1])
print(first_batch)

In [42]:
df_history

Unnamed: 0,latent_dim,train_loss,test_loss
800167,100,tensor(0.4463),tensor(4.1441)


In [21]:
print(get_loss(df_train, model))
print(get_loss(df_test, model))

tensor(0.7937)
tensor(0.9582)


In [45]:
with torch.no_grad():
    print(model(torch.tensor([65]),torch.tensor([29])))
    print(model.user_bias.weight[65] + model.item_bias.weight[29] + model.mu)

tensor([4.1378])
tensor([4.1378])


In [44]:
model

BiasMF(
  (user_embedding): Embedding(943, 0)
  (item_embedding): Embedding(1682, 0)
  (user_bias): Embedding(943, 1)
  (item_bias): Embedding(1682, 1)
)

In [46]:
params

{'num_users': 943, 'num_items': 1682, 'global_mean': 3.52835, 'latent_dim': 0}