In [None]:
# default_exp models.mf

In [None]:
#hide
!pip install pytorch-lightning
!git clone --branch US632593 https://github.com/RecoHut-Projects/recohut.git
%cd recohut
!pip install -U .
!apt-get -qq install tree
!pip install -q watermark

# MF
> Implementation of Matrix Factorization model in PyTorch Lightning.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable
import os

import torch
from torch import nn
from torch.nn import functional as F

from recohut.models.bases.common import PointModel

In [None]:
#export
class MF(PointModel):
    """Matrix factorization scorer with user, item and global bias terms.

    Trained using SGD and negative sampling; the training and evaluation
    loops are inherited from ``PointModel`` (imported above, not defined in
    this notebook).

    The score of a (user, item) pair is::

        bias + user_bias[user] + item_bias[item] + dot(user_emb[user], item_emb[item])

    Args:
        n_users: number of rows in the user embedding table and user bias vector.
        n_items: number of rows in the item embedding table and item bias vector.
        embedding_dim: length of every user and item embedding vector.
    """

    def __init__(self, n_users, n_items, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(
            num_embeddings=n_users, embedding_dim=embedding_dim
        )
        self.item_embedding = nn.Embedding(
            num_embeddings=n_items, embedding_dim=embedding_dim
        )
        # All bias terms are learnable and start at zero.
        self.user_bias = nn.Parameter(torch.zeros(n_users))
        self.item_bias = nn.Parameter(torch.zeros(n_items))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, users, items):
        """Score (user, item) pairs.

        Args:
            users: integer index tensor into the user tables.
            items: integer index tensor into the item tables; presumably the
                same shape as ``users`` (it must at least broadcast with it) —
                confirm against the caller in the base class.

        Returns:
            Tensor of scores: the element-wise product of the looked-up
            embeddings summed over the last (embedding) dimension, plus the
            global, per-user and per-item biases.
        """
        interaction = (
            self.user_embedding(users) * self.item_embedding(items)
        ).sum(dim=-1)
        return (
            self.bias
            + self.user_bias[users]
            + self.item_bias[items]
            + interaction
        )

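## Dataset

MovieLens-1M ratings, downloaded from GroupLens and wrapped as a recohut interactions dataset and data module.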

In [None]:
import numpy as np
import pandas as pd

from recohut.utils.common_utils import *
from recohut.datasets.bases.interactions import InteractionsDataset, InteractionsDataModule

import warnings
warnings.filterwarnings('ignore')


class ML1mDataset(InteractionsDataset):
    """MovieLens-1M ratings plugged into the ``InteractionsDataset`` base class.

    Supplies the download location, the raw file name, and a loader that
    returns the ratings as a DataFrame with columns
    ``uid``, ``sid``, ``rating`` and ``timestamp``.
    """

    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        """Name of the single raw file kept in ``raw_dir``."""
        return 'ratings.dat'

    def download(self):
        """Download the archive and leave only ``ratings.dat`` in ``raw_dir``.

        The zip is extracted into ``raw_dir``, the ratings file is moved out
        of the extracted ``ml-1m`` folder, and both that folder and the
        downloaded archive are deleted afterwards.
        """
        from shutil import move, rmtree

        archive_path = download_url(self.url, self.raw_dir)
        extract_zip(archive_path, self.raw_dir)
        extracted_dir = os.path.join(self.raw_dir, 'ml-1m')
        move(os.path.join(extracted_dir, self.raw_file_names), self.raw_dir)
        rmtree(extracted_dir)
        os.unlink(archive_path)

    def load_ratings_df(self):
        """Read the raw ratings file and return one row per (uid, sid) pair.

        Returns:
            DataFrame with columns ``uid``, ``sid``, ``rating``, ``timestamp``.
            When a user rated the same item more than once, only the rating
            with the greatest timestamp is kept (ties resolved in favour of
            the later row in the file). Surviving rows keep their original
            file order.
        """
        # '::' is a multi-character separator, which requires the python engine.
        df = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        # Drop duplicate user-item pair records, keeping recent ratings only.
        # The file is not guaranteed to be in chronological order, so sort by
        # timestamp (stable sort) before keeping the last record of each pair,
        # then restore the original row order.
        deduplicated = (
            df.sort_values('timestamp', kind='mergesort')
            .drop_duplicates(subset=['uid', 'sid'], keep='last')
            .sort_index()
        )
        return deduplicated


class ML1mDataModule(InteractionsDataModule):
    """Data module for MovieLens-1M: binds ``ML1mDataset`` as ``dataset_cls``."""
    # How ``dataset_cls`` is consumed is decided by InteractionsDataModule,
    # which is not visible in this notebook.
    dataset_cls = ML1mDataset

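## Trainer

Run configuration and a small PyTorch Lightning helper that fits the model and evaluates it on the test split.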

In [None]:
class Args:
    """Attribute bag holding every setting used by this notebook's run.

    All values are stored as instance attributes, so ``args.__dict__`` exposes
    the complete configuration as a plain mapping.
    """

    def __init__(self):
        settings = dict(
            # data location and dataset-building options
            data_dir='/content/data',
            min_rating=4,
            num_negative_samples=99,
            min_uc=5,
            min_sc=5,
            # output directories
            log_dir='/content/logs',
            model_dir='/content/models',
            # split and loader options
            val_p=0.2,
            test_p=0.2,
            num_workers=2,
            normalize=False,
            batch_size=32,
            seed=42,
            shuffle=True,
            pin_memory=True,
            drop_last=False,
            split_type='stratified',
            # model size and training length
            embedding_dim=20,
            max_epochs=5,
        )
        # Assign in declaration order so the instance __dict__ keeps that order.
        for option_name, option_value in settings.items():
            setattr(self, option_name, option_value)


args = Args()

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Every attribute stored on `args` is forwarded to the data module as a keyword argument.
ds = ML1mDataModule(**vars(args))

# TensorBoard event files are written under args.log_dir.
logger = TensorBoardLogger(save_dir=args.log_dir)

# Checkpoint callback tracking the "valid_loss" metric in "min" mode; files go
# to args.model_dir under the name "recommender".
checkpoint_callback = ModelCheckpoint(
    dirpath=args.model_dir,
    filename="recommender",
    monitor="valid_loss",
    mode="min",
)

def pl_trainer(model, datamodule):
    """Fit ``model`` on ``datamodule`` and return the output of ``trainer.test``.

    Relies on the notebook-level ``args``, ``logger`` and
    ``checkpoint_callback`` objects rather than taking them as parameters.

    Args:
        model: the LightningModule to train and evaluate.
        datamodule: the data module passed to both ``fit`` and ``test``.

    Returns:
        Whatever ``Trainer.test`` returns for this model and data module.
    """
    # NOTE(review): validation is scheduled every 10 epochs while
    # args.max_epochs is 5 in this notebook, so the validation loop appears
    # never to run during fit and the "valid_loss" checkpoint may never be
    # written — confirm whether this is intended.
    trainer_options = dict(
        max_epochs=args.max_epochs,
        logger=logger,
        check_val_every_n_epoch=10,
        callbacks=[checkpoint_callback],
        gpus=None,
    )
    trainer = Trainer(**trainer_options)

    trainer.fit(model, datamodule=datamodule)
    return trainer.test(model, datamodule=datamodule)

In [None]:
# Run the data module's preparation step; in the recorded run below this
# downloaded, extracted and processed the ML-1M archive.
ds.prepare_data()

Downloading http://files.grouplens.org/datasets/movielens/ml-1m.zip
Extracting /content/data/raw/ml-1m.zip
Processing...


Turning into implicit ratings
Filtering triplets
Densifying index


Done!


In [None]:
# Table sizes are read from the prepared dataset; arguments are passed by keyword.
model = MF(n_items=ds.data.num_items, n_users=ds.data.num_users, embedding_dim=args.embedding_dim)

# Fit and test the model; the returned test results are displayed as the cell output.
pl_trainer(model, ds)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: /content/logs/default

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 120 K 
1 | item_embedding | Embedding | 62.5 K
---------------------------------------------
192 K     Trainable params
0         Non-trainable params
192 K     Total params
0.769     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test Metrics': {'apak': tensor(0.0213),
                  'hr': tensor(0.0810),
                  'loss': tensor(0.9917),
                  'ncdg': tensor(0.0349)}}
--------------------------------------------------------------------------------


[{'Test Metrics': {'apak': tensor(0.0213),
   'hr': tensor(0.0810),
   'loss': tensor(0.9917),
   'ncdg': tensor(0.0349)}}]

In [None]:
# !apt-get -qq install tree
# Show the data directory as a tree with human-readable, accumulated sizes
# (`tree` is already installed by the setup cell at the top).
!tree -h --du -C "{args.data_dir}"

[01;34m/content/data[00m
├── [ 11M]  [01;34mprocessed[00m
│   ├── [2.3M]  data_test_neg.pt
│   ├── [ 95K]  data_test_pos.pt
│   ├── [6.5M]  data_train.pt
│   ├── [2.3M]  data_valid_neg.pt
│   └── [ 95K]  data_valid_pos.pt
└── [ 23M]  [01;34mraw[00m
    └── [ 23M]  ratings.dat

  35M used in 2 directories, 6 files


In [None]:
#hide
# Record author, timestamp, machine info and package versions for this run.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut,pytorch_lightning

Author: Sparsh A.

Last updated: 2022-01-10 09:09:05

recohut          : 0.0.10
pytorch_lightning: 1.5.8

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy  : 1.19.5
pandas : 1.1.5
torch  : 1.10.0+cu111
IPython: 5.5.0

