In [None]:
# default_exp datasets.bases.sequential

# Sequential Base Dataset
> Implementation of sequential base dataset modules in PyTorch Lightning.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable

import random
import os
import numpy as np
import pandas as pd
import pickle

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

from pytorch_lightning import LightningDataModule

from recohut.utils.common_utils import *
from recohut.datasets.bases.common import Dataset as BaseDataset

from recohut.utils.grouping import create_user_sequences
from recohut.utils.matrix import generate_rating_matrix
from recohut.utils.negative_sampling import random_neg_sample

## SequentialDataset

In [None]:
#export
class SequentialDataset(Dataset, BaseDataset):
    """Per-user item-sequence dataset for masked-item sequence models.

    Every sample is one user's interaction history, ordered by ``timestamp``
    and cropped to at most ``history_size`` items. It is returned as a
    ``(source, target)`` pair of ``torch.long`` tensors, where ``source`` is
    ``target`` with some items replaced by the ``mask`` token:

    * ``train``: a random crop of the history; every item is masked
      independently with probability 0.2.
    * ``valid`` / ``test``: the most recent items; only the last five
      positions are candidates for masking (probability 0.5 each).

    Subclasses must implement ``raw_file_names``, ``download`` and
    ``load_ratings_df``. The latter has to return a frame with ``uid``,
    ``sid`` and ``timestamp`` columns.
    """

    def __init__(self,
                 data_dir,
                 data_type='train',
                 history_size=8,
                 step_size=1,
                 seed=42,
                 mask=1,
                 *args,
                 **kwargs):
        """
        Args:
            data_dir: Where to save/load the data
            data_type: train/valid/test
            history_size: Maximum number of items per sample; shorter
                sequences are padded with 0 up to this length
            step_size: Stored on the instance; not used by any method of
                this class
            seed: Seed for the random crops/masks (see ``_sample_rng``)
            mask: Token id written in place of masked-out items
            *args, **kwargs: Accepted and ignored, so a data module can
                forward a shared bag of settings
        """
        self.data_type = data_type
        self.history_size = history_size
        self.step_size = step_size
        self.seed = seed
        self.mask = mask

        # Seed the module-level generator once, here. It used to be re-seeded
        # on every sample, which made every "random" crop and mask identical
        # for all users and all epochs.
        random.seed(seed)

        super().__init__(data_dir)

        # NOTE(review): `_process` is not defined in this class; presumably
        # inherited from BaseDataset and expected to call `process()` - confirm.
        self._process()

    @property
    def raw_file_names(self):
        raise NotImplementedError

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        raise NotImplementedError

    def load_ratings_df(self):
        raise NotImplementedError

    def map_column(self, df: pd.DataFrame, col_name: str):
        """Add a ``<col_name>_mapped`` column of integer ids to ``df``.

        Ids follow the sorted order of the unique values and start at 2,
        which leaves 0 and 1 free (the defaults this class uses for the pad
        value and the mask token).

        Returns:
            ``(df, mapping, inverse_mapping)``. ``df`` is the same object that
            was passed in, with the new column added in place.
        """
        values = sorted(df[col_name].unique())
        mapping = {value: idx for idx, value in enumerate(values, start=2)}
        inverse_mapping = {idx: value for value, idx in mapping.items()}
        df[col_name + "_mapped"] = df[col_name].map(mapping)
        return df, mapping, inverse_mapping

    def get_context(self, df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 5, seed: int = 42):
        """Slice the rows of ``df`` that make up one sample.

        Args:
            df: One user's interactions, oldest first
            split: ``"train"`` picks a random end position that keeps the last
                ``val_context_size`` rows out of the slice; ``"valid"`` and
                ``"test"`` end at the most recent row
            context_size: Maximum number of rows returned
            val_context_size: Number of trailing rows withheld from training
            seed: Seed of the private generator used for the random end
                position; the same seed always yields the same slice

        Raises:
            ValueError: If ``split`` is not one of train/valid/test
        """
        if split == "train":
            # Clamp the bounds so short histories (fewer than
            # 10 + val_context_size rows) no longer make randint() raise.
            upper = max(1, df.shape[0] - val_context_size)
            lower = min(10, upper)
            # A private generator gives the same draw as seeding the global
            # one, without clobbering the global random state.
            end_index = random.Random(seed).randint(lower, upper)
        elif split in ("valid", "test"):
            end_index = df.shape[0]
        else:
            raise ValueError(f"unknown split {split!r}; expected 'train', 'valid' or 'test'")
        start_index = max(0, end_index - context_size)
        return df[start_index:end_index]

    def pad_list(self, list_integers, history_size: int, pad_val: int = 0, mode="left"):
        """Pad ``list_integers`` with ``pad_val`` up to ``history_size`` items.

        ``mode="left"`` prepends the padding, any other value appends it.
        Lists that are already long enough are returned unchanged (they are
        not truncated).
        """
        missing = history_size - len(list_integers)
        if missing > 0:
            padding = [pad_val] * missing
            if mode == "left":
                list_integers = padding + list_integers
            else:
                list_integers = list_integers + padding
        return list_integers

    def mask_list(self, l1, p=0.8, rng=None):
        """Replace each item of ``l1`` by ``self.mask`` with probability ``1 - p``.

        Args:
            l1: Items to mask
            p: Probability of keeping an item
            rng: ``random.Random`` instance to draw from. When omitted, a
                fresh generator seeded with ``self.seed`` is used, so the
                result is the same on every call.
        """
        if rng is None:
            rng = random.Random(self.seed)
        return [item if rng.random() < p else self.mask for item in l1]

    def mask_last_elements_list(self, l1, val_context_size: int = 5, rng=None):
        """Mask (with probability 0.5 each) only the last ``val_context_size`` items."""
        head, tail = l1[:-val_context_size], l1[-val_context_size:]
        return head + self.mask_list(tail, p=0.5, rng=rng)

    def make_user_history(self, data):
        """Group item ids by user index.

        Args:
            data: Iterable of ``(user, item, rating)`` triples; ``user`` is
                used as a list index.

        NOTE(review): ``self.num_users`` is not set anywhere in this class;
        presumably a subclass provides it - confirm before relying on this.
        """
        user_history = [[] for _ in range(self.num_users)]
        for user, item, _rating in data:
            user_history[user].append(item)
        return user_history

    def create_sequences(self, values, window_size, step_size):
        """Cut ``values`` into windows of ``window_size``, ``step_size`` apart.

        After the last full window, one extra window aligned to the end of
        ``values`` is appended so the tail is always covered. When the windows
        already reach the end exactly, that extra window repeats the last one.
        Inputs shorter than ``window_size`` yield an empty list.

        Raises:
            ValueError: If ``window_size`` or ``step_size`` is smaller than 1
                (either would otherwise loop forever).
        """
        if window_size < 1 or step_size < 1:
            raise ValueError("window_size and step_size must both be >= 1")
        sequences = []
        start_index = 0
        while True:
            seq = values[start_index:start_index + window_size]
            if len(seq) < window_size:
                # Ran past the end: add the end-aligned window and stop.
                tail = values[-window_size:]
                if len(tail) == window_size:
                    sequences.append(tail)
                break
            sequences.append(seq)
            start_index += step_size
        return sequences

    def process(self):
        """Load the ratings, map item ids to integers and group rows by user."""
        df = self.load_ratings_df().sort_values(by="timestamp")
        df, self.mapping, self.inverse_mapping = self.map_column(df, col_name="sid")
        self.grp_by = df.groupby(by="uid")
        self.groups = list(self.grp_by.groups)

    def __len__(self):
        """Number of users (one sample per user)."""
        return len(self.groups)

    def _sample_rng(self, index):
        """Return the random generator used to build sample ``index``.

        Training samples get a generator seeded from the module-level one
        (seeded once in ``__init__`` and never re-seeded here), so crops and
        masks vary between samples and epochs. Evaluation samples get a
        generator derived from ``(seed, index)``, so they are identical every
        time they are requested.
        """
        if self.data_type == "train":
            return random.Random(random.getrandbits(64))
        return random.Random(f"{self.seed}:{index}")

    def __getitem__(self, index):
        """Return ``(src_items, trg_items)`` for the ``index``-th user.

        Both are ``torch.long`` tensors; ``src_items`` is the masked copy of
        ``trg_items`` and both are padded on the same, randomly chosen side.
        """
        group = self.groups[index]
        df = self.grp_by.get_group(group)
        rng = self._sample_rng(index)

        context = self.get_context(df, split=self.data_type,
                                   context_size=self.history_size,
                                   seed=rng.getrandbits(32))
        trg_items = context["sid_mapped"].tolist()
        if self.data_type == "train":
            src_items = self.mask_list(trg_items, rng=rng)
        else:
            src_items = self.mask_last_elements_list(trg_items, rng=rng)

        # Source and target must be padded on the same side to stay aligned.
        pad_mode = "left" if rng.random() < 0.5 else "right"
        trg_items = self.pad_list(trg_items, history_size=self.history_size, mode=pad_mode)
        src_items = self.pad_list(src_items, history_size=self.history_size, mode=pad_mode)
        src_items = torch.tensor(src_items, dtype=torch.long)
        trg_items = torch.tensor(trg_items, dtype=torch.long)
        return src_items, trg_items

## SequentialDataModule

In [None]:
#export
class SequentialDataModule(LightningDataModule):
    """Lightning data module that serves train/valid/test splits of one dataset class.

    Concrete modules set ``dataset_cls`` to the dataset class to instantiate.
    Keyword arguments this module does not consume itself are forwarded to
    that class for every split.
    """

    # Dataset class instantiated for every split; concrete modules override it.
    dataset_cls = None

    def __init__(self,
                 data_dir: Optional[str] = None,
                 num_workers: int = 0,
                 normalize: bool = False,
                 batch_size: int = 32,
                 shuffle: bool = True,
                 pin_memory: bool = True,
                 drop_last: bool = False,
                 *args, 
                 **kwargs) -> None:
        """
        Args:
            data_dir: Where to save/load the data (defaults to the current working directory)
            num_workers: How many workers to use for loading data
            normalize: If true applies rating normalize (stored only; not used by this class)
            batch_size: How many samples per batch to load
            shuffle: If true shuffles the train data every epoch
            pin_memory: If true, the data loader will copy Tensors into CUDA pinned memory before
                        returning them
            drop_last: If true drops the last incomplete batch
            **kwargs: Forwarded to ``dataset_cls`` for every split
        """
        # LightningDataModule.__init__ has no data-directory parameter: passing
        # the path positionally either lands in an unrelated argument or
        # raises TypeError, depending on the Lightning version.
        super().__init__()

        self.data_dir = data_dir if data_dir is not None else os.getcwd()
        self.num_workers = num_workers
        self.normalize = normalize
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pin_memory = pin_memory
        self.drop_last = drop_last
        self.kwargs = kwargs

    def _build_dataset(self, **overrides):
        """Instantiate ``dataset_cls`` with the stored kwargs; ``overrides`` win on clashes."""
        if not callable(self.dataset_cls):
            raise TypeError(
                f"{type(self).__name__}.dataset_cls must be set to a dataset class"
            )
        # Merging first avoids "multiple values for keyword argument" when the
        # forwarded kwargs already contain e.g. `data_type`.
        return self.dataset_cls(self.data_dir, **{**self.kwargs, **overrides})

    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
        """Saves files to data_dir."""
        self.data = self._build_dataset()

    def setup(self, stage: Optional[str] = None) -> None:
        """Creates train, val, and test dataset."""
        if stage == "fit" or stage is None:
            self.dataset_train = self._build_dataset(data_type='train')
            self.dataset_val = self._build_dataset(data_type='valid')
        if stage == "test" or stage is None:
            self.dataset_test = self._build_dataset(data_type='test')

    def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
        """The train dataloader."""
        return self._data_loader(self.dataset_train, shuffle=self.shuffle)

    def val_dataloader(self, *args: Any, **kwargs: Any) -> Union[DataLoader, List[DataLoader]]:
        """The val dataloader."""
        return self._data_loader(self.dataset_val)

    def test_dataloader(self, *args: Any, **kwargs: Any) -> Union[DataLoader, List[DataLoader]]:
        """The test dataloader."""
        return self._data_loader(self.dataset_test)

    def _data_loader(self, dataset: Dataset, shuffle: bool = False) -> DataLoader:
        """Wrap ``dataset`` in a ``DataLoader`` configured from this module's settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            drop_last=self.drop_last,
            pin_memory=self.pin_memory,
        )

Example

In [None]:
class ML1mDataset(SequentialDataset):
    """MovieLens-1M ratings exposed as a ``SequentialDataset``."""

    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        return 'ratings.dat'

    def download(self):
        """Fetch the archive and keep only the ratings file in ``raw_dir``."""
        import shutil

        archive = download_url(self.url, self.raw_dir)
        extract_zip(archive, self.raw_dir)

        # The archive unpacks into an `ml-1m` sub-folder: lift the ratings
        # file out of it, then discard the folder and the archive itself.
        extracted_dir = os.path.join(self.raw_dir, 'ml-1m')
        shutil.move(os.path.join(extracted_dir, self.raw_file_names), self.raw_dir)
        shutil.rmtree(extracted_dir)
        os.unlink(archive)

    def load_ratings_df(self):
        """Read the '::'-separated ratings file into a uid/sid/rating/timestamp frame."""
        ratings = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        return ratings.set_axis(['uid', 'sid', 'rating', 'timestamp'], axis='columns')

In [None]:
class ML1mDataModule(SequentialDataModule):
    """Data module that serves ``ML1mDataset`` splits."""

    dataset_cls = ML1mDataset

In [None]:
class Args:
    """Example settings for the MovieLens-1M run; every entry becomes an instance attribute."""

    def __init__(self):
        vars(self).update(
            # special token ids and seeding
            pad=0,
            mask=1,
            cap=0,
            seed=42,
            # model hyper-parameters
            vocab_size=10000,
            channels=128,
            dropout=0.4,
            learning_rate=1e-4,
            history_size=30,
            # locations
            data_dir='/content/data',
            log_dir='/content/recommender_logs',
            model_dir='/content/recommender_models',
            # loading and training loop
            batch_size=32,
            shuffle=True,
            max_epochs=2,
            val_epoch=1,
            gpus=None,
            monitor='valid_loss',
            mode='min',
        )


args = Args()

In [None]:
# `data_dir` is already one of the entries in vars(args), so it must not be
# passed a second time (the old call only worked because the extra keyword
# was misspelt and silently forwarded to the dataset).
ds = ML1mDataModule(**vars(args))
ds.prepare_data()

## SASRecDataset

In [None]:
#export
class SASRecDataset(Dataset, BaseDataset):
    """Next-item prediction dataset yielding left-padded id sequences per user.

    The processed text file holds one user per line in the form
    ``<user> <item> <item> ...``. Each sample is built from one such line: an
    input sequence, the same sequence shifted by one position as positive
    targets, one sampled negative per input position, and a held-out answer
    item (see ``__getitem__``).

    Subclasses must implement ``raw_file_names``, ``download`` and
    ``load_ratings_df``. The latter has to return a frame with ``uid``,
    ``sid`` and ``timestamp`` columns.
    """

    def __init__(self,
                 data_dir,
                 data_type='train',
                 test_neg_items=None,
                 min_seq_length=10,
                 max_seq_length=50,
                 sample_frac=None,
                 *args,
                 **kwargs):
        """
        Args:
            data_dir: Where to save/load the data
            data_type: train/valid/test
            test_neg_items: Optional per-user collection of ids, indexed like
                the dataset; when given it is appended to every sample
            min_seq_length: Users with fewer interactions are dropped while processing
            max_seq_length: Length every returned sequence is padded/truncated to
            sample_frac: Optional fraction of the ratings to sample before processing
            *args, **kwargs: Accepted and ignored, so a data module can
                forward a shared bag of settings
        """
        self.data_type = data_type
        self.test_neg_items = test_neg_items
        self.min_len = min_seq_length
        self.max_len = max_seq_length
        self.sample_frac = sample_frac

        assert self.data_type in {"train", "valid", "test"}

        super().__init__(data_dir)

        # NOTE(review): `_process` is not defined in this class; presumably
        # inherited from BaseDataset and expected to call `process()` - confirm.
        self._process()
        self.load_processed()

        # Bound handed to `random_neg_sample` when drawing negatives.
        self.item_size = self.max_item + 2

    @property
    def raw_file_names(self):
        raise NotImplementedError

    @property
    def processed_file_names(self):
        return ['user_seqs.txt', 'user_seqs.pkl']

    def download(self):
        raise NotImplementedError

    def load_ratings_df(self):
        raise NotImplementedError

    def get_user_seqs(self):
        """Parse the processed text file and build the evaluation matrices.

        Returns:
            A dict with keys ``user_seq`` (list of item-id lists, one per
            line), ``max_item``, ``valid_rating_matrix``,
            ``test_rating_matrix`` and ``num_users``.
        """
        user_seq = []
        item_set = set()
        # Read inside a context manager so the handle is always closed.
        with open(self.processed_paths[0]) as f:
            for line in f:
                _user, items = line.strip().split(' ', 1)
                items = [int(item) for item in items.split(' ')]
                user_seq.append(items)
                item_set.update(items)
        max_item = max(item_set)

        num_users = len(user_seq)
        num_items = max_item + 2

        # NOTE(review): `n` presumably is the number of trailing items withheld
        # from each matrix - confirm against `generate_rating_matrix`.
        valid_rating_matrix = generate_rating_matrix(user_seq, num_users, num_items, n=2)
        test_rating_matrix = generate_rating_matrix(user_seq, num_users, num_items, n=1)
        return {
            'user_seq': user_seq,
            'max_item': max_item,
            'valid_rating_matrix': valid_rating_matrix,
            'test_rating_matrix': test_rating_matrix,
            'num_users': num_users,
        }

    def process(self):
        """Create the per-user sequence file if it is missing, then cache the parsed result as a pickle."""
        if not os.path.exists(self.processed_paths[0]):
            df = self.load_ratings_df()
            # random sample
            # NOTE: no random_state is passed, so the subsample differs between runs.
            if self.sample_frac is not None:
                df = df.sample(frac=self.sample_frac)
            # filter based on item sequence length for user
            seq_len = df.groupby('uid').size()
            keep_users = seq_len[seq_len >= self.min_len].index
            df = df[df['uid'].isin(keep_users)]
            create_user_sequences(df, save_path=self.processed_paths[0],
                                  user_col='uid', item_col='sid', ts_col='timestamp')
        user_seqs = self.get_user_seqs()
        with open(self.processed_paths[1], 'wb') as f:
            pickle.dump(user_seqs, f)

    def load_processed(self):
        """Load the pickled output of ``process`` onto the instance."""
        # The pickle is the file written by `process()`; unpickling runs
        # arbitrary code, so never point this at a file from an untrusted source.
        with open(self.processed_paths[1], 'rb') as f:
            processed_data = pickle.load(f)
        self.user_seq = processed_data['user_seq']
        self.max_item = processed_data['max_item']
        self.valid_rating_matrix = processed_data['valid_rating_matrix']
        self.test_rating_matrix = processed_data['test_rating_matrix']
        self.num_users = processed_data['num_users']

    def __getitem__(self, index):
        """Build the tensors for the ``index``-th user.

        For a full sequence ``[0, 1, 2, 3, 4, 5, 6]`` the splits are::

            train  input [0, 1, 2, 3]        target [1, 2, 3, 4]        answer unused
            valid  input [0, 1, 2, 3, 4]     target [1, 2, 3, 4, 5]     answer [5]
            test   input [0, 1, 2, 3, 4, 5]  target [1, 2, 3, 4, 5, 6]  answer [6]

        Returns:
            A tuple of ``torch.long`` tensors ``(user_id, input_ids,
            target_pos, target_neg, answer)``, followed by the user's
            ``test_neg_items`` entry when that was supplied. The three
            sequences are left-padded with 0 / truncated to ``max_len``.
        """
        user_id = index
        items = self.user_seq[index]

        if self.data_type == "train":
            input_ids, target_pos, answer = items[:-3], items[1:-2], [0]  # answer: no use
        elif self.data_type == 'valid':
            input_ids, target_pos, answer = items[:-2], items[1:-1], [items[-2]]
        else:
            input_ids, target_pos, answer = items[:-1], items[1:], [items[-1]]

        # One negative per input position, drawn against the user's full sequence.
        seq_set = set(items)
        target_neg = [random_neg_sample(seq_set, self.item_size) for _ in input_ids]

        # Left-pad with 0, then keep only the most recent `max_len` positions.
        padding = [0] * (self.max_len - len(input_ids))
        input_ids = (padding + input_ids)[-self.max_len:]
        target_pos = (padding + target_pos)[-self.max_len:]
        target_neg = (padding + target_neg)[-self.max_len:]

        assert len(input_ids) == self.max_len
        assert len(target_pos) == self.max_len
        assert len(target_neg) == self.max_len

        fields = [
            user_id,  # user_id for testing
            input_ids,
            target_pos,
            target_neg,
            answer,
        ]
        if self.test_neg_items is not None:
            fields.append(self.test_neg_items[index])

        return tuple(torch.tensor(field, dtype=torch.long) for field in fields)

    def __len__(self):
        """Number of users."""
        return len(self.user_seq)

## SASRecDataModule

In [None]:
#export
class SASRecDataModule(LightningDataModule):
    """Lightning data module that serves train/valid/test splits of one dataset class.

    Concrete modules set ``dataset_cls`` to the dataset class to instantiate.
    Training batches are drawn with a ``RandomSampler``, validation and test
    batches with a ``SequentialSampler``.
    """

    # Dataset class instantiated for every split; concrete modules override it.
    dataset_cls = None

    def __init__(self,
                 data_dir: Optional[str] = None,
                 num_workers: int = 0,
                 batch_size: int = 32,
                 pin_memory: bool = True,
                 drop_last: bool = False,
                 *args, 
                 **kwargs) -> None:
        """
        Args:
            data_dir: Where to save/load the data (defaults to the current working directory)
            num_workers: How many workers to use for loading data
            batch_size: How many samples per batch to load
            pin_memory: If true, the data loader will copy Tensors into CUDA pinned memory before
                        returning them
            drop_last: If true drops the last incomplete batch
            **kwargs: Forwarded to ``dataset_cls`` for every split
        """
        # LightningDataModule.__init__ has no data-directory parameter: passing
        # the path positionally either lands in an unrelated argument or
        # raises TypeError, depending on the Lightning version.
        super().__init__()

        self.data_dir = data_dir if data_dir is not None else os.getcwd()
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.pin_memory = pin_memory
        self.drop_last = drop_last
        self.kwargs = kwargs

    def _build_dataset(self, **overrides):
        """Instantiate ``dataset_cls`` with the stored kwargs; ``overrides`` win on clashes."""
        if not callable(self.dataset_cls):
            raise TypeError(
                f"{type(self).__name__}.dataset_cls must be set to a dataset class"
            )
        # Merging first avoids "multiple values for keyword argument" when the
        # forwarded kwargs already contain e.g. `data_type`.
        return self.dataset_cls(self.data_dir, **{**self.kwargs, **overrides})

    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
        """Saves files to data_dir."""
        self.data = self._build_dataset()

    def setup(self, stage: Optional[str] = None) -> None:
        """Creates train, val, and test dataset."""
        if stage == "fit" or stage is None:
            self.dataset_train = self._build_dataset(data_type='train')
            self.dataset_val = self._build_dataset(data_type='valid')
        if stage == "test" or stage is None:
            self.dataset_test = self._build_dataset(data_type='test')

    def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
        """The train dataloader."""
        return self._data_loader(self.dataset_train, sampler=RandomSampler(self.dataset_train))

    def val_dataloader(self, *args: Any, **kwargs: Any) -> Union[DataLoader, List[DataLoader]]:
        """The val dataloader."""
        return self._data_loader(self.dataset_val, sampler=SequentialSampler(self.dataset_val))

    def test_dataloader(self, *args: Any, **kwargs: Any) -> Union[DataLoader, List[DataLoader]]:
        """The test dataloader."""
        return self._data_loader(self.dataset_test, sampler=SequentialSampler(self.dataset_test))

    def _data_loader(self, dataset: Dataset, sampler) -> DataLoader:
        """Wrap ``dataset`` in a ``DataLoader`` whose ordering is decided by ``sampler``."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,  # ordering comes from the sampler
            sampler=sampler,
            num_workers=self.num_workers,
            drop_last=self.drop_last,
            pin_memory=self.pin_memory,
        )

Example

In [None]:
class AmazonBeautyDataset(SASRecDataset):
    """Amazon Beauty ratings exposed as a ``SASRecDataset``."""

    url = 'https://github.com/RecoHut-Datasets/amazon_beauty/raw/v1/amazon-ratings.zip'

    @property
    def raw_file_names(self):
        return 'ratings_Beauty.csv'

    def download(self):
        """Fetch the archive, unpack it into ``raw_dir`` and delete the archive."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.unlink(path)

    def load_ratings_df(self):
        """Read the ratings CSV into a uid/sid/rating/timestamp frame with one row per user-item pair."""
        df = pd.read_csv(self.raw_paths[0])
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        # drop duplicate user-item pair records, keeping latest rating only.
        # `keep='last'` keeps the last row in frame order, so order the rows by
        # time first (stable sort: ties keep their file order).
        df = df.sort_values('timestamp', kind='stable')
        df = df.drop_duplicates(subset=['uid', 'sid'], keep='last')
        return df

In [None]:
class AmazonBeautyDataModule(SASRecDataModule):
    """Data module that serves ``AmazonBeautyDataset`` splits."""

    dataset_cls = AmazonBeautyDataset

In [None]:
class Args:
    """Example settings for the Amazon Beauty run.

    Every attribute is passed as a keyword argument to the data module, which
    forwards the ones it does not consume to the dataset. The names therefore
    have to match the dataset's parameter names exactly, otherwise they are
    silently ignored.
    """

    def __init__(self):
        self.data_dir = '/content/data'
        # Named after the SASRecDataset parameters (`min_len` / `max_len`
        # matched nothing and were dropped without effect).
        self.min_seq_length = 10
        self.max_seq_length = 50
        self.sample_frac = 0.2
        self.num_workers = 2
        self.batch_size = 32

args = Args()

In [None]:
# Build the data module from the example settings, materialise the processed
# files, then create the train/valid datasets.
dm = AmazonBeautyDataModule(**vars(args))
dm.prepare_data()
dm.setup(stage='fit')

In [None]:
# Peek at a single training batch.
train_loader = dm.train_dataloader()
for batch in train_loader:
    print(batch)
    break

[tensor([322, 157,  54, 274, 489, 415, 258, 246, 155, 116, 119, 408,  22, 287,
        421, 280, 292, 146,  14, 432,  62, 396, 302, 495, 331, 271, 370, 303,
        201, 266, 241,   1]), tensor([[   0,    0,    0,  ...,  169, 4647, 3192],
        [   0,    0,    0,  ...,   23, 4221, 1286],
        [   0,    0,    0,  ..., 4683, 4629, 4307],
        ...,
        [   0,    0,    0,  ..., 3640, 1356, 1572],
        [   0,    0,    0,  ..., 2436, 3059, 1251],
        [   0,    0,    0,  ..., 2057, 3723, 4220]]), tensor([[   0,    0,    0,  ..., 4647, 3192, 2818],
        [   0,    0,    0,  ..., 4221, 1286, 3835],
        [   0,    0,    0,  ..., 4629, 4307,  445],
        ...,
        [   0,    0,    0,  ..., 1356, 1572, 2992],
        [   0,    0,    0,  ..., 3059, 1251, 2169],
        [   0,    0,    0,  ..., 3723, 4220, 1569]]), tensor([[   0,    0,    0,  ...,  282, 5128,  622],
        [   0,    0,    0,  ..., 4801,  725,  381],
        [   0,    0,    0,  ...,  682, 2373,  345],
   

In [None]:
#hide
# Record the environment and package versions this notebook was last run with.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-22 12:13:29

recohut: 0.0.11

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torch  : 1.10.0+cu111
numpy  : 1.19.5
pandas : 1.1.5
IPython: 5.5.0

