In [None]:
# default_exp datasets.amazon_beauty

# Amazon Beauty
> Amazon Beauty ratings dataset and data module, built on the SASRec sequential-recommendation base classes.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable

import os
import pandas as pd

from recohut.utils.common_utils import *

from recohut.datasets.bases.sequential import SASRecDataset, SASRecDataModule

In [None]:
#export
class AmazonBeautyDataset(SASRecDataset):
    """Amazon Beauty ratings dataset on top of ``SASRecDataset``.

    The raw data is a zip archive fetched from ``url``; after extraction the
    ratings are read from ``ratings_Beauty.csv``. Everything beyond
    downloading and loading the ratings frame is handled by the base class.
    """

    url = 'https://github.com/RecoHut-Datasets/amazon_beauty/raw/v1/amazon-ratings.zip'

    @property
    def raw_file_names(self):
        """Name of the raw ratings CSV.

        NOTE(review): presumably the base class resolves ``self.raw_paths``
        from this name inside ``self.raw_dir`` -- confirm against
        ``SASRecDataset``.
        """
        return 'ratings_Beauty.csv'

    def download(self):
        """Download the archive into ``self.raw_dir``, extract it there, then delete the archive."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        # The zip is no longer needed once its contents are extracted.
        os.unlink(path)

    def load_ratings_df(self):
        """Load the ratings CSV with one row per (user, item) pair.

        Returns:
            pandas.DataFrame: columns ``uid``, ``sid``, ``rating``,
            ``timestamp``. When a user rated the same item several times only
            the rating with the greatest timestamp is kept (ties are resolved
            in favour of the row that appears later in the file). Surviving
            rows keep their original relative order and index labels.
        """
        df = pd.read_csv(self.raw_paths[0])
        # Assumes the CSV has exactly four columns in this order -- the
        # assignment raises if the column count differs.
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        # drop duplicate user-item pair records, keeping latest rating only.
        # ``keep='last'`` alone would keep the last row in *file* order, so
        # order by timestamp first (mergesort is stable, so equal timestamps
        # preserve file order), then restore the original row order.
        df = (
            df.sort_values('timestamp', kind='mergesort')
              .drop_duplicates(subset=['uid', 'sid'], keep='last')
              .sort_index()
        )
        return df

In [None]:
#export
class AmazonBeautyDataModule(SASRecDataModule):
    # Data module for the Amazon Beauty ratings; all behavior is inherited
    # from SASRecDataModule, only the dataset class is specified here.

    # NOTE(review): presumably the base data module instantiates this class
    # to build its datasets -- confirm against SASRecDataModule.
    dataset_cls = AmazonBeautyDataset

Example: build the data module and inspect one training batch.

In [None]:
class Args:
    """Plain attribute container holding the settings used by this example."""

    def __init__(self):
        # Insertion order matters only for readability; every entry becomes
        # an instance attribute of the same name.
        settings = {
            'data_dir': '/content/data',
            'min_len': 10,
            'max_len': 50,
            'sample_frac': 0.2,
            'num_workers': 2,
            'batch_size': 32,
        }
        for name, value in settings.items():
            setattr(self, name, value)

args = Args()

In [None]:
# Every attribute of `args` is forwarded to the data module as a keyword argument.
dm = AmazonBeautyDataModule(**vars(args))
# Run the preparation step first, then set up the module for the 'fit' stage.
dm.prepare_data()
dm.setup(stage='fit')

Downloading https://github.com/RecoHut-Datasets/amazon_beauty/raw/v1/amazon-ratings.zip
Extracting /content/data/raw/amazon-ratings.zip
Processing...
Done!


In [None]:
# Peek at the training data: print only the first batch, then stop iterating.
for batch in dm.train_dataloader():
    print(batch)
    # One batch is enough to inspect the structure; skip the rest of the epoch.
    break

[tensor([394, 421, 256, 119, 320, 434,  30, 431,  54,  67, 415, 277, 254, 321,
        290,  95, 206, 275, 391, 463, 323, 105, 399, 288,  14, 120, 147,  47,
         82, 437, 412, 184]), tensor([[   0,    0,    0,  ..., 4183, 1750,  690],
        [   0,    0,    0,  ..., 3153, 5408, 5208],
        [   0,    0,    0,  ..., 1905, 5639,  614],
        ...,
        [   0,    0,    0,  ..., 4549, 1287,  282],
        [   0,    0,    0,  ..., 3507, 5045, 2892],
        [   0,    0,    0,  ...,  279, 4246, 2803]]), tensor([[   0,    0,    0,  ..., 1750,  690, 1826],
        [   0,    0,    0,  ..., 5408, 5208, 2940],
        [   0,    0,    0,  ..., 5639,  614, 1146],
        ...,
        [   0,    0,    0,  ..., 1287,  282, 1651],
        [   0,    0,    0,  ..., 5045, 2892, 1006],
        [   0,    0,    0,  ..., 4246, 2803,  737]]), tensor([[   0,    0,    0,  ..., 3246, 5423, 3585],
        [   0,    0,    0,  ...,   89, 1891,   60],
        [   0,    0,    0,  ..., 3051, 5472, 1113],
   

In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

pandas 1.1.5
Sparsh A. 
last updated: 2022-01-22 12:31:25 

recohut 0.0.11

compiler   : GCC 7.5.0
system     : Linux
release    : 5.4.144+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
