In [None]:
# default_exp datasets.yoochoose

# Yoochoose
> Yoochoose dataset.

The dataset is session-based, and each session contains a sequence of clicks and purchases. Because the full Yoochoose dataset is very large, in some cases we use only the most recent 1/64 fraction of its training sessions, denoted Yoochoose 1/64.

Raw data: [https://www.dropbox.com/sh/n281js5mgsvao6s/AADQbYxSFVPCun5DfwtsSxeda?dl=0](https://www.dropbox.com/sh/n281js5mgsvao6s/AADQbYxSFVPCun5DfwtsSxeda?dl=0)

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
import numpy as np
import pandas as pd
import datetime
import os
import os.path as osp

from recohut.datasets.bases.session import SessionDataset
from recohut.utils.common_utils import extract_zip

In [None]:
#export
class YoochooseDataset(SessionDataset):
    """Session-based click dataset built from the Yoochoose click log.

    The raw archive is fetched from Google Drive, only the click log is kept,
    and the filtered log is split into two CSV files (train / valid).

    Args:
        root: Directory handed to the base class as the dataset root.
        min_session_length: Forwarded to the base class. When ``None`` the
            short-session filter in :meth:`process` is skipped.
        min_item_support: Forwarded to the base class. When ``None`` the
            sparse-item filter in :meth:`process` is skipped.
        eval_sec: Forwarded to the base class (default 86400, i.e. one day
            in seconds; presumably the length of the evaluation window used
            by ``split_df`` -- confirm against ``SessionDataset``).
    """

    # Google Drive file id of the zipped raw data.
    data_id = '1UEcKC4EfgMVD2n_zBvAyp0vRNyv7ndSF'

    def __init__(self,
                 root,
                 min_session_length: int = 2,
                 min_item_support: int = 5,
                 eval_sec: int = 86400,
                 ):
        super().__init__(root, min_session_length, min_item_support, eval_sec)

    @property
    def raw_file_names(self) -> str:
        """Name of the raw click-log file expected in the raw directory."""
        return 'rsc15-clicks.dat'

    @property
    def processed_file_names(self) -> list:
        """Names of the processed train and validation files, in that order."""
        return ['yoochoose_train.txt','yoochoose_valid.txt']

    def download(self):
        """Download the zipped archive and keep only the click log.

        The archive is extracted into the raw directory, the click log is
        moved out of ``rsc15/raw`` to the top of the raw directory, and both
        the extracted folder and the zip file are removed afterwards.
        """
        from google_drive_downloader import GoogleDriveDownloader as gdd
        from shutil import move, rmtree

        path = osp.join(self.raw_dir, 'rsc15.zip')
        gdd.download_file_from_google_drive(self.data_id, path)
        extract_zip(path, self.raw_dir)
        move(osp.join(self.raw_dir, 'rsc15', 'raw', self.raw_file_names),
             osp.join(self.raw_dir, self.raw_file_names))
        # Clean up everything except the click log itself.
        rmtree(osp.join(self.raw_dir, 'rsc15'))
        os.unlink(path)

    def process(self):
        """Load, filter and split the click log, then write both splits as CSV."""
        df = self.load_ratings_df()
        if self.min_session_length is not None:
            df = self.remove_short_sessions(df)
        if self.min_item_support is not None:
            df = self.remove_sparse_items(df)
        train, test = self.split_df(df)
        # processed_paths follows the order of processed_file_names:
        # index 0 is the training file, index 1 the validation file.
        train.to_csv(self.processed_paths[0], sep=',', index=False)
        test.to_csv(self.processed_paths[1], sep=',', index=False)

    def load_ratings_df(self):
        """Read the raw click log into a DataFrame.

        Only the first three columns of the header-less file are read and
        they are named ``uid``, ``timestamp`` and ``sid`` (in the Yoochoose
        click log these hold the session id, the click time and the item id;
        the column names presumably follow the base-class convention).

        Returns:
            pandas.DataFrame: columns ``uid`` (int32), ``timestamp`` (float,
            seconds since the Unix epoch in UTC) and ``sid`` (int64).
        """
        df = pd.read_csv(self.raw_paths[0], header=None, usecols=[0, 1, 2],
                         dtype={0: np.int32, 1: str, 2: np.int64})
        df.columns = ['uid', 'timestamp', 'sid']
        # The trailing "Z" marks the timestamps as UTC. Parse them as
        # timezone-aware UTC values so the resulting epoch seconds do not
        # depend on the local timezone/DST of the machine running the code
        # (calling .timestamp() on a naive datetime assumes local time).
        # Parsing the whole column at once also avoids a per-row strptime.
        parsed = pd.to_datetime(df['timestamp'],
                                format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
        epoch = pd.Timestamp('1970-01-01', tz='UTC')
        df['timestamp'] = (parsed - epoch).dt.total_seconds()
        return df

In [None]:
# Build the dataset under a Colab-local root directory; as the output below
# shows, constructing it runs the processing step and reports split statistics.
ds = YoochooseDataset(root='/content/yoochoose')

Processing...


Training Set has 31637239 Events, 7966257 Sessions, and 37483 Items


Validation Set has 71222 Events, 15324 Sessions, and 6751 Items




Done!


In [None]:
# List the dataset directory with accumulated, human-readable sizes (--du -h)
# and coloured output (-C) to check which raw/processed files were produced.
!tree --du -h -C /content/yoochoose

[01;34m/content/yoochoose[00m
├── [995M]  [01;34mprocessed[00m
│   ├── [993M]  yoochoose_train.txt
│   └── [2.3M]  yoochoose_valid.txt
└── [1.4G]  [01;34mraw[00m
    └── [1.4G]  rsc15-clicks.dat

 2.4G used in 2 directories, 3 files


In [None]:
#hide
# Record author, date/time, machine details and the versions of recohut and
# all imported packages so the run above can be reproduced.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-31 12:52:36

recohut: 0.0.8

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy  : 1.19.5
IPython: 5.5.0
pandas : 1.1.5

