In [None]:
# default_exp datasets.aotm

# AOTM dataset
> Session-based recommendation dataset built from the AOTM playlist file (`playlists-aotm.csv`).

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from typing import List, Optional, Callable, Union, Any, Tuple

import os
import os.path as osp
from collections.abc import Sequence
import sys

import numpy as np
import pandas as pd
from datetime import timezone, datetime, timedelta
import time

from recohut.datasets.bases.session import SessionDataset
from recohut.utils.common_utils import download_url, extract_zip, makedirs

In [None]:
#export
class AOTMDataset(SessionDataset):
    """Session dataset built from the AOTM playlist dump.

    Downloads ``aotm.zip`` from ``url``, keeps only ``playlists-aotm.csv`` in
    the raw directory, and loads it as a tab-separated event log that the
    ``SessionDataset`` base class then processes.

    Args:
        root: Dataset root directory, forwarded to ``SessionDataset``.
        process_method: Processing mode forwarded to ``SessionDataset``
            (this notebook exercises ``'last'``, ``'days_test'`` and ``'slice'``).
        min_session_length: Forwarded unchanged to ``SessionDataset``.
        min_item_support: Forwarded unchanged to ``SessionDataset``.
        num_slices: Forwarded unchanged to ``SessionDataset``.
        days_offset: Forwarded unchanged to ``SessionDataset``.
        days_shift: Forwarded unchanged to ``SessionDataset``.
        days_train: Forwarded unchanged to ``SessionDataset``.
        days_test: Forwarded unchanged to ``SessionDataset``.

    NOTE(review): in the recorded ``'slice'`` run with the default settings the
    files for slices 3 and 4 are nearly empty; presumably the data span is too
    short for 5 slices shifted by 95 days -- confirm against the base class.
    """

    url = 'https://github.com/RecoHut-Datasets/aotm/raw/v1/aotm.zip'

    def __init__(self, root, process_method, min_session_length=2, min_item_support=2,
                 num_slices=5, days_offset=0, days_shift=95, days_train=90, days_test=5):
        # The base class takes a minimum date and a session length right after
        # ``process_method``; this dataset leaves both unset.
        super().__init__(
            root, process_method,
            None,  # min_date
            None,  # session_length
            min_session_length, min_item_support, num_slices, days_offset,
            days_shift, days_train, days_test,
        )

    @property
    def raw_file_names(self) -> str:
        """Name of the CSV file expected inside ``raw_dir``."""
        return 'playlists-aotm.csv'

    @property
    def processed_file_names(self) -> str:
        """Name reported to the base class as the processed artifact."""
        return 'dataset.pkl'

    def download(self):
        """Fetch the archive and leave only the playlist CSV in ``raw_dir``."""
        import shutil

        archive = download_url(self.url, self.raw_dir)
        extract_zip(archive, self.raw_dir)

        # The zip unpacks to ``aotm/raw/playlists-aotm.csv``; lift the CSV up
        # into ``raw_dir`` and discard the extracted tree and the archive.
        extracted_root = osp.join(self.raw_dir, 'aotm')
        shutil.move(osp.join(extracted_root, 'raw', 'playlists-aotm.csv'),
                    osp.join(self.raw_dir, 'playlists-aotm.csv'))
        shutil.rmtree(extracted_root)
        os.unlink(archive)

    def load(self):
        """Read the raw CSV, order events by session and time, and keep them in ``self.data``.

        Also prints a short summary (event/session/item counts and date span).
        """
        csv_path = osp.join(self.raw_dir, self.raw_file_names)
        events = pd.read_csv(csv_path, sep='\t')
        events = events.sort_values(by=['SessionId', 'Time'])

        # ``Time`` is treated as epoch seconds and rendered in UTC for the summary.
        first_event = datetime.fromtimestamp(events.Time.min(), timezone.utc)
        last_event = datetime.fromtimestamp(events.Time.max(), timezone.utc)

        summary = 'Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'
        print(summary.format(len(events), events.SessionId.nunique(), events.ItemId.nunique(),
                             first_event.date().isoformat(), last_event.date().isoformat()))

        self.data = events

In [None]:
!rm -r /content/aotm
aotmdata = AOTMDataset(root='/content/aotm', process_method='last')

Downloading https://github.com/RecoHut-Datasets/aotm/raw/v1/aotm.zip
Extracting /content/aotm/raw/aotm.zip
Processing...


Loaded data set
	Events: 1821241
	Sessions: 93313
	Items: 765790
	Span: 2016-01-02 / 2016-12-30


Filtered data set
	Events: 1192938
	Sessions: 87654
	Items: 138815
	Span: 2016-01-02 / 2016-12-30


Full train set
	Events: 1189593
	Sessions: 87410
	Items: 138815
Test set
	Events: 3345
	Sessions: 244
	Items: 3105
Train set
	Events: 1185992
	Sessions: 87145
	Items: 138814
Validation set
	Events: 3600
	Sessions: 265
	Items: 3363


Done!


In [None]:
# List the files written under the dataset root, with cumulative directory sizes (--du) in human-readable units (-h).
!tree --du -h -C /content/aotm

[01;34m/content/aotm[00m
├── [ 83M]  [01;34mprocessed[00m
│   ├── [120K]  events_test.txt
│   ├── [ 42M]  events_train_full.txt
│   ├── [ 41M]  events_train_tr.txt
│   └── [129K]  events_train_valid.txt
└── [ 65M]  [01;34mraw[00m
    └── [ 65M]  playlists-aotm.csv

 149M used in 2 directories, 5 files


In [None]:
!rm -r /content/aotm
aotmdata = AOTMDataset(root='/content/aotm', process_method='days_test')

Downloading https://github.com/RecoHut-Datasets/aotm/raw/v1/aotm.zip
Extracting /content/aotm/raw/aotm.zip
Processing...


Loaded data set
	Events: 1821241
	Sessions: 93313
	Items: 765790
	Span: 2016-01-02 / 2016-12-30


Filtered data set
	Events: 1192938
	Sessions: 87654
	Items: 138815
	Span: 2016-01-02 / 2016-12-30


Full train set
	Events: 1176744
	Sessions: 86474
	Items: 138786
Test set
	Events: 16138
	Sessions: 1179
	Items: 12841


Done!


In [None]:
# List the files written under the dataset root, with cumulative directory sizes (--du) in human-readable units (-h).
!tree --du -h -C /content/aotm

[01;34m/content/aotm[00m
├── [ 42M]  [01;34mprocessed[00m
│   ├── [578K]  events_test.txt
│   └── [ 41M]  events_train_full.txt
└── [ 65M]  [01;34mraw[00m
    └── [ 65M]  playlists-aotm.csv

 107M used in 2 directories, 3 files


In [None]:
!rm -r /content/aotm/processed/*
aotmdata = AOTMDataset(root='/content/aotm', process_method='slice')

Processing...


Loaded data set
	Events: 1821241
	Sessions: 93313
	Items: 765790
	Span: 2016-01-02 / 2016-12-30


Filtered data set
	Events: 1192938
	Sessions: 87654
	Items: 138815
	Span: 2016-01-02 / 2016-12-30


Full data set 0
	Events: 1192938
	Sessions: 87654
	Items: 138815
	Span: 2016-01-02T23:00:00+00:00 / 2016-12-30T23:00:42+00:00
Slice data set 0
	Events: 315346
	Sessions: 23197
	Items: 95205
	Span: 2016-01-02 / 2016-04-01 / 2016-04-06
Train set 0
	Events: 298510
	Sessions: 21960
	Items: 92778
	Span: 2016-01-02 / 2016-04-01
Test set 0
	Events: 14333
	Sessions: 1213
	Items: 10851
	Span: 2016-04-01 / 2016-04-06 


Full data set 1
	Events: 1192938
	Sessions: 87654
	Items: 138815
	Span: 2016-01-02T23:00:00+00:00 / 2016-12-30T23:00:42+00:00
Slice data set 1
	Events: 310422
	Sessions: 22778
	Items: 94240
	Span: 2016-04-06 / 2016-07-05 / 2016-07-10
Train set 1
	Events: 294246
	Sessions: 21594
	Items: 91873
	Span: 2016-04-06 / 2016-07-05
Test set 1
	Events: 13733
	Sessions: 1163
	Items: 10441
	Span: 2

Done!


In [None]:
# List the files written under the dataset root, with cumulative directory sizes (--du) in human-readable units (-h).
!tree --du -h -C /content/aotm

[01;34m/content/aotm[00m
├── [ 41M]  [01;34mprocessed[00m
│   ├── [513K]  events_test.0.txt
│   ├── [491K]  events_test.1.txt
│   ├── [491K]  events_test.2.txt
│   ├── [  38]  events_test.3.txt
│   ├── [  38]  events_test.4.txt
│   ├── [ 10M]  events_train_full.0.txt
│   ├── [ 10M]  events_train_full.1.txt
│   ├── [ 10M]  events_train_full.2.txt
│   ├── [9.0M]  events_train_full.3.txt
│   └── [  38]  events_train_full.4.txt
└── [ 65M]  [01;34mraw[00m
    └── [ 65M]  playlists-aotm.csv

 107M used in 2 directories, 11 files


In [None]:
#hide
# Record author, timestamp, machine info and package versions of this run.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-22 09:23:49

recohut: 0.0.5

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5
sys    : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]

