In [None]:
# default_exp datasets.music30

# Music30 dataset
> Session-based recommendation dataset built from the 30Music 200k-session sample (`30music-200ks.csv`).

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from typing import List, Optional, Callable, Union, Any, Tuple

import os
import os.path as osp
from collections.abc import Sequence
import sys

import numpy as np
import pandas as pd
from datetime import timezone, datetime, timedelta
import time

from recohut.datasets.base import SessionDataset
from recohut.utils.common_utils import download_url, extract_zip, makedirs

In [None]:
#export
class Music30Dataset(SessionDataset):
    """Session dataset backed by the 30Music 200k-session sample.

    The raw archive is fetched from ``url``; only the tab-separated event log
    named by ``raw_file_names`` is kept in ``raw_dir``. All filtering and
    splitting is delegated to ``SessionDataset``.

    Args:
        root: Dataset root directory, forwarded to ``SessionDataset``.
        process_method: Processing/splitting strategy, forwarded to
            ``SessionDataset`` (the notebook uses ``'last'``, ``'days_test'``
            and ``'slice'``).
        min_session_length, min_item_support, num_slices, days_offset,
        days_shift, days_train, days_test: Forwarded unchanged to
            ``SessionDataset``.
    """

    url = 'https://github.com/RecoHut-Datasets/30music/raw/v1/30music.zip'

    def __init__(self, root, process_method, min_session_length=2, min_item_support=2,
                 num_slices=5, days_offset=0, days_shift=95, days_train=90, days_test=5):
        # This dataset always passes None for the base class's
        # ``min_date`` and ``session_length`` arguments.
        min_date = session_length = None
        super().__init__(root, process_method, min_date, session_length,
                         min_session_length, min_item_support, num_slices, days_offset,
                         days_shift, days_train, days_test)

    @property
    def raw_file_names(self) -> str:
        """Name of the raw event log expected inside ``raw_dir``."""
        return '30music-200ks.csv'

    @property
    def processed_file_names(self) -> str:
        """Name of the processed artifact reported to the base class."""
        # NOTE(review): the directory listings captured in this notebook show
        # only ``events_*.txt`` files under ``processed/`` and no
        # ``dataset.pkl`` -- confirm against ``SessionDataset`` whether this
        # name is what the base class checks to decide on re-processing.
        return 'dataset.pkl'

    def download(self):
        """Download the archive and keep only the raw event log in ``raw_dir``.

        The zip is extracted into ``raw_dir``, the CSV is moved up from the
        archive's ``30music/raw/`` folder, and both the extracted folder and
        the zip itself are then removed.
        """
        from shutil import move, rmtree

        archive_path = download_url(self.url, self.raw_dir)
        extract_zip(archive_path, self.raw_dir)

        extracted_dir = osp.join(self.raw_dir, '30music')
        # Use ``raw_file_names`` for both ends of the move so the file that is
        # downloaded is always the one ``load`` later reads.
        move(osp.join(extracted_dir, 'raw', self.raw_file_names),
             osp.join(self.raw_dir, self.raw_file_names))

        # Drop the leftovers: the extracted folder and the downloaded zip.
        rmtree(extracted_dir)
        os.unlink(archive_path)

    def load(self):
        """Read the raw event log, print a summary and store it on ``self.data``.

        The file is read as tab-separated values and sorted by ``SessionId``
        then ``Time``. ``Time`` is interpreted as Unix epoch seconds when
        computing the reported date span.

        Raises:
            ValueError: If the raw file contains no events.
        """
        raw_path = osp.join(self.raw_dir, self.raw_file_names)
        data = pd.read_csv(raw_path, sep='\t')

        # Without this guard an empty file fails below with an opaque
        # "cannot convert float NaN to integer" ValueError.
        if data.empty:
            raise ValueError('No events found in {}'.format(raw_path))

        data = data.sort_values(by=['SessionId', 'Time'])

        # Date span of the log, reported in UTC.
        data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
        data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

        print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
            format(len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat()))

        self.data = data

In [None]:
# First build with the 'last' method: the output below shows the archive being
# downloaded and extracted, followed by full-train/test and train/validation
# statistics.
# NOTE(review): the root is an absolute Colab path -- adjust when running elsewhere.
musicdata = Music30Dataset(root='/content/music30', process_method='last')

Downloading https://github.com/RecoHut-Datasets/30music/raw/v1/30music.zip
Extracting /content/music30/raw/30music.zip
Processing...


Loaded data set
	Events: 3707857
	Sessions: 200000
	Items: 1203432
	Span: 2014-01-20 / 2015-01-20


Filtered data set
	Events: 2953382
	Sessions: 190216
	Items: 452855
	Span: 2014-01-20 / 2015-01-20


Full train set
	Events: 2892862
	Sessions: 186627
	Items: 450895
Test set
	Events: 54606
	Sessions: 3468
	Items: 35100
Train set
	Events: 2847481
	Sessions: 183674
	Items: 449290
Validation set
	Events: 41785
	Sessions: 2852
	Items: 29293


Done!


In [None]:
# Re-run with identical arguments: the raw file is already present, so the
# output shows no download, but processing runs again.
# NOTE(review): the filtered counts below (2149666 events) differ from the
# previous cell's output (2953382) despite identical arguments -- presumably the
# class or its defaults changed between executions; confirm and re-run in order.
musicdata = Music30Dataset(root='/content/music30', process_method='last')

Processing...


Loaded data set
	Events: 3707857
	Sessions: 200000
	Items: 1203432
	Span: 2014-01-20 / 2015-01-20


Filtered data set
	Events: 2149666
	Sessions: 165766
	Items: 139016
	Span: 2014-01-20 / 2015-01-20


Full train set
	Events: 2105847
	Sessions: 162634
	Items: 138861
Test set
	Events: 41871
	Sessions: 3091
	Items: 23508
Train set
	Events: 2073194
	Sessions: 160047
	Items: 138755
Validation set
	Events: 31937
	Sessions: 2564
	Items: 20210


Done!


In [None]:
# Inspect the files and sizes under the dataset root after 'last' processing.
!tree --du -h -C /content/music30

[01;34m/content/music30[00m
├── [157M]  [01;34mprocessed[00m
│   ├── [1.6M]  events_test.txt
│   ├── [ 78M]  events_train_full.txt
│   ├── [ 77M]  events_train_tr.txt
│   └── [1.2M]  events_train_valid.txt
└── [137M]  [01;34mraw[00m
    └── [137M]  30music-200ks.csv

 295M used in 2 directories, 5 files


In [None]:
# Clear the previously processed files, then rebuild with the 'days_test'
# method; the output below reports only a full train set and a test set.
!rm /content/music30/processed/*
musicdata = Music30Dataset(root='/content/music30', process_method='days_test')

Processing...


Loaded data set
	Events: 3707857
	Sessions: 200000
	Items: 1203432
	Span: 2014-01-20 / 2015-01-20


Filtered data set
	Events: 2149666
	Sessions: 165766
	Items: 139016
	Span: 2014-01-20 / 2015-01-20


Full train set
	Events: 2073194
	Sessions: 160047
	Items: 138755
Test set
	Events: 73532
	Sessions: 5652
	Items: 36423


Done!


In [None]:
# Inspect the files and sizes under the dataset root after 'days_test' processing.
!tree --du -h -C /content/music30

[01;34m/content/music30[00m
├── [ 79M]  [01;34mprocessed[00m
│   ├── [2.7M]  events_test.txt
│   └── [ 77M]  events_train_full.txt
└── [137M]  [01;34mraw[00m
    └── [137M]  30music-200ks.csv

 217M used in 2 directories, 3 files


In [None]:
# Clear the previously processed files, then rebuild with the 'slice' method.
# NOTE(review): the output below prints no split statistics and the following
# directory listing shows an empty processed/ folder -- confirm whether 'slice'
# is expected to write files.
!rm /content/music30/processed/*
musicdata = Music30Dataset(root='/content/music30', process_method='slice')

Processing...


Loaded data set
	Events: 3707857
	Sessions: 200000
	Items: 1203432
	Span: 2014-01-20 / 2015-01-20


Filtered data set
	Events: 2149666
	Sessions: 165766
	Items: 139016
	Span: 2014-01-20 / 2015-01-20




Done!


In [None]:
# Inspect the files and sizes under the dataset root after 'slice' processing.
!tree --du -h -C /content/music30

[01;34m/content/music30[00m
├── [4.0K]  [01;34mprocessed[00m
└── [137M]  [01;34mraw[00m
    └── [137M]  30music-200ks.csv

 137M used in 2 directories, 1 file


In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-22 09:23:49

recohut: 0.0.5

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5
sys    : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]

