In [None]:
# default_exp datasets.nowplaying

# NowPlaying dataset
> Session dataset wrapper for the NowPlaying event log: downloads the raw CSV, loads it, and writes train/test splits.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from typing import List, Optional, Callable, Union, Any, Tuple

import os
import os.path as osp
from collections.abc import Sequence
import sys

import numpy as np
import pandas as pd
from datetime import timezone, datetime, timedelta
import time

from recohut.datasets.bases.session import SessionDataset
from recohut.utils.common_utils import download_url, extract_zip, makedirs

In [None]:
#export
class NowPlayingDataset(SessionDataset):
    """Session dataset backed by the NowPlaying event log.

    The raw archive is fetched from ``url`` and reduced to a single
    tab-separated file, ``nowplaying.csv``, inside ``self.raw_dir``.
    ``load`` reads that file into ``self.data``; everything else (filtering,
    splitting, writing processed files) is inherited from ``SessionDataset``.

    Args:
        root: Dataset root directory, forwarded to ``SessionDataset``.
        process_method: Split strategy, forwarded to ``SessionDataset``
            (the demo cells in this notebook use ``'last'``, ``'days_test'``
            and ``'slice'``).
        min_session_length: Forwarded unchanged to ``SessionDataset``.
        min_item_support: Forwarded unchanged to ``SessionDataset``.
        num_slices: Forwarded unchanged to ``SessionDataset``.
        days_offset: Forwarded unchanged to ``SessionDataset``.
        days_shift: Forwarded unchanged to ``SessionDataset``.
        days_train: Forwarded unchanged to ``SessionDataset``.
        days_test: Forwarded unchanged to ``SessionDataset``.
    """

    url = 'https://github.com/RecoHut-Datasets/nowplaying/raw/v2/nowplaying.zip'

    def __init__(self, root, process_method, min_session_length=2, min_item_support=2,
                 num_slices=5, days_offset=0, days_shift=95, days_train=90, days_test=5):
        # This dataset does not expose these two base-class options; both are
        # always passed to the parent as None.
        min_date = session_length = None
        super().__init__(root, process_method, min_date, session_length,
                         min_session_length, min_item_support, num_slices, days_offset,
                         days_shift, days_train, days_test)

    @property
    def raw_file_names(self) -> str:
        """Name of the raw CSV file expected inside ``self.raw_dir``."""
        return 'nowplaying.csv'

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file reported to the base class."""
        return 'dataset.pkl'

    def download(self):
        """Download the archive and leave only ``nowplaying.csv`` in ``self.raw_dir``.

        The CSV is moved out of the extracted ``nowplaying/raw/`` folder, the
        extracted folder is removed, and the downloaded archive is deleted.
        The archive is deleted even when extraction or the move fails, so a
        broken download is not left behind in the raw directory.
        """
        from shutil import move, rmtree

        path = download_url(self.url, self.raw_dir)
        extracted_dir = osp.join(self.raw_dir, 'nowplaying')
        try:
            extract_zip(path, self.raw_dir)
            move(osp.join(extracted_dir, 'raw', 'nowplaying.csv'),
                 osp.join(self.raw_dir, 'nowplaying.csv'))
            rmtree(extracted_dir)
        finally:
            # Guarded so a missing archive cannot mask the original exception.
            if osp.exists(path):
                os.unlink(path)

    def load(self):
        """Read the raw CSV into ``self.data`` and print a short summary.

        The file is parsed as tab-separated text and sorted by ``SessionId``
        and then ``Time``. ``Time`` is interpreted as a POSIX timestamp (UTC)
        when computing the printed date span.

        Raises:
            ValueError: If the raw file contains no events.
        """
        csv_path = osp.join(self.raw_dir, self.raw_file_names)
        data = pd.read_csv(csv_path, sep='\t')

        # Without this check an empty file fails later inside
        # datetime.fromtimestamp(nan) with a message that hides the real cause.
        if data.empty:
            raise ValueError('No events found in {}'.format(csv_path))

        # Assignment instead of inplace=True: same result, no hidden mutation.
        data = data.sort_values(by=['SessionId', 'Time'])

        # Summary output
        data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
        data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

        print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
              format(len(data), data.SessionId.nunique(), data.ItemId.nunique(),
                     data_start.date().isoformat(), data_end.date().isoformat()))

        self.data = data

In [None]:
# Delete any previously processed files, then construct the dataset with
# process_method='last'. The recorded output below shows full-train, test,
# train and validation sets being produced.
# NOTE(review): the root is a hard-coded absolute path (looks like a Colab
# workspace) — adjust it when running elsewhere.
!rm -r /content/nowplaying/processed/*
npdata = NowPlayingDataset(root='/content/nowplaying', process_method='last')

Downloading https://github.com/RecoHut-Datasets/nowplaying/raw/v2/nowplaying.zip
Extracting /content/nowplaying/raw/nowplaying.zip
Processing...


Loaded data set
	Events: 1587776
	Sessions: 156958
	Items: 258322
	Span: 2014-01-03 / 2015-06-22


Filtered data set
	Events: 1471664
	Sessions: 153383
	Items: 145045
	Span: 2014-01-03 / 2015-06-22


Full train set
	Events: 1469377
	Sessions: 153168
	Items: 145037
Test set
	Events: 2262
	Sessions: 215
	Items: 1841
Train set
	Events: 1466700
	Sessions: 152892
	Items: 145019
Validation set
	Events: 2634
	Sessions: 276
	Items: 2238


Done!


In [None]:
!tree --du -h -C /content/nowplaying

[01;34m/content/nowplaying[00m
├── [ 96M]  [01;34mprocessed[00m
│   ├── [ 76K]  events_test.txt
│   ├── [ 48M]  events_train_full.txt
│   ├── [ 48M]  events_train_tr.txt
│   └── [ 89K]  events_train_valid.txt
└── [ 54M]  [01;34mraw[00m
    └── [ 54M]  nowplaying.csv

 150M used in 2 directories, 5 files


In [None]:
# Delete the processed files from the previous run, then construct the dataset
# with process_method='days_test'. The recorded output below shows only a
# full-train set and a test set (no validation split).
!rm -r /content/nowplaying/processed/*
npdata = NowPlayingDataset(root='/content/nowplaying', process_method='days_test')

Processing...


Loaded data set
	Events: 1587776
	Sessions: 156958
	Items: 258322
	Span: 2014-01-03 / 2015-06-22


Filtered data set
	Events: 1471664
	Sessions: 153383
	Items: 145045
	Span: 2014-01-03 / 2015-06-22


Full train set
	Events: 1459201
	Sessions: 152086
	Items: 144863
Test set
	Events: 11993
	Sessions: 1286
	Items: 8449


Done!


In [None]:
!tree --du -h -C /content/nowplaying

[01;34m/content/nowplaying[00m
├── [ 48M]  [01;34mprocessed[00m
│   ├── [406K]  events_test.txt
│   └── [ 48M]  events_train_full.txt
└── [ 54M]  [01;34mraw[00m
    └── [ 54M]  nowplaying.csv

 102M used in 2 directories, 3 files


In [None]:
# Delete the processed files from the previous run, then construct the dataset
# with process_method='slice'. The recorded output below shows a train/test
# pair reported per numbered slice.
!rm -r /content/nowplaying/processed/*
npdata = NowPlayingDataset(root='/content/nowplaying', process_method='slice')

Processing...


Loaded data set
	Events: 1587776
	Sessions: 156958
	Items: 258322
	Span: 2014-01-03 / 2015-06-22


Filtered data set
	Events: 1471664
	Sessions: 153383
	Items: 145045
	Span: 2014-01-03 / 2015-06-22


Full data set 0
	Events: 1471664
	Sessions: 153383
	Items: 145045
	Span: 2014-01-03T00:56:05+00:00 / 2015-06-22T01:09:22+00:00
Slice data set 0
	Events: 181916
	Sessions: 19510
	Items: 57310
	Span: 2014-01-03 / 2014-04-03 / 2014-04-08
Train set 0
	Events: 162294
	Sessions: 17418
	Items: 53350
	Span: 2014-01-03 / 2014-04-03
Test set 0
	Events: 14641
	Sessions: 1930
	Items: 8273
	Span: 2014-04-03 / 2014-04-08 


Full data set 1
	Events: 1471664
	Sessions: 153383
	Items: 145045
	Span: 2014-01-03T00:56:05+00:00 / 2015-06-22T01:09:22+00:00
Slice data set 1
	Events: 306065
	Sessions: 33152
	Items: 76007
	Span: 2014-04-08 / 2014-07-07 / 2014-07-12
Train set 1
	Events: 289659
	Sessions: 31264
	Items: 73786
	Span: 2014-04-08 / 2014-07-07
Test set 1
	Events: 13730
	Sessions: 1812
	Items: 8835
	Span:

Done!


In [None]:
!tree --du -h -C /content/nowplaying

[01;34m/content/nowplaying[00m
├── [ 43M]  [01;34mprocessed[00m
│   ├── [486K]  events_test.0.txt
│   ├── [458K]  events_test.1.txt
│   ├── [474K]  events_test.2.txt
│   ├── [  36]  events_test.3.txt
│   ├── [415K]  events_test.4.txt
│   ├── [5.3M]  events_train_full.0.txt
│   ├── [9.5M]  events_train_full.1.txt
│   ├── [9.7M]  events_train_full.2.txt
│   ├── [7.1M]  events_train_full.3.txt
│   └── [9.2M]  events_train_full.4.txt
└── [ 54M]  [01;34mraw[00m
    └── [ 54M]  nowplaying.csv

  96M used in 2 directories, 11 files


In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-22 09:23:49

recohut: 0.0.5

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5
sys    : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]

