In [None]:
# default_exp datasets.sample_session

# Sample Session dataset
> A small sample of a session dataset.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from typing import List, Optional, Callable, Union, Any, Tuple

import os
import os.path as osp
from collections.abc import Sequence
import sys
import csv
import pickle
import math
import operator

import numpy as np
from datetime import timezone, datetime, timedelta
import time

from recohut.datasets.bases.session import SessionDatasetv2
from recohut.datasets.bases.session_graph import SessionGraphDataset
from recohut.utils.common_utils import download_url

In [None]:
#export
class SampleDataset(SessionDatasetv2):
    """Sample session dataset backed by a single raw CSV of item views.

    The raw file is fetched from ``url`` into ``self.raw_dir``; all other
    behaviour comes from ``SessionDatasetv2``.
    """
    url = 'https://github.com/RecoHut-Datasets/sample_session/raw/v1/sample_train-item-views.csv'

    def __init__(self, root, column_names=None):
        """Create the dataset.

        Args:
            root: Root directory, forwarded unchanged to ``SessionDatasetv2``.
            column_names: Mapping forwarded unchanged to ``SessionDatasetv2``.
                The keys look like logical field names and the values like the
                matching CSV column names -- confirm against the base class.
                When omitted, a fresh copy of the standard mapping is used.
        """
        # Build the default per call instead of in the signature: a dict default
        # is created once and shared by every instance, so any mutation made
        # downstream would silently leak into later instances.
        if column_names is None:
            column_names = {'SESSION_ID': 'session_id',
                            'ITEM_ID': 'item_id',
                            'TIMEFRAME': 'timeframe',
                            'EVENT_DATE': 'eventdate'}
        super().__init__(root, column_names)

    @property
    def raw_file_names(self) -> str:
        """Name of the raw CSV file this dataset uses."""
        return 'sample_train-item-views.csv'

    def download(self):
        """Download the raw CSV from ``url`` into ``self.raw_dir``."""
        download_url(self.url, self.raw_dir)

In [None]:
# Instantiate the sample dataset rooted at /content/samplesession.
# NOTE(review): presumably the base class downloads/processes the raw CSV on
# construction (a later cell reads <root>/processed/train.txt) -- confirm.
ds = SampleDataset(root='/content/samplesession')

In [None]:
#exporti
def data_masks(all_usr_pois, item_tail):
    """Pad variable-length sequences to a common length and build 0/1 masks.

    Args:
        all_usr_pois: List of sequences (lists) to pad.
        item_tail: List holding the padding value(s), e.g. ``[0]``. It is
            repeated ``len_max - len(seq)`` times and appended to each sequence.

    Returns:
        A tuple ``(us_pois, us_msks, len_max)`` where ``us_pois`` are the padded
        sequences, ``us_msks`` has 1 at original positions and 0 at padded
        positions, and ``len_max`` is the longest original sequence length.
        An empty ``all_usr_pois`` yields ``([], [], 0)``.
    """
    us_lens = [len(upois) for upois in all_usr_pois]
    # default=0 keeps an empty input from raising ValueError inside max().
    len_max = max(us_lens, default=0)
    us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)]
    us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens]
    return us_pois, us_msks, len_max

In [None]:
#exporti
def split_validation(train_set, valid_portion, seed=None):
    """Randomly split a ``(x, y)`` dataset into training and validation parts.

    Args:
        train_set: Pair ``(xs, ys)`` of equally long, index-aligned sequences.
        valid_portion: Fraction of the samples assigned to the validation part.
        seed: Optional seed for a private RNG so the split is reproducible.
            When ``None`` (the default) NumPy's global RNG is used, exactly as
            before this parameter existed.

    Returns:
        ``((train_x, train_y), (valid_x, valid_y))`` as lists; each ``x`` stays
        paired with its ``y``.
    """
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = np.arange(n_samples, dtype='int32')
    # Fall back to the global RNG when no seed is given so existing callers that
    # rely on np.random.seed(...) keep getting the same permutation.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(sidx)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    train_idx, valid_idx = sidx[:n_train], sidx[n_train:]
    valid_set_x = [train_set_x[s] for s in valid_idx]
    valid_set_y = [train_set_y[s] for s in valid_idx]
    train_set_x = [train_set_x[s] for s in train_idx]
    train_set_y = [train_set_y[s] for s in train_idx]

    return (train_set_x, train_set_y), (valid_set_x, valid_set_y)

In [None]:
# Load the processed training split written under the dataset root.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('/content/samplesession/processed/train.txt', 'rb') as train_file:
    # The context manager closes the handle; the bare open() call leaked it.
    train_data = pickle.load(train_file)
train_data[0][:10]

[[1, 2], [1], [4], [6], [8, 9], [8], [10, 11, 11], [10, 11], [10], [12]]

In [None]:
# Number of entries in the first element of train_data.
len(train_data[0])

1205

In [None]:
# Hold out 10% of the loaded data; the held-out part is reused as the test set.
train_data, valid_data = split_validation(train_data, valid_portion=0.1)
test_data = valid_data

# NOTE(review): `base` is not imported anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel -- add the import for the module that
# provides GraphDataset (exact module path to be confirmed).
train_data = base.GraphDataset(train_data, shuffle=True)
test_data = base.GraphDataset(test_data, shuffle=False)

In [None]:
# Show the first five batches returned for an argument of 10 (each displayed
# batch is an array of 10 indices).
train_data.generate_batch(10)[:5]

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
 array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39]),
 array([40, 41, 42, 43, 44, 45, 46, 47, 48, 49])]

In [None]:
#export
class SampleDatasetv2(SessionGraphDataset):
    """Sample session-graph dataset with separate train and test raw files.

    Each instance uses either ``train.txt`` or ``test.txt`` (chosen by
    ``is_train``) together with ``all_train_seq.txt``.
    """
    train_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=309, is_train=True):
        """Create the dataset.

        Args:
            root: Root directory, forwarded to ``SessionGraphDataset``.
            shuffle: Forwarded to ``SessionGraphDataset``.
            n_node: Forwarded to ``SessionGraphDataset`` (default 309).
            is_train: Select the training file when true, else the test file.
        """
        # Attributes are assigned before super().__init__ because
        # raw_file_names/download read self.is_train.
        # NOTE(review): presumably the base initialiser calls those hooks -- confirm.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> List[str]:
        """Raw file names for the selected split, followed by ``all_train_seq.txt``."""
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Download ``all_train_seq.txt`` and the selected split file into ``self.raw_dir``."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)

In [None]:
# Both splits share one root directory; is_train selects which file is used.
root = '/content/samplesessionv2'

# Training split is shuffled, test split is not.
train_data = SampleDatasetv2(root=root, shuffle=True, is_train=True)
test_data = SampleDatasetv2(root=root, shuffle=False, is_train=False)

Downloading https://github.com/RecoHut-Datasets/sample_session/raw/v2/all_train_seq.txt
Downloading https://github.com/RecoHut-Datasets/sample_session/raw/v2/train.txt
  return array(a, dtype, copy=False, order=order)
Using existing file all_train_seq.txt
Downloading https://github.com/RecoHut-Datasets/sample_session/raw/v2/test.txt
  return array(a, dtype, copy=False, order=order)


In [None]:
#hide
# Record author, timestamp, machine info and package versions for this run.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-30 07:07:57

recohut: 0.0.8

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

sys    : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
numpy  : 1.19.5
IPython: 5.5.0
csv    : 1.0
recohut: 0.0.8

