In [None]:
# default_exp datasets.torch.session

# Session (torch) Datasets
> Session-based recommendation datasets in PyTorch Dataset format.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import pandas as pd
import csv
from collections import defaultdict

import recohut
from recohut.utils.common_utils import download_url

import torch
from torch.utils import data

In [None]:
#export
class Dataset(data.Dataset):
    """Session-based recommendation dataset read from a CSV of item-id sequences.

    Every line of the input file is one session written as comma-separated
    integer item ids. Item ids are assumed to start at 1; 0 is used as the
    padding value. Sessions with fewer than 5 items are dropped.

    The sessions are shuffled and split into train/validation/test parts by
    ``data_partition``. An instance serves either the train part
    (``is_train=True``) or the test part (``is_train=False``); the validation
    part only contributes to the printed statistics.
    """

    def __init__(self, fpath, maxlen, is_train=True, seed=None):
        """Load, split and summarise the sessions stored in ``fpath``.

        Args:
            fpath: path of the CSV file (one session per line).
            maxlen: length every session is cropped/left-padded to.
            is_train: serve the train split when True, the test split otherwise.
            seed: optional seed for the shuffle that precedes the split. With
                the default ``None`` the global NumPy random state is used, so
                two separately constructed instances (e.g. one train, one test)
                are shuffled independently and their splits may overlap; pass
                the same seed to both to get consistent, disjoint splits.

        Raises:
            ValueError: if the file contains no session with at least 5 items.
        """
        train, valid, test, itemnum = self.data_partition(fpath, seed=seed)
        n_sessions = len(train) + len(valid) + len(test)
        print("Number of sessions:", n_sessions)
        print("Number of items:", itemnum)

        # Zeros are padding, so only non-zero entries count as real actions.
        action = 0
        for split in (train, valid, test):
            for session in split:
                action += np.count_nonzero(session)

        print("Number of actions:", action)
        print("Average length of sessions:", action/n_sessions)

        self.data = train if is_train else test
        self.maxlen = maxlen
        self.itemnum = itemnum
        self.is_train = is_train

    def __len__(self):
        """Number of sessions in the served split."""
        return len(self.data)

    def __train__(self, index):
        """Return ``(input_seq, target_seq)`` for next-item training.

        The session is cropped to its last ``maxlen`` items or left-padded
        with zeros up to ``maxlen``; the target is the input shifted by one
        position, so both arrays have length ``maxlen - 1``.
        """
        session = np.asarray(self.data[index], dtype=np.int64)
        if len(session) > self.maxlen:
            session = session[-self.maxlen:]
        else:
            session = np.pad(session, (self.maxlen-len(session), 0), 'constant', constant_values=0)
        curr_seq = session[:-1]
        curr_pos = session[1:]
        return curr_seq, curr_pos

    def __test__(self, index):
        """Return ``(history, target)`` for evaluation.

        ``history`` holds every item except the last one, right-aligned in a
        zero-initialised array of length ``maxlen`` (older items are dropped
        when the history is longer). ``target`` is the last item id minus one,
        i.e. its zero-based index in the list of all items.
        """
        session = self.data[index]
        seq = np.zeros([self.maxlen], dtype=np.int64)
        history = np.asarray(session[:-1], dtype=np.int64)
        if len(history) > self.maxlen:
            # keep only the most recent `maxlen` items
            history = history[len(history) - self.maxlen:]
        seq[self.maxlen - len(history):] = history
        return seq, session[-1]-1 #index of the item in the list of all items

    def __getitem__(self, index):
        """Dispatch to the train or test sample builder depending on ``is_train``."""
        if self.is_train:
            return self.__train__(index)
        return self.__test__(index)

    @staticmethod
    def _as_session_array(session_list):
        """Pack a list of sessions into a NumPy array without ragged-array errors.

        Equal-length sessions become a regular 2-D array (what ``np.asarray``
        always produced for them). Sessions of differing lengths become a 1-D
        object array whose elements are the original lists; building it
        explicitly avoids the ragged-sequence deprecation warning of older
        NumPy releases and the ``ValueError`` raised by newer ones.
        """
        if len({len(session) for session in session_list}) <= 1:
            return np.asarray(session_list)
        ragged = np.empty(len(session_list), dtype=object)
        for pos, session in enumerate(session_list):
            ragged[pos] = session
        return ragged

    @staticmethod
    def data_partition(fname, percentage=(0.1, 0.2), seed=None):
        """Read sessions from ``fname`` and split them into train/valid/test.

        Args:
            fname: path of the CSV file (one comma-separated session per line).
                Blank lines are ignored; sessions shorter than 5 items are dropped.
            percentage: ``(valid_fraction, test_fraction)`` of the sessions.
            seed: optional seed for the shuffle; ``None`` uses the global
                NumPy random state.

        Returns:
            ``[train, valid, test, itemnum]`` where the first three are NumPy
            arrays of sessions and ``itemnum`` is the largest item id seen.

        Raises:
            ValueError: if no session with at least 5 items is found.
        """
        itemnum = 0
        sessions = []
        total_length = 0
        max_length = 0

        # assume user/item index starting from 1
        with open(fname, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines (e.g. a trailing newline)

                items = [int(token) for token in line.split(',')]
                if len(items) < 5:
                    continue

                total_length += len(items)
                max_length = max(max_length, len(items))
                itemnum = max(max(items), itemnum)
                sessions.append(items)

        total_sessions = len(sessions)
        if total_sessions == 0:
            raise ValueError(
                "No session with at least 5 items found in {!r}".format(fname))

        print("Avg length:", total_length/total_sessions)
        print("Maximum length:", max_length)

        valid_perc = percentage[0]
        test_perc = percentage[1]

        if seed is None:
            shuffle_indices = np.random.permutation(range(total_sessions))
        else:
            shuffle_indices = np.random.RandomState(seed).permutation(range(total_sessions))

        train_index = int(total_sessions*(1 - valid_perc - test_perc))
        valid_index = int(total_sessions*(1 - test_perc))

        if train_index == valid_index:
            valid_index += 1 #break the tie

        session_train = [sessions[i] for i in shuffle_indices[:train_index]]
        session_valid = [sessions[i] for i in shuffle_indices[train_index:valid_index]]
        session_test = [sessions[i] for i in shuffle_indices[valid_index:]]

        return [Dataset._as_session_array(session_train),
                Dataset._as_session_array(session_valid),
                Dataset._as_session_array(session_test),
                itemnum]

    @staticmethod
    def nextitnet_format(fname, maxlen):
        """Read sessions from ``fname`` as fixed-length, left-padded int32 arrays.

        Sessions shorter than 5 items are dropped and blank lines are ignored.
        Each kept session is right-aligned in a zero array of length
        ``maxlen``; only its last ``maxlen`` items are kept when it is longer.

        Returns:
            A list with one ``np.int32`` array of length ``maxlen`` per session.
        """
        sessions = []

        # assume user/item index starting from 1
        with open(fname, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                items = [int(token) for token in line.split(',')]
                if len(items) < 5:
                    continue

                if len(items) > maxlen:
                    items = items[len(items) - maxlen:]

                seq = np.zeros([maxlen], dtype=np.int32)
                seq[maxlen - len(items):] = items
                sessions.append(seq)

        print("number of session:", len(sessions))

        return sessions

    @staticmethod
    def gru_format(fname, user_train, user_valid, user_test):
        """Flatten session splits into ``SessionId``/``ItemId``/``Time`` frames.

        Session ids are assigned consecutively across train, then valid, then
        test, so they are unique over all three frames. ``Time`` is always 0.

        Args:
            fname: unused; kept for interface compatibility.
            user_train, user_valid, user_test: iterables of sessions, each
                session being an iterable of item ids.

        Returns:
            ``(train_data, valid_data, test_data)`` as pandas DataFrames.
        """
        columns = ['SessionId', 'ItemId', 'Time']
        frames = []
        session_id = 0
        for split in (user_train, user_valid, user_test):
            rows = []
            for session in split:
                rows.extend([session_id, item, 0] for item in session)
                session_id += 1
            frames.append(pd.DataFrame(rows, columns=columns))

        train_data, valid_data, test_data = frames
        return train_data, valid_data, test_data

In [None]:
#export
class YoochooseDataset(Dataset):
    """Session dataset built from the YooChoose CSV hosted under RecoHut-Datasets."""

    url = 'https://github.com/RecoHut-Datasets/yoochoose/raw/v3/yoochoose.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the CSV into `root`; the value returned by download_url is
        # handed to the base class as the file to parse and split.
        csv_path = download_url(url=self.url, folder=root)
        super(YoochooseDataset, self).__init__(fpath=csv_path, maxlen=maxlen, is_train=is_train)

In [None]:
# Smoke test: build the default (training) split and draw one batch from it.
dataset = YoochooseDataset(root='/content/yoochoose', maxlen=30)

sampler = data.DataLoader(dataset=dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples

Using existing file yoochoose.csv


Avg length: 10.0
Maximum length: 10
Number of sessions: 80183
Number of items: 12936
Number of actions: 406979
Average length of sessions: 5.075627003230111


[tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0, 10309, 10309, 10309],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,   794,  5005,  6891],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0, 10631,  4104,  9852],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,  9469,  9486,  9469],
         [    0,     0,     0,     0,     0,     0,     0,     0

In [None]:
#export
class NowplayingDataset(Dataset):
    """Session dataset built from the Nowplaying CSV hosted under RecoHut-Datasets."""

    url = 'https://github.com/RecoHut-Datasets/nowplaying/raw/v3/nowplaying.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the CSV into `root`; the value returned by download_url is
        # handed to the base class as the file to parse and split.
        csv_path = download_url(url=self.url, folder=root)
        super(NowplayingDataset, self).__init__(fpath=csv_path, maxlen=maxlen, is_train=is_train)

In [None]:
# Smoke test: build the default (training) split and draw one batch from it.
dataset = NowplayingDataset(root='/content/nowplaying', maxlen=30)

sampler = data.DataLoader(dataset=dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples

Downloading https://github.com/RecoHut-Datasets/nowplaying/raw/v3/nowplaying.csv


Avg length: 20.0
Maximum length: 20
Number of sessions: 113918
Number of items: 239221
Number of actions: 1184815
Average length of sessions: 10.400595164943205


[tensor([[     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0, 114002, 113983,  89621, 113960, 113884, 113926, 114000, 113738,
          113930,   3168, 113805, 113800, 113789, 113872, 114018, 113881, 113869,
          113776,  21568],
         [     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,  13653,  11910,  28131,
            4896,  33231],
         [     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0, 217911, 218397,  23439,  23684,  40048,  23439,  22123, 218298,
           58345, 218399],
         [     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,

In [None]:
#export
class DigineticaDataset(Dataset):
    """Session dataset built from the Diginetica CSV hosted under RecoHut-Datasets."""

    url = 'https://github.com/RecoHut-Datasets/diginetica/raw/v4/diginetica.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the CSV into `root`; the value returned by download_url is
        # handed to the base class as the file to parse and split.
        csv_path = download_url(url=self.url, folder=root)
        super(DigineticaDataset, self).__init__(fpath=csv_path, maxlen=maxlen, is_train=is_train)

In [None]:
# Smoke test: build the default (training) split and draw one batch from it.
dataset = DigineticaDataset(root='/content/diginetica', maxlen=30)

sampler = data.DataLoader(dataset=dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples

Downloading https://github.com/RecoHut-Datasets/diginetica/raw/v4/diginetica.csv


Avg length: 8.777109003245833
Maximum length: 70
Number of sessions: 63466
Number of items: 38970


  return array(a, dtype, copy=False, order=order)


Number of actions: 557048
Average length of sessions: 8.777109003245833


[tensor([[    0,     0,     0,     0,     0,     0,     0,     0,  2387,  2245,
           9141,  2366,  9142,  9143,  3193,  3193,  1726,  1725,  2366,  1722,
           2366,  2366,  9144,  3197,  9145,  1722,  9146,  9147,  9146],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0, 17095, 17101, 17094, 17100, 17096],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0, 13816,
          10789,  9204, 11198, 23151,  8289, 30676,  3372, 30678, 14125],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,  4814, 10013, 18788,  9285, 14081],
         [    0,     0,     0,     0,     0,     0,     0,     0

In [None]:
#export
class LastfmDataset(Dataset):
    """Session dataset built from the Last.fm CSV hosted under RecoHut-Datasets."""

    url = 'https://github.com/RecoHut-Datasets/lastfm/raw/v2/last_fm.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the CSV into `root`; the value returned by download_url is
        # handed to the base class as the file to parse and split.
        csv_path = download_url(url=self.url, folder=root)
        super(LastfmDataset, self).__init__(fpath=csv_path, maxlen=maxlen, is_train=is_train)

In [None]:
# Smoke test: build the default (training) split and draw one batch from it.
dataset = LastfmDataset(root='/content/lastfm', maxlen=30)

sampler = data.DataLoader(dataset=dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples

Downloading https://github.com/RecoHut-Datasets/lastfm/raw/v2/last_fm.csv


Avg length: 17.447849599510228
Maximum length: 49


  return array(a, dtype, copy=False, order=order)


Number of sessions: 196010
Number of items: 107391
Number of actions: 3419953
Average length of sessions: 17.447849599510228


[tensor([[     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,   3821,     96,   3821,   1600,     96,
            3366,   3821,     96,   3366,  18639,   3821,   3280,   3366,     96,
            3366,   1600],
         [     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,  13864,
           35393,  13864,  50765,  13743,  51628,  34165,  44702,  62996,   9504,
          106404,  13864],
         [     0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,   3363,   1875,
            2782,   1875],
         [   973,   2740,   2712,  17892,   2228,    829,   2740,    128,    744,
            1193,   1284,   2755,   1443,   4028,   2712,   7635,    620,   1861,
            3978,

In [None]:
#hide
# Record author, timestamp, machine details and the versions of the imported
# packages so the outputs above can be tied to a concrete environment.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-29 07:54:45

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torch  : 1.10.0+cu111
recohut: 0.0.8
csv    : 1.0
pandas : 1.1.5
numpy  : 1.19.5
IPython: 5.5.0

