In [None]:
# default_exp datasets.bases.session_graph

# Session Graph dataset
> Base class for Session Graph dataset module.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from typing import List, Optional, Callable, Union, Any, Tuple

import numpy as np
import pandas as pd

import torch

from recohut.datasets.bases.common import Dataset
from recohut.utils.common_utils import *

In [None]:
#export
class SessionGraphDataset(Dataset):
    """Session dataset that also carries an item-to-item transition graph.

    Two pickled raw files are loaded by `process`: the first holds a pair
    ``(sessions, targets)`` and the second holds the sequences used to build
    the adjacency matrix.

    Attributes set by `process`:
        raw: 1-D object array with one list of item ids per session.
        targets: array built from the second element of the first raw file.
        length: number of sessions in `raw`.
        adjacency: sparse ``(n_node, n_node)`` transition-count matrix with
            every entry divided by the sum of its column.

    Item ids are shifted by -1 wherever they are used as matrix indices, so
    they are presumably 1-based with 0 reserved for padding (confirm against
    the data files).

    References
        1. COTREC session-based recommender model training. https://t.ly/cXTH.
    """
    def __init__(self, root, shuffle=False, n_node=None):
        """
        Args:
            root: dataset root directory, forwarded to the base `Dataset`.
            shuffle: if True, `generate_batch` reshuffles the sessions (and
                their targets) on every call.
            n_node: side length of the adjacency matrix. `process` and
                `get_slice` both use it numerically, so it must be an int in
                practice even though the default is None.
        """
        super().__init__(root)
        self.n_node = n_node
        self.shuffle = shuffle
        self.process()

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the two raw files: (sessions, targets) and all sequences."""
        return ['data.txt', 'all_seq.txt']

    @property
    def processed_file_names(self) -> None:
        """No processed files are declared; this returns None."""
        return None

    def download(self):
        """Subclasses must provide the download logic."""
        raise NotImplementedError

    def process(self):
        """Load the raw pickles and build the column-normalised adjacency."""
        import pickle

        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising. Only point this at raw files from a trusted source.
        with open(self.raw_paths[0], 'rb') as data_file:
            data = pickle.load(data_file)
        with open(self.raw_paths[1], 'rb') as seq_file:
            all_seq = pickle.load(seq_file)

        self.raw = self._as_object_array(data[0])
        self.targets = np.asarray(data[1])
        self.length = len(self.raw)
        adj = self.data_masks(all_seq, self.n_node)
        # Divide every stored entry by the sum of its column.
        self.adjacency = adj.multiply(1.0/adj.sum(axis=0).reshape(1, -1))

    @staticmethod
    def _as_object_array(sequences):
        """Pack variable-length sequences into a 1-D object array of lists.

        `np.asarray` on ragged nested lists emits a deprecation warning on
        older NumPy releases and raises on newer ones, so the container is
        allocated explicitly and filled element by element.
        """
        packed = np.empty(len(sequences), dtype=object)
        for pos, seq in enumerate(sequences):
            packed[pos] = list(seq)
        return packed

    def get_overlap(self, sessions):
        """Pairwise Jaccard overlap between sessions, ignoring item id 0.

        Args:
            sessions: sequence of item-id sequences.

        Returns:
            Tuple ``(matrix, degree)``. ``matrix`` is the symmetric overlap
            matrix with ones added on the diagonal; ``degree`` is a diagonal
            matrix holding the reciprocal of each row sum of ``matrix``.
        """
        n_sessions = len(sessions)
        # Build each item set once instead of once per pair.
        item_sets = []
        for session in sessions:
            unique_items = set(session)
            unique_items.discard(0)
            item_sets.append(unique_items)

        matrix = np.zeros((n_sessions, n_sessions))
        for i in range(n_sessions):
            for j in range(i + 1, n_sessions):
                union = item_sets[i] | item_sets[j]
                if not union:
                    # Both sessions hold only padding: leave the overlap at 0
                    # rather than dividing by zero.
                    continue
                shared = item_sets[i] & item_sets[j]
                matrix[i][j] = float(len(shared))/float(len(union))
                matrix[j][i] = matrix[i][j]
        matrix = matrix + np.diag([1.0]*n_sessions)
        degree = np.sum(np.array(matrix), 1)
        degree = np.diag(1.0/degree)
        return matrix, degree

    def generate_batch(self, batch_size):
        """Split the dataset into index arrays of (at most) `batch_size`.

        When `self.shuffle` is set, `raw` and `targets` are permuted in place
        first (using the global NumPy RNG). The final batch is taken from the
        tail of the dataset, so it overlaps the previous batch whenever the
        length is not a multiple of `batch_size`.

        Returns:
            List of 1-D index arrays; an empty list for an empty dataset.
        """
        if self.shuffle:
            order = np.arange(self.length)
            np.random.shuffle(order)
            self.raw = self.raw[order]
            self.targets = self.targets[order]
        if self.length == 0:
            return []
        n_batch = self.length // batch_size
        if self.length % batch_size != 0:
            n_batch += 1
        slices = np.split(np.arange(n_batch * batch_size), n_batch)
        # Clamp at 0 so a dataset smaller than one batch does not produce
        # negative indices, which would wrap around and duplicate samples.
        slices[-1] = np.arange(max(self.length - batch_size, 0), self.length)
        return slices

    def get_slice(self, index):
        """Assemble the padded model inputs for one batch.

        Args:
            index: array of session positions, e.g. one element of the list
                returned by `generate_batch`.

        Returns:
            Tuple ``(targets, session_len, items, reversed_sess_item, mask,
            diff_mask)``:
              * targets: `self.targets[index] - 1`.
              * session_len: one single-element list per session holding its
                count of non-zero items.
              * items: each session followed by zero padding.
              * reversed_sess_item: each session reversed, then zero padded.
              * mask: ones for the non-zero items, then zero padding.
              * diff_mask: float array filled with ``1 / (n_node - 1)`` and a
                1 at ``[row, target]`` for every sample in the batch.
        """
        batch_sessions = self.raw[index]
        batch_targets = self.targets[index] - 1

        item_counts = [len(np.nonzero(session)[0]) for session in batch_sessions]
        max_n_node = np.max(item_counts)

        items, reversed_sess_item, mask, session_len = [], [], [], []
        for session, n_items in zip(batch_sessions, item_counts):
            padding = [0] * int(max_n_node - n_items)
            session_len.append([n_items])
            items.append(session + padding)
            mask.append([1]*n_items + padding)
            reversed_sess_item.append(list(reversed(session)) + padding)

        # The row count used to be hard-coded to 100, which raised IndexError
        # for larger batches. Keep 100 as the minimum so existing shapes are
        # unchanged, and grow only when the batch needs more rows.
        # NOTE(review): confirm no consumer assumes exactly 100 rows.
        n_rows = max(100, len(batch_targets))
        diff_mask = np.ones(shape=[n_rows, self.n_node]) * (1/(self.n_node - 1))
        for row, target in enumerate(batch_targets):
            diff_mask[row][target] = 1
        return batch_targets, session_len, items, reversed_sess_item, mask, diff_mask

    @staticmethod
    def data_masks(all_sessions, n_node):
        """Count item-to-next-item transitions into a sparse matrix.

        For every consecutive pair in every session the entry
        ``[prev - 1, next - 1]`` is incremented. The first time an item is
        seen as a source, a self-loop entry of 1 is also created for it.

        Args:
            all_sessions: iterable of item-id sequences.
            n_node: side length of the returned square matrix.

        Returns:
            `scipy.sparse.coo_matrix` of shape ``(n_node, n_node)``.
        """
        from scipy.sparse import coo_matrix

        adj = dict()
        for sess in all_sessions:
            for prev_item, next_item in zip(sess, sess[1:]):
                src, dst = prev_item - 1, next_item - 1
                neighbours = adj.get(src)
                if neighbours is None:
                    # First outgoing edge for this item: seed the self-loop.
                    # When src == dst the single entry stays at 1.
                    adj[src] = {src: 1, dst: 1}
                else:
                    neighbours[dst] = neighbours.get(dst, 0) + 1

        row, col, data = [], [], []
        for src, neighbours in adj.items():
            for dst, count in neighbours.items():
                row.append(src)
                col.append(dst)
                data.append(count)
        return coo_matrix((data, (row, col)), shape=(n_node, n_node))

## Examples

In [None]:
class DigineticaDataset(SessionGraphDataset):
    """Diginetica train/test split exposed through `SessionGraphDataset`.

    Args:
        root: dataset root directory.
        shuffle: reshuffle sessions whenever batches are generated.
        n_node: side length of the adjacency matrix (default 43097).
        is_train: load ``train.txt`` when True, ``test.txt`` otherwise. The
            adjacency sequences always come from ``all_train_seq.txt``.
    """
    train_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=43097, is_train=True):
        # These are assigned before delegating because `raw_file_names` and
        # `download` read `is_train`, and the parent initialiser loads the raw
        # files. Presumably the base class resolves/downloads them too - confirm.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> List[str]:
        """Split file first, then the shared training-sequence file."""
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the training sequences and the selected split into `raw_dir`."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)

In [None]:
# Directory that holds (or will receive) the Diginetica raw files.
root = '/content/diginetica'

# Training split: sessions are reshuffled each time batches are generated.
train_data = DigineticaDataset(root, is_train=True, shuffle=True)
# Test split: sessions keep their stored order.
test_data = DigineticaDataset(root, is_train=False, shuffle=False)

In [None]:
class TmallDataset(SessionGraphDataset):
    """Tmall train/test split exposed through `SessionGraphDataset`.

    Args:
        root: dataset root directory.
        shuffle: reshuffle sessions whenever batches are generated.
        n_node: side length of the adjacency matrix (default 40727).
        is_train: load ``train.txt`` when True, ``test.txt`` otherwise. The
            adjacency sequences always come from ``all_train_seq.txt``.
    """
    train_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=40727, is_train=True):
        # These are assigned before delegating because `raw_file_names` and
        # `download` read `is_train`, and the parent initialiser loads the raw
        # files. Presumably the base class resolves/downloads them too - confirm.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> List[str]:
        """Split file first, then the shared training-sequence file."""
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the training sequences and the selected split into `raw_dir`."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)

In [None]:
# Directory that holds (or will receive) the Tmall raw files.
root = '/content/tmall'

# Training split: sessions are reshuffled each time batches are generated.
train_data = TmallDataset(root, is_train=True, shuffle=True)
# Test split: sessions keep their stored order.
test_data = TmallDataset(root, is_train=False, shuffle=False)

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [None]:
class RetailRocketDataset(SessionGraphDataset):
    """RetailRocket train/test split exposed through `SessionGraphDataset`.

    Args:
        root: dataset root directory.
        shuffle: reshuffle sessions whenever batches are generated.
        n_node: side length of the adjacency matrix (default 40727).
        is_train: load ``train.txt`` when True, ``test.txt`` otherwise. The
            adjacency sequences always come from ``all_train_seq.txt``.
    """
    train_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/all_train_seq.txt"

    # NOTE(review): the n_node default is identical to TmallDataset's (40727),
    # which looks like a copy-paste. Verify against the item count of the
    # RetailRocket files before relying on it.
    def __init__(self, root, shuffle=False, n_node=40727, is_train=True):
        # These are assigned before delegating because `raw_file_names` and
        # `download` read `is_train`, and the parent initialiser loads the raw
        # files. Presumably the base class resolves/downloads them too - confirm.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> List[str]:
        """Split file first, then the shared training-sequence file."""
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the training sequences and the selected split into `raw_dir`."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)

In [None]:
# Directory that holds (or will receive) the RetailRocket raw files.
root = '/content/retail_rocket'

# Training split: sessions are reshuffled each time batches are generated.
train_data = RetailRocketDataset(root, is_train=True, shuffle=True)
# Test split: sessions keep their stored order.
test_data = RetailRocketDataset(root, is_train=False, shuffle=False)

Downloading https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/all_train_seq.txt
Downloading https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/train.txt
Using existing file all_train_seq.txt
Downloading https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/test.txt


In [None]:
class SampleDataset(SessionGraphDataset):
    """Small sample train/test split exposed through `SessionGraphDataset`.

    Args:
        root: dataset root directory.
        shuffle: reshuffle sessions whenever batches are generated.
        n_node: side length of the adjacency matrix (default 309).
        is_train: load ``train.txt`` when True, ``test.txt`` otherwise. The
            adjacency sequences always come from ``all_train_seq.txt``.
    """
    train_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=309, is_train=True):
        # These are assigned before delegating because `raw_file_names` and
        # `download` read `is_train`, and the parent initialiser loads the raw
        # files. Presumably the base class resolves/downloads them too - confirm.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> List[str]:
        """Split file first, then the shared training-sequence file."""
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the training sequences and the selected split into `raw_dir`."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)

In [None]:
# Directory that holds (or will receive) the sample raw files.
root = '/content/sample'

# Training split: sessions are reshuffled each time batches are generated.
train_data = SampleDataset(root, is_train=True, shuffle=True)
# Test split: sessions keep their stored order.
test_data = SampleDataset(root, is_train=False, shuffle=False)

  return array(a, dtype, copy=False, order=order)
Using existing file all_train_seq.txt
Downloading https://github.com/RecoHut-Datasets/sample_session/raw/v2/test.txt
  return array(a, dtype, copy=False, order=order)
