In [None]:
# default_exp datasets.weeplaces

# Weeplaces
> Implementation of Weeplaces dataset.

This dataset was collected from Weeplaces, a website that visualizes users’ check-in activities in location-based social networks (LBSNs). Weeplaces is now integrated with the APIs of other location-based social networking services, e.g., Facebook Places, Foursquare, and Gowalla. Users can log in to Weeplaces with their LBSN accounts and connect with friends in the same LBSN who also use the application. All of the crawled data was originally generated on Foursquare. The dataset contains 7,658,368 check-ins generated by 15,799 users over 971,309 locations. The original Foursquare IDs of the Weeplaces users could not be obtained during data collection; only their check-in history, their friends who also use Weeplaces, and additional information about the locations are available.

You can download this dataset from **[here](https://drive.google.com/file/d/0BzpKyxX1dqTYYzRmUXRZMWloblU/view?usp=sharing)** (about 140 MB). Note that this dataset is released solely for research purposes.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import normalize

import torch
from torch.utils import data

import recohut
from recohut.datasets.bases import common as base
from recohut.utils.common_utils import download_url, extract_zip

In [None]:
#export
class WeeplacesDataset(base.Dataset, data.Dataset):
    """Weeplaces check-in data exposed as a torch ``Dataset``.

    Depending on ``is_group`` and ``datatype`` an instance serves one of four views:

    * ``is_group=False, datatype='train'``: ``(user_id, user_items)`` per training user.
    * ``is_group=False, datatype='val'|'test'``: ``(user, fold_in_items, held_out_items)``.
    * ``is_group=True, datatype='train'``: ``(group_id, padded member ids, mask, group items,
      member items, negative-user items)``.
    * ``is_group=True, datatype='val'|'test'``: ``(group_id, padded member ids, mask,
      group items, member items)``.

    Args:
        root: dataset root directory, forwarded to the base class.
        datatype: ``'train'``, ``'val'`` or ``'test'``; for non-train values it is used to
            build the raw file names (``'{datatype}_ui_tr.csv'`` etc.).
        is_group: serve group-level samples instead of user-level samples.
        n_items: number of items. Only inferred from the data for the user-level train
            view; every other view requires it to be passed in.
        negs_per_group: number of negative users sampled per group (group train view).
        padding_idx: user id used to pad groups to a fixed size. Always overwritten with
            ``n_users`` in the group train view; defaults to the zero row of the user
            matrix in the group eval views when left as ``None``.
        verbose: print dataset statistics while loading.

    Raises:
        ValueError: if ``n_items`` is required for the requested view but is ``None``.
    """
    url = "https://github.com/RecoHut-Datasets/weeplaces/raw/v2/data.zip"

    def __init__(self, root, datatype='train', is_group=False, n_items=None, negs_per_group=None, padding_idx=None, verbose=True):
        # Every view except user-level training builds matrices of width n_items (or
        # asserts against it), so fail early and clearly, before anything is downloaded.
        if n_items is None and (is_group or datatype != 'train'):
            raise ValueError("n_items must be provided unless is_group=False and datatype='train'")
        super().__init__(root)
        self.datatype = datatype
        self.n_items = n_items
        self.negs_per_group = negs_per_group
        self.is_group = is_group
        self.padding_idx = padding_idx
        self.verbose = verbose
        if is_group:
            if datatype == 'train':
                self.user_data = self.load_user_data_train()
                self.group_data, self.group_users = self.load_group_data_train()
                # per group: sparse [G, I] slice holding the items of its (padded) members.
                self.group_inputs = [self.user_data[self.group_users[g]] for g in self.groups_list]
            else:
                self.eval_groups_list = []
                self.user_data = self.load_user_data_tr_te(datatype)
                self.eval_group_data, self.eval_group_users = self.load_group_data_tr_te(datatype)
        else:
            if datatype == 'train':
                self.train_data_ui = self.load_ui_train()
                self.user_list = list(range(self.n_users))
            else:
                self.data_tr, self.data_te = self.load_ui_tr_te(datatype)

    # ------------------------------------------------------------------ helpers

    def _raw_path(self, pattern):
        """Return the first entry of ``self.raw_paths`` containing ``pattern``."""
        for path in self.raw_paths:
            if pattern in path:
                return path
        raise FileNotFoundError("no raw file matching '{}'".format(pattern))

    @staticmethod
    def _binary_matrix(rows, cols, shape):
        """Build a float32 CSR matrix holding a 1 for every ``(row, col)`` pair."""
        return sp.csr_matrix((np.ones_like(rows), (rows, cols)), dtype='float32', shape=shape)

    @staticmethod
    def _padding_mask(n_real, size):
        """Return a float32 tensor of length ``size``: 0 for the first ``n_real`` slots, -inf after."""
        mask = np.zeros(size, dtype=np.float32)
        mask[n_real:] = -np.inf
        return torch.from_numpy(mask)

    # ------------------------------------------------------------ torch Dataset

    def __len__(self):
        """Number of groups (group views) or users (user views) served by this instance."""
        if self.is_group:
            if self.datatype == 'train':
                return len(self.groups_list)
            return len(self.eval_groups_list)
        return len(self.user_list)

    def __train__(self, index):
        """ load user_id, binary vector over items """
        user = self.user_list[index]
        user_items = torch.from_numpy(self.train_data_ui[user, :].toarray()).squeeze()  # [I]
        return torch.from_numpy(np.array([user], dtype=np.int32)), user_items

    def __test__(self, index):
        """ load user_id, fold-in items, held-out items """
        user = self.user_list[index]
        fold_in, held_out = self.data_tr[user, :].toarray(), self.data_te[user, :].toarray()  # [I], [I]
        # held-out items are returned as a numpy array, fold-in items as a tensor.
        return user, torch.from_numpy(fold_in).squeeze(), held_out.squeeze()

    def __train_group__(self, index):
        """ load group_id, padded group users, mask, group items, group member items, negative user items """
        group = self.groups_list[index]
        member_ids = np.array(self.group_users[group], np.int32)
        user_ids = torch.from_numpy(member_ids)  # [G] group member ids
        group_items = torch.from_numpy(self.group_data[group].toarray().squeeze())  # [I] items per group

        corrupted_group = self.get_corrupted_users(group)  # [# negs]
        corrupted_user_items = torch.from_numpy(self.user_data[corrupted_group].toarray().squeeze())  # [# negs, I]

        # group mask to create fixed-size padded groups: 0 for real members, -inf for padding.
        n_padded = int(np.count_nonzero(member_ids == self.padding_idx))
        group_mask = self._padding_mask(self.max_group_size - n_padded, self.max_group_size)  # [G]

        user_items = torch.from_numpy(self.group_inputs[group].toarray())  # [G, |I|] group member items

        return torch.tensor([group]), user_ids, group_mask, group_items, user_items, corrupted_user_items

    def __test_group__(self, index):
        """ load group_id, padded group users, mask, group items, group member items """
        group = self.eval_groups_list[index]
        user_ids = self.eval_group_users[group]  # [G]
        n_padded = int(np.count_nonzero(user_ids == self.padding_idx))
        mask = self._padding_mask(self.max_gsize - n_padded, self.max_gsize)  # [G]
        group_items = torch.from_numpy(self.eval_group_data[group].toarray().squeeze())  # [I]
        user_items = torch.from_numpy(self.user_data[user_ids].toarray().squeeze())  # [G, I]

        return torch.tensor([group]), torch.tensor(user_ids), mask, group_items, user_items

    def __getitem__(self, index):
        """Dispatch to the sample builder matching ``is_group`` / ``datatype``."""
        if self.is_group:
            if self.datatype == 'train':
                return self.__train_group__(index)
            return self.__test_group__(index)
        if self.datatype == 'train':
            return self.__train__(index)
        return self.__test__(index)

    # ------------------------------------------------------- raw data handling

    @property
    def raw_file_names(self) -> list:
        """Names of the files expected in the raw directory (archive plus extracted CSVs)."""
        return ['train_ui.csv',
                'val_ui_te.csv',
                'group_users.csv',
                'data.zip',
                'train_gi.csv',
                'test_ui_tr.csv',
                'val_ui_tr.csv',
                'test_ui_te.csv',
                'val_gi.csv',
                'test_gi.csv']

    def download(self):
        """Download the archive from ``url`` into the raw directory and extract it there."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)

    @property
    def processed_file_names(self) -> None:
        """No processed files are produced; the raw CSVs are read directly."""
        pass

    def process(self):
        """No-op: there is no processing step."""
        pass

    # ----------------------------------------------------------------- loaders

    def load_ui_train(self):
        """ load training user-item interactions as a sparse matrix """
        df_ui = pd.read_csv(self._raw_path("train_ui"))
        # ids are used directly as row/column indices, so the sizes are max id + 1.
        self.n_users, self.n_items = df_ui['user'].max() + 1, df_ui['item'].max() + 1
        data_ui = self._binary_matrix(df_ui['user'], df_ui['item'],
                                      (self.n_users, self.n_items))  # [# train users, I] sparse matrix
        if getattr(self, 'verbose', True):
            print("# train users", self.n_users, "# items", self.n_items)
        return data_ui

    def load_ui_tr_te(self, datatype='val'):
        """ load user-item interactions of val/test user sets as two sparse matrices of fold-in and held-out items """
        ui_df_tr = pd.read_csv(self._raw_path('{}_ui_tr.csv'.format(datatype)))
        ui_df_te = pd.read_csv(self._raw_path('{}_ui_te.csv'.format(datatype)))

        # eval user ids are shifted so that the smallest id maps to row 0.
        start_idx = min(ui_df_tr['user'].min(), ui_df_te['user'].min())
        end_idx = max(ui_df_tr['user'].max(), ui_df_te['user'].max())
        n_eval_users = end_idx - start_idx + 1
        self.user_list = list(range(0, n_eval_users))

        shape = (n_eval_users, self.n_items)  # [# eval users, I]
        ui_data_tr = self._binary_matrix(ui_df_tr['user'] - start_idx, ui_df_tr['item'], shape)
        ui_data_te = self._binary_matrix(ui_df_te['user'] - start_idx, ui_df_te['item'], shape)
        return ui_data_tr, ui_data_te

    def get_corrupted_users(self, group):
        """ negative user sampling per group (eta balances item-biased and random sampling) """
        eta = 0.5
        p = np.ones(self.n_users + 1)
        p[self.group_users[group]] = 0  # never sample the group's own members.
        # row n_users is the all-zero padding row of user_data; it is not a real user, so
        # exclude it even when the group is full-sized and contains no padding entry.
        p[self.n_users] = 0
        p = normalize([p], norm='l1')[0]
        # users weighted by how many of the group's items they interacted with.
        item_biased = normalize(self.user_data[:, self.group_data[group].indices].sum(1).squeeze(), norm='l1')[0]
        p = eta * item_biased + (1 - eta) * p
        negative_users = torch.multinomial(torch.from_numpy(p), self.negs_per_group)
        return negative_users

    def load_user_data_train(self):
        """ load user-item interactions of all users that appear in training groups, as a sparse matrix """
        # training users plus the fold-in item sets of the validation and test users.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_ui = pd.concat([pd.read_csv(self._raw_path(name))
                           for name in ('train_ui.csv', 'val_ui_tr.csv', 'test_ui_tr.csv')])

        self.n_users = df_ui['user'].max() + 1
        self.padding_idx = self.n_users  # padding idx for user when creating groups of fixed size.
        assert self.n_items == df_ui['item'].max() + 1

        # one extra, all-zero row at index n_users backs the padding user.
        data_ui = self._binary_matrix(df_ui['user'], df_ui['item'],
                                      (self.n_users + 1, self.n_items))  # [U, I] sparse matrix
        return data_ui

    def load_user_data_tr_te(self, datatype):
        """ load all user-item interactions of users that occur in val/test groups, as a sparse matrix """
        names = ['train_ui.csv', 'val_ui_tr.csv']
        if datatype == 'val' or datatype == 'test':
            # include eval user set (tr) items (since they might occur in evaluation set)
            names.append('test_ui_tr.csv')
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_ui = pd.concat([pd.read_csv(self._raw_path(name)) for name in names])

        n_users = df_ui['user'].max() + 1
        assert self.n_items == df_ui['item'].max() + 1
        if self.padding_idx is None:
            # fall back to the extra all-zero row appended below.
            self.padding_idx = n_users
        data_ui = self._binary_matrix(df_ui['user'], df_ui['item'],
                                      (n_users + 1, self.n_items))  # [# users, I] sparse matrix
        return data_ui

    def load_group_data_train(self):
        """ load training group-item interactions as a sparse matrix and user-group memberships """
        path_ug = self._raw_path('group_users.csv')
        path_gi = self._raw_path('train_gi.csv')

        df_gi = pd.read_csv(path_gi)  # load training group-item interactions.
        start_idx, end_idx = df_gi['group'].min(), df_gi['group'].max()
        self.n_groups = end_idx - start_idx + 1
        # group ids are shifted so that the smallest training group maps to row 0.
        data_gi = self._binary_matrix(df_gi['group'] - start_idx, df_gi['item'],
                                      (self.n_groups, self.n_items))  # [# groups,  I] sparse matrix.

        df_ug = pd.read_csv(path_ug).astype(int)  # load user-group memberships.
        df_ug_train = df_ug[df_ug.group.isin(range(start_idx, end_idx + 1))]
        df_ug_train = df_ug_train.sort_values('group')  # sort in ascending order of group ids.
        self.max_group_size = df_ug_train.groupby('group').size().max()  # max group size denoted by G

        g_u_list_train = df_ug_train.groupby('group')['user'].apply(list).reset_index()
        # right-pad every member list with padding_idx up to the max group size.
        g_u_list_train['user'] = [members + [self.padding_idx] * (self.max_group_size - len(members))
                                  for members in g_u_list_train.user]
        data_gu = np.squeeze(np.array(g_u_list_train[['user']].values.tolist()))  # [# groups, G] with padding.
        self.groups_list = list(range(0, end_idx - start_idx + 1))

        assert len(df_ug_train['group'].unique()) == self.n_groups
        if getattr(self, 'verbose', True):
            print("# training groups: {}, # max train group size: {}".format(self.n_groups, self.max_group_size))

        return data_gi, data_gu

    def load_group_data_tr_te(self, datatype):
        """ load val/test group-item interactions as a sparse matrix and user-group memberships """
        path_ug = self._raw_path('group_users.csv')
        path_gi = self._raw_path('{}_gi.csv'.format(datatype))

        df_gi = pd.read_csv(path_gi)  # load group-item interactions
        start_idx, end_idx = df_gi['group'].min(), df_gi['group'].max()
        self.n_groups = end_idx - start_idx + 1
        data_gi = self._binary_matrix(df_gi['group'] - start_idx, df_gi['item'],
                                      (self.n_groups, self.n_items))  # [# eval groups, I] sparse matrix

        df_ug = pd.read_csv(path_ug)  # load user-group memberships
        df_ug_eval = df_ug[df_ug.group.isin(range(start_idx, end_idx + 1))]
        df_ug_eval = df_ug_eval.sort_values('group')  # sort in ascending order of group ids
        self.max_gsize = df_ug_eval.groupby('group').size().max()  # max group size denoted by G
        g_u_list_eval = df_ug_eval.groupby('group')['user'].apply(list).reset_index()
        # right-pad every member list with padding_idx up to the max group size.
        g_u_list_eval['user'] = [members + [self.padding_idx] * (self.max_gsize - len(members))
                                 for members in g_u_list_eval.user]
        data_gu = np.squeeze(np.array(g_u_list_eval[['user']].values.tolist(), dtype=np.int32))  # [# groups, G]
        self.eval_groups_list = list(range(0, end_idx - start_idx + 1))
        return data_gi, data_gu

In [None]:
root = './data'

# User-level views: the training split infers the user/item counts, which the
# validation and test splits then reuse to size their matrices.
train_dataset = WeeplacesDataset(root, datatype='train', is_group=False)
n_users = train_dataset.n_users
n_items = train_dataset.n_items
val_dataset = WeeplacesDataset(root, datatype='val', is_group=False, n_items=n_items)
test_dataset = WeeplacesDataset(root, datatype='test', is_group=False, n_items=n_items)

# Group-level views: the training split fixes the padding index, which is passed on
# to the validation and test splits so all three pad groups with the same user id.
train_group_dataset = WeeplacesDataset(root, datatype='train', is_group=True, n_items=n_items, negs_per_group=5)
padding_idx = train_group_dataset.padding_idx
val_group_dataset = WeeplacesDataset(root, datatype='val', is_group=True, n_items=n_items, padding_idx=padding_idx)
test_group_dataset = WeeplacesDataset(root, datatype='test', is_group=True, n_items=n_items, padding_idx=padding_idx)

Downloading https://github.com/RecoHut-Datasets/weeplaces/raw/v2/data.zip
Extracting ./data/raw/data.zip


# train users 6050 # items 25081
# training groups: 15913, # max train group size: 22


In [None]:
!tree --du -h -C ./data

[01;34m./data[00m
└── [9.1M]  [01;34mraw[00m
    ├── [2.5M]  [01;31mdata.zip[00m
    ├── [794K]  group_users.csv
    ├── [156K]  test_gi.csv
    ├── [433K]  test_ui_te.csv
    ├── [635K]  test_ui_tr.csv
    ├── [498K]  train_gi.csv
    ├── [3.5M]  train_ui.csv
    ├── [ 72K]  val_gi.csv
    ├── [205K]  val_ui_te.csv
    └── [300K]  val_ui_tr.csv

 9.1M used in 1 directory, 10 files


> References
1. https://github.com/RecoHut-Stanzas/S168471/blob/main/reports/S168471_report.ipynb

In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-29 14:11:18

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
scipy  : 1.4.1
recohut: 0.0.8
torch  : 1.10.0+cu111
numpy  : 1.19.5
pandas : 1.1.5

