In [None]:
# default_exp transforms.splitting

# Splitting
> Data Splitting Transforms.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np

## Split by Ratio

In [None]:
#exporti
def groupby_user(user_indices):
    """Group row positions by user.

    Returns a list with one integer array per distinct value in
    ``user_indices`` (ordered by sorted user value). Each array holds the
    positions of that user's rows, in their original relative order.
    """
    # `inverse` maps every row to the rank of its user among the sorted unique users.
    _, inverse, counts = np.unique(user_indices,
                                   return_inverse=True,
                                   return_counts=True)
    # A stable sort makes each user's rows contiguous without reordering them.
    rows_by_user = np.argsort(inverse, kind="mergesort")
    # Cut at the cumulative group sizes; the final boundary is implicit.
    boundaries = np.cumsum(counts)[:-1]
    return np.split(rows_by_user, boundaries)


def _pad_unknown_item(data_list):
    """Replace test items that never occur in train with a single padding id.

    Parameters
    ----------
    data_list : sequence of two DataFrames
        ``(train_data, test_data)``; both must have an ``item`` column.

    Returns
    -------
    tuple
        ``(train_data, padded_test)``. ``train_data`` is returned untouched;
        ``padded_test`` is a copy of ``test_data`` in which every item absent
        from train is set to the number of distinct train items.

    Notes
    -----
    The padding id equals ``train_data.item.nunique()``.
    NOTE(review): this only avoids clashing with a real item if items are
    encoded as ``0 .. n_items - 1`` -- confirm against the caller's encoding.
    """
    train_data, test_data = data_list
    known_items = set(train_data["item"].tolist())
    pad_id = train_data["item"].nunique()

    # Work on a copy: the caller's frame stays intact and pandas does not
    # raise chained-assignment warnings when `test_data` is a slice.
    test_data = test_data.copy()
    test_data.loc[~test_data["item"].isin(known_items), "item"] = pad_id
    return train_data, test_data


def _filter_unknown_user_item(data_list):
    """Drop test rows whose user or item does not occur in the training data.

    Parameters
    ----------
    data_list : sequence of two DataFrames
        ``(train_data, test_data)``; both must have ``user`` and ``item``
        columns.

    Returns
    -------
    tuple
        ``(train_data, test_data_clean)`` where ``test_data_clean`` keeps only
        the rows of ``test_data`` whose user *and* item both appear in
        ``train_data``. The test size before and after filtering is printed.
    """
    train_data, test_data = data_list

    print(f"test data size before filtering: {len(test_data)}")
    # Build one boolean mask with vectorised membership tests instead of
    # walking every row in Python.
    keep = np.ones(len(test_data), dtype=bool)
    for col in ("user", "item"):
        known_values = set(train_data[col].tolist())
        # `.to_numpy()` makes the mask positional, so a non-unique index on
        # `test_data` cannot interfere with the row selection below.
        keep &= test_data[col].isin(known_values).to_numpy()

    test_data_clean = test_data[keep]
    print(f"test data size after filtering: {len(test_data_clean)}")
    return train_data, test_data_clean

In [None]:
#export
def split_by_ratio(data, shuffle=False, test_size=None, pad_unknown=True,
                   filter_unknown=False, seed=42):
    """Split an interaction table into train and test parts, user by user.

    For every user, the first ``round((1 - test_size) * n)`` of their ``n``
    rows (in the order they appear in ``data``) go to train and the rest go to
    test. Users with three or fewer rows are kept entirely in train.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``user`` column; ``item`` is also required when
        ``pad_unknown`` or ``filter_unknown`` is set.
    shuffle : bool, default False
        If True, the rows of each part are returned in random order.
    test_size : float
        Fraction of each user's rows assigned to test, within ``[0, 1]``.
        Required, despite the ``None`` default kept for signature
        compatibility.
    pad_unknown : bool, default True
        Replace test items unseen in train with a padding id
        (see ``_pad_unknown_item``). Takes precedence over ``filter_unknown``.
    filter_unknown : bool, default False
        Drop test rows with a user or item unseen in train
        (see ``_filter_unknown_user_item``). Only used when ``pad_unknown`` is
        False.
    seed : int, default 42
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect.

    Returns
    -------
    sequence of two DataFrames
        ``(train, test)``. Both are independent of ``data``.

    Raises
    ------
    TypeError
        If ``test_size`` is None.
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    # Fail fast with a clear message rather than an opaque arithmetic error
    # (or a nonsensical negative slice) deep inside the per-user loop.
    if test_size is None:
        raise TypeError("test_size must be a number between 0 and 1, got None")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    np.random.seed(seed)
    user_split_indices = groupby_user(data["user"].to_numpy())

    train_idx, test_idx = [], []
    # Iterate over the groups themselves so the loop can never disagree with
    # the number of groups that `groupby_user` actually produced.
    for u_data in user_split_indices:
        u_data_len = len(u_data)
        if u_data_len <= 3:   # keep items of rare users in trainset
            train_idx.extend(u_data)
        else:
            train_threshold = round((1 - test_size) * u_data_len)
            train_idx.extend(u_data[:train_threshold])
            test_idx.extend(u_data[train_threshold:])

    split_indices_all = [train_idx, test_idx]
    if shuffle:
        # Shuffle the row positions, not the frame: this keeps the result a
        # DataFrame with its columns. The explicit integer dtype keeps an
        # empty part usable as a positional indexer.
        split_indices_all = [
            np.random.permutation(np.asarray(idx, dtype=np.intp))
            for idx in split_indices_all
        ]

    # Positional selection; `.copy()` detaches both parts from `data`.
    split_data_all = [data.iloc[idx].copy() for idx in split_indices_all]

    if pad_unknown:
        split_data_all = _pad_unknown_item(split_data_all)
    elif filter_unknown:
        split_data_all = _filter_unknown_user_item(split_data_all)
    return split_data_all

In [None]:
import warnings

import pandas as pd

# Silence every warning so the example output below stays uncluttered.
warnings.filterwarnings('ignore')

# Toy interaction log: user 1 has five rows, user 2 has two and user 3 has one.
df = pd.DataFrame(
    {
        'user': [1, 1, 1, 1, 1, 2, 2, 3],
        'item': [1, 2, 3, 2, 2, 1, 2, 3],
    }
)

df

Unnamed: 0,user,item
0,1,1
1,1,2
2,1,3
3,1,2
4,1,2
5,2,1
6,2,2
7,3,3


In [None]:
# Hold out 20% of each user's rows; users with three or fewer rows stay in train.
train, test = split_by_ratio(
    df,
    shuffle=False,
    test_size=0.2,
    pad_unknown=True,
    filter_unknown=False,
)
print("train:\n{}\n\ntest:\n{}".format(train, test))

train:
   user  item
0     1     1
1     1     2
2     1     3
3     1     2
5     2     1
6     2     2
7     3     3

test:
   user  item
4     1     2


In [None]:
# Both flags are set, but padding takes precedence over filtering in
# split_by_ratio, so no test rows are filtered out here.
train, test = split_by_ratio(
    df,
    shuffle=False,
    test_size=0.4,
    pad_unknown=True,
    filter_unknown=True,
)
print("train:\n{}\n\ntest:\n{}".format(train, test))

train:
   user  item
0     1     1
1     1     2
2     1     3
5     2     1
6     2     2
7     3     3

test:
   user  item
3     1     2
4     1     2


## Last-session-out split

In [None]:
#exporti
def clean_split(train, test):
    """
    Strip from every test sequence the items that never occur in training.
    :param train: Training frame with a 'sequence' column of item collections.
    :param test: Test frame with a 'sequence' column; the column is overwritten in place.
    :return: The same (train, test) pair, with the test sequences filtered.
    """
    # Collect the vocabulary of items seen anywhere in the training sequences.
    seen_in_train = set()
    for seq in train['sequence']:
        seen_in_train.update(set(seq))

    def keep_seen(seq):
        # Order and repetitions of the surviving items are preserved.
        return [it for it in seq if it in seen_in_train]

    test['sequence'] = test['sequence'].map(keep_seen)
    return train, test

In [None]:
#export
def last_session_out_split(data,
                           user_key='user_id',
                           session_key='session_id',
                           time_key='ts'):
    """
    Assign the last session of every user to the test set and the remaining ones to the training set.
    :param data: Frame with one row per session; must contain the three key columns and a 'sequence' column.
    :param user_key: Name of the column identifying the user.
    :param session_key: Name of the column identifying the session.
    :param time_key: Name of the column used to order each user's sessions.
    :return: The (train, test) frames, both copies of rows of ``data``; test sequences are
             stripped of items that do not appear in any training sequence.
    """
    # After sorting by user and time, the last row of each user group is that
    # user's most recent session.
    ordered = data.sort_values(by=[user_key, time_key])
    last_session = ordered.groupby(user_key)[session_key].last()

    # Look the session column up through `session_key` rather than a
    # hard-coded attribute name, so custom column names are honoured.
    in_last_session = data[session_key].isin(last_session.values)
    train = data[~in_last_session].copy()
    test = data[in_last_session].copy()
    train, test = clean_split(train, test)
    return train, test

In [None]:
import pandas as pd

# Five sessions from four users; only user 4296 has more than one session.
df = pd.DataFrame(
    {
        'session_id': [357, 359, 394, 4127, 6400],
        'sequence': [
            [793, 3489],
            [1762],
            [1256],
            [1948, 1364, 2060, 1115, 6488, 2060],
            [687, 1394],
        ],
        'ts': [1421003874, 1421018535, 1421007470, 1421416896, 1420807778],
        'user_id': [4296, 4296, 30980, 28117, 35247],
    }
)

df

Unnamed: 0,session_id,sequence,ts,user_id
0,357,"[793, 3489]",1421003874,4296
1,359,[1762],1421018535,4296
2,394,[1256],1421007470,30980
3,4127,"[1948, 1364, 2060, 1115, 6488, 2060]",1421416896,28117
4,6400,"[687, 1394]",1420807778,35247


In [None]:
# Spell out the column names (these are the function's defaults) for clarity.
train_data, test_data = last_session_out_split(
    df,
    user_key='user_id',
    session_key='session_id',
    time_key='ts',
)
train_data

Unnamed: 0,session_id,sequence,ts,user_id
0,357,"[793, 3489]",1421003874,4296


In [None]:
#hide
# Record author, timestamp, machine details and imported-package versions for reproducibility.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-24 07:50:04

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
numpy  : 1.19.5

