In [None]:
# default_exp utils.transforms

# Transforms
> Implementation of data transformation utilities.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import List

import pandas as pd
import numpy as np

## transform_to_gru4rec_format

In [None]:
#export
def transform_to_gru4rec_format(dataset, 
                                lst_col='sequence', 
                                ts_col='timestamp', 
                                user_col='user_id',
                                ):
    """
    Convert a list of sequences to GRU4Rec format.
    Based on this StackOverflow answer: https://stackoverflow.com/a/48532692

    Each row of ``dataset`` stores a list of items in ``lst_col``. The lists are
    unnested so that the result has one row per item, with the values of every
    other column repeated for each item of the row they came from.

    The timestamp of each output row is the original timestamp plus the row's
    running position among all rows of the same user (0, 1, 2, ...). The counter
    is grouped by ``user_col`` only, so it keeps increasing across consecutive
    sessions of the same user instead of restarting for every session.

    :param dataset: the dataset (a pandas DataFrame) to be transformed. Its index
        is turned into a regular column via ``reset_index`` and is therefore part
        of the output. The input frame itself is not modified.
    :param lst_col: name of the column holding the item lists; it is renamed to
        ``item_id`` in the output
    :param ts_col: name of the timestamp column; an integer offset is added to it,
        so it is expected to be numeric
    :param user_col: name of the column used to group rows when computing the
        timestamp offset
    :return: a new DataFrame with one row per item, the same column order as
        ``dataset.reset_index()`` and ``lst_col`` renamed to ``item_id``
    """
    df = dataset.reset_index()

    if len(df) == 0:
        # np.concatenate raises on an empty sequence of arrays; with no rows
        # there is nothing to unnest, so only the column rename applies.
        return df.rename(columns={lst_col: 'item_id'})

    # Number of items per row; computed once and reused for every column.
    seq_lengths = df[lst_col].str.len()
    items = np.concatenate(df[lst_col].values)

    # Build the columns in their original order: the list column becomes the
    # flattened items, every other column is repeated once per item of its row.
    unstacked = pd.DataFrame({
        col: items if col == lst_col else np.repeat(df[col].values, seq_lengths)
        for col in df.columns
    })

    # ensure that the events of a user have increasing timestamps
    unstacked[ts_col] = unstacked[ts_col] + unstacked.groupby(user_col).cumcount()
    return unstacked.rename(columns={lst_col: 'item_id'})

Example

In [None]:
# Load the 'sample_session' dataset. As the displayed output shows, it has one
# row per session, with the session's items stored as a list in `sequence`.
from recohut.utils.data import load_dataset

df = load_dataset('sample_session')
df

Unnamed: 0,session_id,sequence,ts,user_id
0,357,"[793, 3489]",1421003874,4296
1,359,[1762],1421018535,4296
2,394,[1256],1421007470,30980
3,4127,"[1948, 1364, 2060, 1115, 6488, 2060]",1421416896,28117
4,6400,"[687, 1394]",1420807778,35247


In [None]:
# Unnest the `sequence` lists into one row per item. The timestamp column of
# this dataset is named `ts`, so the default `ts_col` is overridden.
transform_to_gru4rec_format(df, lst_col='sequence', ts_col='ts', user_col='user_id')

Unnamed: 0,index,session_id,item_id,ts,user_id
0,0,357,793,1421003874,4296
1,0,357,3489,1421003875,4296
2,1,359,1762,1421018537,4296
3,2,394,1256,1421007470,30980
4,3,4127,1948,1421416896,28117
5,3,4127,1364,1421416897,28117
6,3,4127,2060,1421416898,28117
7,3,4127,1115,1421416899,28117
8,3,4127,6488,1421416900,28117
9,3,4127,2060,1421416901,28117


## wide_to_long

In [None]:
#export
def wide_to_long(matrix: np.ndarray, 
                 possible_ratings: List[int],
                 positive_only: bool = True,
                 user_col: str = 'user_id',
                 item_col: str = 'item_id',
                 feedback_col: str = 'feedback') -> pd.DataFrame:
    """Go from wide table to long.

    Every cell of ``matrix`` whose value equals one of ``possible_ratings``
    becomes one output row ``(row index, column index, rating)``. Rows are
    emitted rating by rating, in the order given by ``possible_ratings``, and in
    row-major order of the matrix within each rating.

    Args:
        matrix: wide 2-D array with user-item interactions; rows are users and
            columns are items. Anything ``np.asarray`` can convert is accepted.
        possible_ratings: list of possible ratings that we may have. Cells whose
            value is not in this list are ignored.
        positive_only: whether to keep only positive interactions, i.e. rows
            whose feedback is strictly greater than 0.
        user_col: name of the output column holding the matrix row index.
        item_col: name of the output column holding the matrix column index.
        feedback_col: name of the output column holding the rating.

    Returns:
        A DataFrame with the columns ``user_col``, ``item_col`` and
        ``feedback_col`` and a fresh 0..n-1 index. It is empty when
        ``possible_ratings`` is empty or nothing matches.
    """
    # Accept nested lists as well as arrays; comparing a plain list to a scalar
    # would not broadcast element-wise.
    matrix = np.asarray(matrix)

    def _get_ratings(arr: np.ndarray, rating: int) -> np.ndarray:
        """Generate long array for the rating provided
        :param arr: wide array with user-item interactions
        :param rating: the rating that we are interested"""
        idx = np.where(arr == rating)
        # np.full takes its dtype from `rating`, so ratings that do not fit in
        # int8 are stored without overflow.
        return np.vstack(
            (idx[0], idx[1], np.full(idx[0].size, rating))
        ).T

    long_arrays = [_get_ratings(matrix, r) for r in possible_ratings]

    if long_arrays:
        interactions = np.vstack(long_arrays)
    else:
        # np.vstack raises on an empty list; fall back to a 0-row table.
        interactions = np.empty((0, 3), dtype=np.intp)

    _df = pd.DataFrame(interactions, columns=[user_col, item_col, feedback_col])

    if positive_only:
        _df = _df[_df[feedback_col] > 0]

    return _df.reset_index(drop=True)

Example

In [None]:
# Toy 10x10 binary interaction matrix used to demonstrate wide_to_long:
# rows are users, columns are items, and 1 marks an interaction.
matrix = np.array(
    [[1, 0, 1, 0, 1, 0, 1, 0, 0, 1],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 0, 0, 1, 0, 1, 1],
    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    [1, 0, 0, 1, 0, 0, 0, 0, 1, 0]]
)

matrix

array([[1, 0, 1, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0]])

In [None]:
# Both rating values are requested, but with the default positive_only=True
# only the rows whose feedback is > 0 (the 1s) are kept.
wide_to_long(matrix, [0,1])

Unnamed: 0,user_id,item_id,feedback
0,0,0,1
1,0,2,1
2,0,4,1
3,0,6,1
4,0,9,1
5,1,0,1
6,5,0,1
7,5,7,1
8,5,8,1
9,6,3,1


In [None]:
#hide
# Record the author, last-updated timestamp, machine details and package
# versions for this run (see the output below).
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

numpy  1.19.5
pandas 1.1.5
Sparsh A. 
last updated: 2022-01-29 14:31:54 

recohut 0.0.12

compiler   : GCC 7.5.0
system     : Linux
release    : 5.4.144+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
