In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Getting Started Dressipi: Download and Convert

## Dressipi
The [Dressipi](https://www.dressipi-recsys2022.com/) dataset was used in the RecSys Challenge 2022. It contains 1.1M online retail sessions that each resulted in a purchase. Although the dataset gives no information about the users, it provides details about the items viewed in a session, the item purchased at the end of the session, and numerous features of those items. The task of the competition was to predict, given the sequence of items viewed in a session, which item will be purchased at the end of that session. To learn more about the dataset, go [here](http://www.recsyschallenge.com/2022/dataset.html).   


<img src="images/dressipi.jpeg" alt="dressipi_dataset" style="width: 400px; float: center;">  


## Download the dataset

In [28]:
# The dataset is not yet publicly available.
# For now, you can register for the competition to download it.

# This notebook expects the downloaded CSV files in the `dressipi` folder next to this notebook (see DATA_FOLDER below).

In [1]:
import os
import cudf
import dask_cudf
import pandas as pd 

import nvtabular as nvt
from merlin.dag import ColumnSelector
from merlin.io import Dataset
from merlin.schema import Schema, Tags
from nvtabular.ops import (
    AddMetadata,
)
from merlin.schema.tags import Tags

# Folder holding the raw Dressipi CSV files read by the preprocessing functions below.
DATA_FOLDER = 'dressipi'
# Folder that receives the selected item features CSV and the processed Parquet files.
OUTPUT_FOLDER = 'dressipi_processed'
# Datetime unit used when casting the `date` column to datetime64 ('ms' = milliseconds).
DATETIME_CONVERTION = 'ms'

## Data Preprocessing

In [2]:
# Keep the hand-picked feature categories plus every category whose item coverage is at
# least `category_coverage_min` (80% by default); this helped in keeping quality features.
def process_item_features(DATA_FOLDER, OUTPUT_FOLDER, category_coverage_min=0.8,
                          categories_to_exclude=None):
    """Select item feature categories and pivot them to one row per item.

    Reads ``item_features.csv`` from ``DATA_FOLDER``, keeps the rows whose
    ``feature_category_id`` is either on the hand-picked list below or has a
    coverage (rows of that category divided by the number of distinct items)
    of at least ``category_coverage_min``, pivots the kept rows so that every
    category becomes a column of ``feature_value_id`` values, and writes the
    result to ``<OUTPUT_FOLDER>/selected_item_features.csv``.

    Parameters
    ----------
    DATA_FOLDER : str
        Folder containing ``item_features.csv``.
    OUTPUT_FOLDER : str
        Folder for the output CSV; created when it does not exist.
    category_coverage_min : float, default 0.8
        Minimum coverage for a category that is not hand-picked to be kept.
    categories_to_exclude : iterable of int, optional
        Category ids to drop even if they were selected. ``None`` (default)
        drops nothing.

    Returns
    -------
    pandas.DataFrame
        One row per ``item_id``; all column names are strings.
    """
    raw = pd.read_csv(os.path.join(DATA_FOLDER, 'item_features.csv'))

    coverage = raw.feature_category_id.value_counts() / raw.item_id.nunique()
    hand_picked = [3, 4, 5, 17, 24, 30, 45, 46, 53, 55, 58, 63, 65, 73]
    keep = set(hand_picked) | set(coverage[coverage >= category_coverage_min].index.tolist())

    # NOTE(review): this function used to apply `~isin([[30, 4, 46, 28, 53, 1]])` here. The
    # doubly nested list never matched an integer category id, so nothing was excluded (the
    # recorded feature columns still contain 4, 30, 46 and 53). The default keeps that
    # behaviour; pass categories_to_exclude=[30, 4, 46, 28, 53, 1] to apply the exclusion.
    if categories_to_exclude is not None:
        keep -= set(categories_to_exclude)

    selected = raw[raw.feature_category_id.isin(sorted(keep))]

    # pivot_table uses its default aggregation (mean) when an item carries more than one
    # value for the same category.
    pivoted = selected.pivot_table(
        values='feature_value_id', index=['item_id'], columns='feature_category_id'
    ).reset_index()
    # Category ids become string column names.
    pivoted.columns = [str(col) for col in pivoted.columns]

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    pivoted.to_csv(os.path.join(OUTPUT_FOLDER, 'selected_item_features.csv'), index=False)
    return pivoted

# Derive the timestamp and day features from the date column
def process_date_column(ddf):
    """Parse ``date``, add ``timestamp`` and ``day`` columns, and order rows by session.

    The ``date`` column is cast to ``datetime64`` with the unit given by
    ``DATETIME_CONVERTION`` and its int64 representation is stored in
    ``timestamp``; both assignments are made on the frame that was passed in.
    The rows are then sorted by ``session_id`` and ``date`` with a fresh index,
    and ``day`` is set to the number of whole days between each row's date and
    the earliest date in this frame.

    Returns the sorted frame.
    """
    parsed = ddf['date'].astype(f'datetime64[{DATETIME_CONVERTION}]')
    ddf['date'] = parsed
    ddf['timestamp'] = ddf['date'].astype('int64')

    ordered = ddf.sort_values(['session_id', 'date']).reset_index(drop=True)

    # Day offsets are relative to this frame's own earliest date.
    earliest = ordered['date'].min()
    ordered['day'] = (ordered['date'] - earliest).dt.days
    return ordered

In [3]:
def get_preprocessed_data(valid_days=30):
    """Load the raw Dressipi files and build the train/validation/test frames.

    Every interaction file is read with cuDF, left-joined with the selected
    item features on ``item_id`` and passed through ``process_date_column``.
    The training sessions are then split by their ``day`` column: rows from
    the last ``valid_days`` days go to validation, the rest to training. The
    purchase rows of the sessions present in each split are appended to it.

    Parameters
    ----------
    valid_days : int, default 30
        Number of trailing days of ``train_sessions.csv`` used for validation.

    Returns
    -------
    tuple
        ``(train, valid, item_features, test_lb, test_final, sessions)`` where
        ``sessions`` is the processed, unsplit training-session frame.
    """
    # get the item features
    item_features = cudf.from_pandas(process_item_features(DATA_FOLDER, OUTPUT_FOLDER))

    def _load_interactions(file_name):
        # Read one interaction file, attach item features, add timestamp and day columns.
        frame = cudf.read_csv(os.path.join(DATA_FOLDER, file_name))
        frame = cudf.merge(frame, item_features, on='item_id', how='left')
        return process_date_column(frame)

    sessions = _load_interactions('train_sessions.csv')
    purchases = _load_interactions('train_purchases.csv')
    test_lb = _load_interactions("test_leaderboard_sessions.csv")
    test_final = _load_interactions("test_final_sessions.csv")

    # Split into train and validation set.
    # NOTE(review): the split is per row, so a session whose rows fall on both sides of the
    # cutoff contributes to both splits (and its purchase is appended to both) — confirm
    # this is acceptable.
    cutoff_day = sessions.day.max() - valid_days
    train_session = sessions.loc[sessions.day <= cutoff_day].copy().reset_index(drop=True)
    valid_session = sessions.loc[sessions.day > cutoff_day].copy().reset_index(drop=True)

    # Append the purchases that belong to the sessions of each split.
    train_purchases = purchases[purchases.session_id.isin(train_session.session_id.unique())]
    valid_purchases = purchases[purchases.session_id.isin(valid_session.session_id.unique())]
    train = cudf.concat([train_session, train_purchases])
    valid = cudf.concat([valid_session, valid_purchases])

    return train, valid, item_features, test_lb, test_final, sessions


train, valid, item_features, test_lb, test_final, sessions = get_preprocessed_data()

In [4]:
# (rows, columns) of the train and validation frames returned by get_preprocessed_data()
train.shape, valid.shape

((5206304, 27), (537516, 27))

## Feature Engineering with NVTabular

### Categorify

In [6]:
# Item feature columns: every column of item_features after the leading item_id column.
item_features_names = item_features.columns[1:].tolist()

# Categorify the session id, the item id and all item features; timestamp and date are
# passed through as they are.
cat_features = (['session_id', 'item_id'] + item_features_names) >> nvt.ops.Categorify()
features = ['timestamp', 'date'] + cat_features

# The workflow is fitted on the training sessions together with both test sets.
# NOTE(review): purchase rows are not part of the fit data — confirm how Categorify
# should encode purchased items that never appear in a session.
all_data = dask_cudf.concat(
    [Dataset(frame).to_ddf() for frame in (sessions, test_lb, test_final)]
)
dataset = Dataset(all_data)
workflow0 = nvt.Workflow(features)
workflow0.fit(dataset)

# transform data
train_0, valid_0, test_lb_0, test_final_0 = (
    workflow0.transform(Dataset(frame))
    for frame in (train, valid, test_lb, test_final)
)



### GroupBy

In [7]:
# Column names of the categorified training data. This rebinds `features` (previously the
# Categorify workflow graph) to a plain list of names, which the Groupby operator in the
# next cell is applied to.
features = train_0.head().columns.tolist()
print(features)

['session_id', 'item_id', '3', '4', '5', '7', '17', '24', '30', '45', '46', '47', '50', '53', '55', '56', '58', '61', '63', '65', '68', '69', '72', '73', 'timestamp', 'date']


In [8]:
# Define Groupby Operator
# Aggregations computed per session: first/last date, the item id as a list plus its first
# and last value, the timestamps as a list, and every item feature as a list.
to_aggregate = {
    'date': ["first", "last"],
    'item_id': ["list", "first", "last"],
    'timestamp': ["list"],
}
to_aggregate.update({feature: ['list'] for feature in item_features_names})

# One output row per session_id, with rows ordered by date before aggregating; output
# columns are named <column>_<aggregation>.
groupby_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    sort_cols=["date"],
    aggs=to_aggregate,
    name_sep="_",
)

# Add tags needed for the t4rec models definition
item_list = groupby_features['item_id_list'] >> nvt.ops.AddMetadata(
    tags=[Tags.SEQUENCE, Tags.ITEM, Tags.ITEM_ID, Tags.LIST]
)
item_first_last = groupby_features['item_id_last', 'item_id_first'] >> nvt.ops.AddMetadata(
    tags=[Tags.ITEM, Tags.ITEM_ID]
)
feature_list = groupby_features[
    [f'{feature}_list' for feature in item_features_names]
] >> nvt.ops.AddMetadata(tags=[Tags.SEQUENCE, Tags.ITEM, Tags.LIST])
# Remaining aggregated columns are kept without extra tags.
other_features = groupby_features['session_id', 'date_first', 'date_last', 'timestamp_list']

In [9]:
# Session-level workflow: tagged item-id list, first/last item id, item-feature lists and
# the remaining aggregated columns.
workflow1 = nvt.Workflow(item_list + item_first_last + feature_list + other_features)
# Fit on the training split, then apply the same graph to every split.
workflow1.fit(train_0)

# transform data
train_1, valid_1, test_lb_1, test_final_1 = (
    workflow1.transform(split)
    for split in (train_0, valid_0, test_lb_0, test_final_0)
)

### Truncate and Pad to a Maximum Sequence Length

In [10]:
# Length every list column is sliced / padded to.
SESSIONS_MAX_LENGTH = 20

# All aggregated list columns except the date-derived ones.
list_cols = [
    column
    for column in train_1.head().columns
    if 'list' in column and 'date' not in column
]

# Slice each list with a start of -SESSIONS_MAX_LENGTH, pad it, and add the `_seq` suffix
# to the resulting column names.
truncated_fatures = (
    list_cols
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH, pad=True)
    >> nvt.ops.Rename(postfix='_seq')
)

# Columns carried into the final dataset without slicing.
final_features = ['session_id', 'date_first', 'date_last', 'item_id_list', 'item_id_first']

### Add the TARGET tag to the label column

In [11]:
# Column treated as the label: the last item id of each session.
LABEL_COLUMNS = ['item_id_last']
# Attach the tags to the label column.
# NOTE(review): the label is an item id, yet it is tagged with the string form of
# Tags.BINARY_CLASSIFICATION plus the plain string "target" — confirm which tags the
# downstream model actually expects for this column.
label_features = LABEL_COLUMNS >> AddMetadata(tags=[str(Tags.BINARY_CLASSIFICATION), "target"])

In [12]:
# Final workflow: pass-through columns, the sliced/padded `_seq` columns and the tagged label.
workflow2 = nvt.Workflow(final_features + truncated_fatures + label_features)
workflow2.fit(train_1)

# transform data
train_2, valid_2, test_lb_2, test_final_2 = (
    workflow2.transform(split)
    for split in (train_1, valid_1, test_lb_1, test_final_1)
)

## Save processed data to Parquet files

In [13]:
# Order every split by the date of the session's last interaction, keeping the schema
# produced by workflow2 attached to the re-wrapped Dataset.
train_ds, valid_ds, test_lb_ds, test_final_ds = (
    Dataset(split.to_ddf().sort_values('date_last'), schema=split.schema)
    for split in (train_2, valid_2, test_lb_2, test_final_2)
)

In [14]:
# Materialise the sorted training set to inspect its final columns (the rendered table is
# truncated to the first and last rows).
# NOTE(review): this computes the whole frame; `train_ds.head()` would show a sample instead.
train_ds.compute()

Unnamed: 0,item_id_list_seq,3_list_seq,4_list_seq,5_list_seq,7_list_seq,17_list_seq,24_list_seq,30_list_seq,45_list_seq,46_list_seq,...,69_list_seq,72_list_seq,73_list_seq,timestamp_list_seq,session_id,date_first,date_last,item_id_list,item_id_first,item_id_last
915007,"[5056, 14302, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1577836848505, 1577836931060, 0, 0, 0, 0, 0, ...",1093101,2020-01-01 00:00:48.505,2020-01-01 00:02:11.060,"[5056, 14302]",5056,14302
508121,"[16047, 16047, 16380, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[43, 43, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1577837388157, 1577837392394, 1577837518940, ...",610982,2020-01-01 00:09:48.157,2020-01-01 00:11:58.940,"[16047, 16047, 16380]",16047,16380
744322,"[917, 16143, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1577837555593, 1577837610429, 0, 0, 0, 0, 0, ...",889862,2020-01-01 00:12:35.593,2020-01-01 00:13:30.429,"[917, 16143]",917,16143
637589,"[10895, 15154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1577837569201, 1577837610853, 0, 0, 0, 0, 0, ...",763091,2020-01-01 00:12:49.201,2020-01-01 00:13:30.853,"[10895, 15154]",10895,15154
447194,"[5937, 3478, 11637, 6516, 0, 0, 0, 0, 0, 0, 0,...","[1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 1, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[8, 2, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[17, 17, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1, 16, 1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 1, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1577837545450, 1577837571259, 1577837656096, ...",539081,2020-01-01 00:12:25.450,2020-01-01 00:14:26.055,"[5937, 3478, 11637, 6516]",5937,6516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142124,"[4368, 14, 112, 1632, 25, 225, 25, 25, 7, 0, 0...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 6, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[1619899504291, 1619899524945, 1619899574219, ...",174811,2021-05-01 20:05:04.291,2021-05-01 23:31:55.899,"[4368, 14, 112, 1632, 25, 225, 25, 25, 7]",4368,7
590158,"[7140, 7140, 3644, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1619912079113, 1619912105158, 1619912151676, ...",707140,2021-05-01 23:34:39.113,2021-05-01 23:35:51.676,"[7140, 7140, 3644]",7140,3644
585327,"[11213, 8, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[15, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[1, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1619910407177, 1619912353536, 1619912369987, ...",701427,2021-05-01 23:06:47.177,2021-05-01 23:39:29.987,"[11213, 8, 38]",11213,38
111221,"[13194, 373, 99, 8, 99, 188, 436, 4194, 166, 2...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 6, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, ...","[5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[1619910867805, 1619911351299, 1619911629837, ...",137175,2021-05-01 23:14:27.805,2021-05-01 23:41:19.131,"[13194, 373, 99, 8, 99, 188, 436, 4194, 166, 20]",13194,20


In [15]:
%%time

train_ds.to_parquet(os.path.join(OUTPUT_FOLDER, "train/"), output_files=10)
valid_ds.to_parquet(os.path.join(OUTPUT_FOLDER, "valid/"), output_files=10)
test_lb_ds.to_parquet(os.path.join(OUTPUT_FOLDER, "test_leaderboard/"), output_files=10)
test_final_ds.to_parquet(os.path.join(OUTPUT_FOLDER, "test_final/"), output_files=10)

CPU times: user 36.2 s, sys: 35.7 s, total: 1min 11s
Wall time: 2min 51s
