## Resample transactions

- the transaction data spans for multiple years where the 
    - Development set: Training and Validation; train from `2018 week-38` to `2020 Week-32`
    - Test set: From `2020 Week-33` to `2020 Week-39`
- Select last `n` transactions for each customer during this period to generate a smaller dataset.
- Use this dataset to generate negative examples for each customer

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
import _read_data_files_helper as data_helper

In [None]:
df_transactions = data_helper.read_transactions(Path(""))
n_recent_tx = 11

In [None]:
max_tr_date = df_transactions[(df_transactions.tx_year == 2020) & (df_transactions.tx_week == 28)].t_dat.max()
training_data = df_transactions[df_transactions.t_dat <= max_tr_date]

In [None]:
## rank based on recent transactions
training_data['rank_'] = training_data.groupby(['customer_id']).t_dat.rank(method='dense', ascending=False)

In [None]:
tr_final = training_data[training_data.rank_ < n_recent_tx].filter(['customer_id', 'article_id']).drop_duplicates()
tr_final['purchase'] = 1

In [None]:
tr_final.shape, training_data.shape, df_transactions.shape

In [None]:
tr_final.to_parquet("train_tx_last_n_active.parquet")

Temp solution to reduce training data

In [10]:
from tqdm import tqdm

In [11]:
s1 = "CF_model_input_neucf_S1"
s2 = "CF_model_input_neucf_S2"
s3 = "CF_model_input_neucf_S3"

In [12]:
path_ = Path("")

In [13]:
def take_subsample(dataset_):
    positives = dataset_[dataset_.purchase == 1]
    negatives = dataset_[dataset_.purchase == 0]

    groups_ = negatives.groupby(["customer_id"])

    df_list = []
    for grp in tqdm(groups_):
        df_list.append(
            [grp[0], grp[1].article_id.sample(n=5, 
                                              replace=False)]
     )

    test_ = pd.DataFrame(df_list, columns=["customer_id", "article_id"])
    df_list.clear()
    test_ = test_.explode(["article_id"])
    test_["purchase"] = 0

    assert int(test_.shape[0] / 5) == dataset_.customer_id.nunique()

    df_final = pd.concat([positives, test_]).astype("int32")


    return df_final

In [21]:
data_ = pd.read_parquet(path_/s3/'train_ds_stg_3.parquet')
subsample = take_subsample(data_)
assert subsample[subsample.purchase==0].groupby(['customer_id']).article_id.count().mean() == 5

100%|██████████| 1115557/1115557 [02:31<00:00, 7350.14it/s]


In [22]:
subsample.to_parquet(path_/s1/'train_ds_sample_stg_3.parquet')