In [3]:
from pathlib import Path
import pickle
import warnings

import numpy as np
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import EncoderNormalizer, GroupNormalizer, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.metrics import NormalDistributionLoss
from pytorch_forecasting.models.deepar import DeepAR

warnings.simplefilter("error", category=SettingWithCopyWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.read_parquet('../../generic-buy-now-pay-later-project-group-40/data/curated/weighted_transactions.parquet', )
data['order_datetime'] = pd.to_datetime(data['order_datetime'])

# Transformations

Add 0 values for days on which merchants had no transactions

In [3]:
time_steps = data[['order_datetime']].drop_duplicates()
merchants = data[['merchant_abn']].drop_duplicates()
time_steps['key'] = 1
merchants['key'] = 1
merchant_time_steps = pd.merge(
    merchants,
    time_steps,
    on = 'key'
).drop('key', axis = 1)

data = pd.merge(
    data,
    merchant_time_steps,
    on = ['merchant_abn', 'order_datetime'],
    how = 'outer'
).fillna(0)

Add time_idx column

In [4]:
data['time_idx'] = (
    (
        data['order_datetime'].sort_values() - data['order_datetime'].min()
    )/np.timedelta64(1, 'D')
).astype(int)

Add monthofyear and dayofweek columns

In [5]:
data['week_of_year'] = data['order_datetime'].dt.isocalendar().week.astype(str)
data['day_of_week'] = data['order_datetime'].dt.dayofweek.astype(str)

Randomly select 100 merchants to test on

In [6]:
selected_merchants = data['merchant_abn'].sample(100, random_state = 100)
data = data[data['merchant_abn'].isin(selected_merchants)]

# Model

In [7]:
max_prediction_length = 365
max_encoder_length = int(data['time_idx'].max())
#training_cutoff = data["time_idx"].max() - max_prediction_length

In [18]:


training = TimeSeriesDataSet(
    data,
    time_idx="time_idx",
    target="weighted_dollar_value",
    group_ids=["merchant_abn"],
    min_encoder_length=max_encoder_length//2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    time_varying_known_categoricals = ["day_of_week", "week_of_year"],
    time_varying_unknown_reals=["weighted_dollar_value"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=GroupNormalizer(groups=["merchant_abn"]),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    randomize_length=None,
)

"""validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    stop_randomization=True
)"""
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=8)
"""val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)"""

# save datasets
training.save("training.pkl")
"""validation.save("validation.pkl")"""

'validation.save("validation.pkl")'

In [24]:
early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=1e-4, patience=5, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=1,
    gradient_clip_val=0.1,
    limit_train_batches=30,
    #limit_val_batches=3,
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
    callbacks=[lr_logger, early_stop_callback],
    accelerator = 'cpu'
)


deepar = DeepAR.from_dataset(
    training,
    learning_rate=0.1,
    hidden_size=32,
    dropout=0.1,
    loss=NormalDistributionLoss(),
    log_interval=10,
    #log_val_interval=3,
    # reduce_on_plateau_patience=3,
)
print(f"Number of parameters in network: {deepar.size()/1e3:.1f}k")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Number of parameters in network: 17.0k


In [25]:
torch.set_num_threads(10)
trainer.fit(
    deepar,
    train_dataloaders=train_dataloader,
    #val_dataloaders=val_dataloader,
)


  | Name                   | Type                   | Params
------------------------------------------------------------------
0 | loss                   | NormalDistributionLoss | 0     
1 | logging_metrics        | ModuleList             | 0     
2 | embeddings             | MultiEmbedding         | 815   
3 | rnn                    | LSTM                   | 16.1 K
4 | distribution_projector | Linear                 | 66    
------------------------------------------------------------------
17.0 K    Trainable params
0         Non-trainable params
17.0 K    Total params
0.068     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 30/30 [03:44<00:00,  7.50s/it, loss=0.834, v_num=4, train_loss_step=0.652, train_loss_epoch=0.873]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 30/30 [03:44<00:00,  7.50s/it, loss=0.834, v_num=4, train_loss_step=0.652, train_loss_epoch=0.873]


### Generate data to predict next 12 months

In [44]:
encoder_data = data[lambda x: x.time_idx > x.time_idx.max() - max_encoder_length]

last_data = data[lambda x: x.time_idx == x.time_idx.max()]

decoder_data = pd.concat(
    [last_data.assign(order_datetime=lambda x: x.order_datetime + pd.offsets.Day(i)) for i in range(1, max_prediction_length + 1)],
    ignore_index=True,
)

# add time index consistent with "data"
decoder_data['time_idx'] = (
    (
        decoder_data['order_datetime'].sort_values() - decoder_data['order_datetime'].min()
    )/np.timedelta64(1, 'D')
).astype(int)

decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()

# adjust additional time feature(s)
decoder_data['week_of_year'] = decoder_data['order_datetime'].dt.isocalendar().week.astype(str)
decoder_data['day_of_week'] = decoder_data['order_datetime'].dt.dayofweek.astype(str)
# combine encoder and decoder data
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

### Plot 10 example merchant predictions

In [None]:
new_raw_predictions, new_x = deepar.predict(new_prediction_data, mode="raw", return_x=True)

for idx in range(10):  # plot 10 examples
    deepar.plot_prediction(new_x, new_raw_predictions, idx=idx, show_future_observed=False);

### Convert output to df and write to disk

In [None]:
predictions, index = deepar.predict(new_raw_predictions, mode="prediction", return_index=True)

predictions_df = pd.DataFrame(
    predictions.numpy()
).reset_index().melt(
    id_vars = 'index', var_name = 'rel_time_idx'
)

index_df = index.reset_index()

predictions_df = pd.merge(
    index_df,
    predictions_df,
    on = 'index'
)

predictions_df['time_idx'] = predictions_df['time_idx'] + predictions_df['rel_time_idx']

predictions_df = predictions_df.rename(columns = {'index' : 'merchant_idx'}).drop('rel_time_idx', axis = 1)

In [None]:
predictions_df.to_parquet('../data/curated/transaction_predictions.parquet')