In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomColesValidationDataset, CustomCoLES
from src.local_validation import LocalValidationModel

# Churn dataset with real local targets

In [3]:
DATASET = "churn"

with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name="config_" + DATASET)
    
cfg_preprop = cfg["dataset"]
cfg_model = cfg["model"]

In [4]:
df = pd.read_parquet(Path(cfg["dataset"]["dir_path"]).joinpath(cfg["dataset"]["train_file_name"]))
df.head()

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,5200,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,6011,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,5921,2017-12-05 00:00:00,767.0,0,0,0,1
3,0,5411,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,6012,2017-10-24 13:14:24,36562.0,0,0,0,0


In [5]:
local_target = cfg_model["validation_dataset"]["local_target_col"]

preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp", # no time preprocessing
    cols_category=["mcc_code"],
    cols_numerical=["amount", local_target], # keep column with fake local targets
    return_records=True
)

dataset = preprocessor.fit_transform(df)

train, val_test = train_test_split(dataset, test_size=.2, random_state=42)
val, test = train_test_split(val_test, test_size=.5, random_state=42)

In [6]:
# initialize original CoLES datasest - for CoLES training
train_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=train)
val_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=val)
    
train_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data,
    valid_data=val_data
)

In [7]:
model_churn: CustomCoLES = instantiate(cfg_model["model"])

In [8]:
model_churn.load_state_dict(torch.load("saved_models/coles_churn_default.pth"))

<All keys matched successfully>

## Without LSTM

In [9]:
# initialize custom datasets and datamodule for local validation
# use the same 'train' and 'val' preprocessed data
train_data_local: CustomColesValidationDataset = instantiate(cfg_model["validation_dataset"], data=train)
val_data_local: CustomColesValidationDataset = instantiate(cfg_model["validation_dataset"], data=val)
test_data_local: CustomColesValidationDataset = instantiate(cfg_model["validation_dataset"], data=test)

# keep batch_size = 1 (all slices of one user in one batch)
# or may use batch_size > 1 to speed-up LocalValidationModel training
val_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data_local,
    valid_data=val_data_local,
    test_data=test_data_local,
    train_batch_size=1,
    valid_batch_size=1,
    test_batch_size=1
)

In [10]:
valid_batch, local_labels = next(iter(val_datamodule.val_dataloader()))
valid_batch.payload['event_time'].shape

torch.Size([25, 40])

In [12]:
EMBD_SIZE = 1024
HIDDEN_SIZE = 32

valid_model = LocalValidationModel(
    backbone=model_churn,
    backbone_embd_size=EMBD_SIZE,
    hidden_size=HIDDEN_SIZE,
    use_lstm = False,
)

In [13]:
backbone_out = valid_model.backbone(valid_batch)
print("CoLES embeddings:", backbone_out.shape)

pred_out = valid_model(valid_batch)
print("Predicted labels:", pred_out.shape)

print("True local labels:", local_labels.shape)

CoLES embeddings: torch.Size([25, 1024])
Predicted labels: torch.Size([25])
True local labels: torch.Size([25])


In [14]:
val_trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=5,
)
    
val_trainer.fit(valid_model, val_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 4.3 M 
1 | pred_head | Sequential  | 32.8 K
2 | loss      | BCELoss     | 0     
------------------------------------------
32.8 K    Trainable params
4.3 M     Non-trainable params
4.4 M     Total params
17.530    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [15]:
val_trainer.test(valid_model, val_datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC              0.597362220287323
        Accuracy            0.5331345796585083
         F1Score             0.634706437587738
         PR-AUC             0.6008473634719849
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.597362220287323,
  'PR-AUC': 0.6008473634719849,
  'Accuracy': 0.5331345796585083,
  'F1Score': 0.634706437587738}]

## With LSTM

In [16]:
EMBD_SIZE = 1024
HIDDEN_SIZE = 32
LSTM_HIDDEN_SIZE = 256
LSTM_NUM_LAYERS = 1
LSTM_BIDIRECTIONAL = False

valid_model = LocalValidationModel(
    backbone=model_churn,
    backbone_embd_size=EMBD_SIZE,
    hidden_size=HIDDEN_SIZE,
    use_lstm = True,
    lstm_hidden_size = LSTM_HIDDEN_SIZE,
    lstm_num_layers = LSTM_NUM_LAYERS,
    lstm_bidirectional = LSTM_BIDIRECTIONAL
)

In [17]:
backbone_out = valid_model.backbone(valid_batch)
print("CoLES embeddings:", backbone_out.shape)

lstm_out = valid_model.lstm(backbone_out)[0]
print("LSTM outs:", lstm_out.shape)

pred_out = valid_model(valid_batch)
print("Predicted labels:", pred_out.shape)

print("True local labels:", local_labels.shape)

CoLES embeddings: torch.Size([25, 1024])
LSTM outs: torch.Size([25, 256])
Predicted labels: torch.Size([25])
True local labels: torch.Size([25])


In [18]:
val_trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=5,
)
    
val_trainer.fit(valid_model, val_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 4.3 M 
1 | lstm      | LSTM        | 1.3 M 
2 | pred_head | Sequential  | 41.0 K
3 | loss      | BCELoss     | 0     
------------------------------------------
1.4 M     Trainable params
4.3 M     Non-trainable params
5.7 M     Total params
22.814    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [19]:
val_trainer.test(valid_model, val_datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.6037269830703735
        Accuracy            0.5554093718528748
         F1Score            0.5954945683479309
         PR-AUC             0.5988376140594482
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.6037269830703735,
  'PR-AUC': 0.5988376140594482,
  'Accuracy': 0.5554093718528748,
  'F1Score': 0.5954945683479309}]