In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomColesValidationDataset, CustomCoLES
from src.local_validation import LocalValidationModel

2023-08-27 12:40:47.235939: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Use Age dataset with fake local labels

## Init original datasets and datamodule

In [48]:
DATASET = "age"

# Compose the Hydra config whose name is derived from the dataset ("config_age").
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name=f"config_{DATASET}")

# Handles on the two top-level config sections.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [24]:
# The training table is stored at <dir_path>/<train_file_name> from the dataset config.
dataset_cfg = cfg["dataset"]
df = pd.read_parquet(Path(dataset_cfg["dir_path"]) / dataset_cfg["train_file_name"])
df.head()

Unnamed: 0,user_id,timestamp,mcc_code,amount,global_target
0,33172,6,4,71.463,0
1,33172,6,35,45.017,0
2,33172,8,11,13.887,0
3,33172,9,11,15.983,0
4,33172,10,11,21.341,0


In [25]:
# Attach a placeholder local label for Age: every row gets the constant 1.
df["fake_local_label"] = np.ones(len(df), dtype=int)
df.head()

Unnamed: 0,user_id,timestamp,mcc_code,amount,global_target,fake_local_label
0,33172,6,4,71.463,0,1
1,33172,6,35,45.017,0,1
2,33172,8,11,13.887,0,1
3,33172,9,11,15.983,0,1
4,33172,10,11,21.341,0,1


In [26]:
# Turn the flat transaction table into per-user records for ptls.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="none",  # no time preprocessing (timestamps are already integers)
    cols_category=["mcc_code"],
    cols_numerical=["amount", "fake_local_label"],  # keep column with fake local targets
    return_records=True,
)

dataset = preprocessor.fit_transform(df)

# Fix the seed of the split: model weights are saved to disk and reloaded
# further down, so an unseeded split would let a later session evaluate them
# on users they were trained on.
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [27]:
# Initialize the original CoLES datasets (used for CoLES training).
coles_dataset_cfg = cfg_model["dataset"]
train_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=train)
val_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=val)

# Combine both splits into the datamodule consumed by the trainer.
train_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"], train_data=train_data, valid_data=val_data
)

# Initialize and train CoLES model

In [28]:
# Instantiate the CoLES model from the `model` node of the Hydra config.
model: CustomCoLES = instantiate(cfg_model["model"])

In [29]:
trainer_cfg = cfg_model["trainer_coles"]

# Checkpointing and early stopping both follow the model's own validation
# metric, which is maximized.
monitor_kwargs = {"monitor": model.metric_name, "mode": "max"}
model_checkpoint: ModelCheckpoint = instantiate(trainer_cfg["checkpoint_callback"], **monitor_kwargs)
early_stopping: EarlyStopping = instantiate(trainer_cfg["early_stopping"], **monitor_kwargs)

logger: TensorBoardLogger = instantiate(trainer_cfg["logger"])

trainer: Trainer = instantiate(
    trainer_cfg["trainer"],
    callbacks=[model_checkpoint, early_stopping],
    logger=logger,
)

trainer.fit(model, train_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 12.9 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
12.9 K    Trainable params
0         Non-trainable params
12.9 K    Total params
0.052     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved. New best score: 0.081


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.103 >= min_delta = 0.01. New best score: 0.185


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.067 >= min_delta = 0.01. New best score: 0.252


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.042 >= min_delta = 0.01. New best score: 0.294


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.019 >= min_delta = 0.01. New best score: 0.313


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.013 >= min_delta = 0.01. New best score: 0.326


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.011 >= min_delta = 0.01. New best score: 0.338


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.012 >= min_delta = 0.01. New best score: 0.349


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric recall_top_k did not improve in the last 5 records. Best score: 0.349. Signaling Trainer to stop.


In [33]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("saved_models", exist_ok=True)
# NOTE(review): this stores the weights the model holds when `trainer.fit`
# returns, which need not be those of the best ModelCheckpoint epoch - confirm
# that this is intended.
torch.save(model.state_dict(), "saved_models/coles_age_default.pth")

# Local validation pipeline

In [49]:
# Build the custom local-validation datasets on top of the same preprocessed
# 'train' and 'val' records.
local_dataset_cfg = cfg_model["validation_dataset"]
train_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=train)
val_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=val)

# Validation/test keep batch_size = 1 (all slices of one user in one batch);
# training uses batch_size > 1 to speed up LocalValidationModel training.
# The validation split is also used as the test split.
batch_sizes = {"train_batch_size": 10, "valid_batch_size": 1, "test_batch_size": 1}
val_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data_local,
    valid_data=val_data_local,
    test_data=val_data_local,
    **batch_sizes,
)

### Investigate validation batch structure 

In [35]:
# Take the first validation batch together with its local labels.
val_loader = val_datamodule.val_dataloader()
valid_batch, local_labels = next(iter(val_loader))

In [36]:
# The batch has the original CoLES format plus local targets; the local targets
# come back separately and are not passed to the CoLES model.
# Each tensor is of shape (num_slices, seq_len), num_slices = (num_timesteps - seq_len) // stride.
# Using stride=1 is not feasible: it yields too many slices, which makes
# training of LocalValidationModel too long.

payload = valid_batch.payload
payload['timestamp'].shape, payload

(torch.Size([77, 40]),
 {'timestamp': tensor([[  1,   2,   2,  ...,  43,  46,  47],
          [ 13,  15,  19,  ...,  60,  62,  62],
          [ 28,  29,  29,  ...,  71,  72,  72],
          ...,
          [678, 678, 679,  ..., 707, 708, 709],
          [683, 683, 684,  ..., 713, 713, 714],
          [693, 694, 697,  ..., 723, 724, 725]]),
  'global_target': tensor([[3, 3, 3,  ..., 3, 3, 3],
          [3, 3, 3,  ..., 3, 3, 3],
          [3, 3, 3,  ..., 3, 3, 3],
          ...,
          [3, 3, 3,  ..., 3, 3, 3],
          [3, 3, 3,  ..., 3, 3, 3],
          [3, 3, 3,  ..., 3, 3, 3]]),
  'event_time': tensor([[  1,   2,   2,  ...,  43,  46,  47],
          [ 13,  15,  19,  ...,  60,  62,  62],
          [ 28,  29,  29,  ...,  71,  72,  72],
          ...,
          [678, 678, 679,  ..., 707, 708, 709],
          [683, 683, 684,  ..., 713, 713, 714],
          [693, 694, 697,  ..., 723, 724, 725]]),
  'mcc_code': tensor([[ 3, 14, 11,  ...,  1,  3,  2],
          [ 1, 19,  3,  ...,  3,  7,

In [37]:
# Local labels that came with the batch above (fake for Age).
local_labels.shape

torch.Size([77])

## CoLES embeddings for a single user

If batch_size is 1, then the batch contains all slices (consecutive sliding windows) of a single user. As a result, we get an embedding tensor of shape (slices_num, embd_size).

In [50]:
# Inspection-only forward pass: gradients are not needed here, so skip
# autograd bookkeeping.
with torch.no_grad():
    emb = model(valid_batch)

# shape is (num_slices, embd_size)
emb.shape

torch.Size([77, 32])

## Local validation pipeline

In [59]:
# Size of the backbone embedding and of the prediction head's hidden layer.
EMBD_SIZE, HIDDEN_SIZE = 32, 32

# Use the CoLES model trained above as the backbone of the local-target model.
valid_model = LocalValidationModel(
    hidden_size=HIDDEN_SIZE,
    backbone_embd_size=EMBD_SIZE,
    backbone=model,
)

In [60]:
# Shape sanity check only - run both forward passes without tracking gradients.
with torch.no_grad():
    backbone_out = valid_model.backbone(valid_batch)
    pred_out = valid_model(valid_batch)

# Print the three shapes side by side for comparison.
print("CoLES embeddings:", backbone_out.shape)
print("Predicted labels:", pred_out.shape)
print("True local labels:", local_labels.shape)

CoLES embeddings: torch.Size([77, 32])
Predicted labels: torch.Size([77])
True local labels: torch.Size([77])


In [52]:
# Fit the local validation model for a single epoch.
# Fall back to CPU when no CUDA device is present instead of failing at
# Trainer construction.
val_trainer = Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    max_epochs=1,
)

val_trainer.fit(valid_model, val_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 12.9 K
1 | pred_head | Sequential  | 1.1 K 
2 | loss      | BCELoss     | 0     
------------------------------------------
1.1 K     Trainable params
12.9 K    Non-trainable params
14.0 K    Total params
0.056     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [56]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("saved_models", exist_ok=True)
torch.save(valid_model.state_dict(), "saved_models/validation_age.pth")

In [66]:
# map_location="cpu" lets the file load even when the device it was saved from
# is not available; load_state_dict then copies the values into the model's
# existing parameters.
valid_model.load_state_dict(torch.load("saved_models/validation_age.pth", map_location="cpu"))

<All keys matched successfully>

In [67]:
# Evaluate on the datamodule's test data, which was set to the validation
# split (test_data=val_data_local).
val_trainer.test(valid_model, val_datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

[{'Test acc': 1.0, 'Test f1_score': 1.0}]

# Churn dataset with real local targets

In [3]:
DATASET = "churn"

# Compose the Hydra config whose name is derived from the dataset ("config_churn").
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name=f"config_{DATASET}")

# Handles on the two top-level config sections.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [4]:
# The training table is stored at <dir_path>/<train_file_name> from the dataset config.
dataset_cfg = cfg["dataset"]
df = pd.read_parquet(Path(dataset_cfg["dir_path"]) / dataset_cfg["train_file_name"])
df.head()

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,5200,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,6011,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,5921,2017-12-05 00:00:00,767.0,0,0,0,1
3,0,5411,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,6012,2017-10-24 13:14:24,36562.0,0,0,0,0


In [5]:
# Name of the column holding the local target, taken from the config.
local_target = cfg_model["validation_dataset"]["local_target_col"]

# Turn the flat transaction table into per-user records for ptls.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",  # convert datetime values to numeric timestamps
    cols_category=["mcc_code"],
    cols_numerical=["amount", local_target],  # keep the column with the real local target
    return_records=True,
)

dataset = preprocessor.fit_transform(df)

# Fix the seed of the split: the CoLES weights are saved to disk and reloaded
# further down, so an unseeded split would let a later session evaluate them
# on users they were trained on.
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [6]:
# Initialize the original CoLES datasets (used for CoLES training).
coles_dataset_cfg = cfg_model["dataset"]
train_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=train)
val_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=val)

# Combine both splits into the datamodule consumed by the trainer.
train_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"], train_data=train_data, valid_data=val_data
)

In [7]:
# Instantiate the CoLES model for Churn from the `model` node of the Hydra config.
model_churn: CustomCoLES = instantiate(cfg_model["model"])

In [10]:
trainer_cfg = cfg_model["trainer_coles"]

# Checkpointing and early stopping both follow the model's own validation
# metric, which is maximized.
monitor_kwargs = {"monitor": model_churn.metric_name, "mode": "max"}
model_checkpoint: ModelCheckpoint = instantiate(trainer_cfg["checkpoint_callback"], **monitor_kwargs)
early_stopping: EarlyStopping = instantiate(trainer_cfg["early_stopping"], **monitor_kwargs)

logger: TensorBoardLogger = instantiate(trainer_cfg["logger"])

trainer: Trainer = instantiate(
    trainer_cfg["trainer"],
    callbacks=[model_checkpoint, early_stopping],
    logger=logger,
)

trainer.fit(model_churn, train_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 17.5 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
17.5 K    Trainable params
0         Non-trainable params
17.5 K    Total params
0.070     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved. New best score: 0.335


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.038 >= min_delta = 0.01. New best score: 0.372


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.025 >= min_delta = 0.01. New best score: 0.398


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.016 >= min_delta = 0.01. New best score: 0.414


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.015 >= min_delta = 0.01. New best score: 0.429


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.016 >= min_delta = 0.01. New best score: 0.445


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.012 >= min_delta = 0.01. New best score: 0.457


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric recall_top_k did not improve in the last 5 records. Best score: 0.457. Signaling Trainer to stop.


In [12]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("saved_models", exist_ok=True)
# NOTE(review): this stores the weights the model holds when `trainer.fit`
# returns, which need not be those of the best ModelCheckpoint epoch - confirm
# that this is intended.
torch.save(model_churn.state_dict(), "saved_models/coles_churn_default.pth")

In [8]:
# map_location="cpu" lets the file load even when the device it was saved from
# is not available; load_state_dict then copies the values into the model's
# existing parameters.
model_churn.load_state_dict(torch.load("saved_models/coles_churn_default.pth", map_location="cpu"))

<All keys matched successfully>

## Local target validation

In [9]:
# Build the custom local-validation datasets on top of the same preprocessed
# 'train' and 'val' records.
local_dataset_cfg = cfg_model["validation_dataset"]
train_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=train)
val_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=val)

# Validation/test keep batch_size = 1 (all slices of one user in one batch);
# training uses batch_size > 1 to speed up LocalValidationModel training.
# The validation split is also used as the test split.
batch_sizes = {"train_batch_size": 10, "valid_batch_size": 1, "test_batch_size": 1}
val_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data_local,
    valid_data=val_data_local,
    test_data=val_data_local,
    **batch_sizes,
)

In [10]:
# Take the first validation batch together with its local labels.
val_loader = val_datamodule.val_dataloader()
valid_batch, local_labels = next(iter(val_loader))

# Show the shape of one payload tensor next to the full payload.
payload = valid_batch.payload
payload['event_time'].shape, payload

(torch.Size([129, 40]),
 {'global_target': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]]),
  'holiday_target': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]),
  'weekend_target': tensor([[0, 0, 0,  ..., 1, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 1,  ..., 1, 1, 1],
          [0, 1, 1,  ..., 1, 1, 1],
          [1, 1, 0,  ..., 1, 1, 1]]),
  'event_time': tensor([[1479686400, 1479686400, 1479772800,  ..., 1481414400, 1481500800,
           1481535960],
          [1479686400, 1479772800, 1479772800,  ..., 1481500800, 1481535960,
           1481587200],
        

In [11]:
# Local labels that came with the batch above.
local_labels.shape

torch.Size([129])

In [12]:
# Size of the backbone embedding and of the prediction head's hidden layer.
EMBD_SIZE, HIDDEN_SIZE = 32, 32

# Use the Churn CoLES model as the backbone of the local-target model.
valid_model = LocalValidationModel(
    hidden_size=HIDDEN_SIZE,
    backbone_embd_size=EMBD_SIZE,
    backbone=model_churn,
)

In [13]:
# Shape sanity check only - run both forward passes without tracking gradients.
with torch.no_grad():
    backbone_out = valid_model.backbone(valid_batch)
    pred_out = valid_model(valid_batch)

# Print the three shapes side by side for comparison.
print("CoLES embeddings:", backbone_out.shape)
print("Predicted labels:", pred_out.shape)
print("True local labels:", local_labels.shape)

CoLES embeddings: torch.Size([129, 32])
Predicted labels: torch.Size([129])
True local labels: torch.Size([129])


In [14]:
# Fit the local validation model for five epochs.
# Fall back to CPU when no CUDA device is present instead of failing at
# Trainer construction.
val_trainer = Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    max_epochs=5,
)

val_trainer.fit(valid_model, val_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 17.5 K
1 | pred_head | Sequential  | 1.1 K 
2 | loss      | BCELoss     | 0     
------------------------------------------
1.1 K     Trainable params
17.5 K    Non-trainable params
18.6 K    Total params
0.074     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [15]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("saved_models", exist_ok=True)
torch.save(valid_model.state_dict(), "saved_models/validation_churn.pth")

In [16]:
# Evaluate on the datamodule's test data, which was set to the validation
# split (test_data=val_data_local).
val_trainer.test(valid_model, val_datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

[{'Test acc': 0.5216588973999023, 'Test f1_score': 0.011062705889344215}]