In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
import torch
import torch.nn as nn

import pytorch_lightning as pl

from pathlib import Path

import pandas as pd
import numpy as np

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset, CustomColesValidationDataset, CustomCoLES

2023-08-24 09:12:57.280370: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Init original datasets and datamodule

In [4]:
# Compose the `config_age` Hydra configuration from the `../config` directory.
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name="config_age")

In [5]:
# Read the training CSV whose location is given by the dataset config.
dataset_dir = Path(cfg["dataset"]["dir_path"])
df = pd.read_csv(dataset_dir / cfg["dataset"]["train_file_name"])

# The Age dataset has no local labels, so attach a constant fake one (all ones).
df["local_label"] = np.ones(len(df), dtype=int)
df.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur,local_label
0,33172,6,4,71.463,1
1,33172,6,35,45.017,1
2,33172,8,11,13.887,1
3,33172,9,11,15.983,1
4,33172,10,11,21.341,1


In [7]:
# Short aliases for the two config sections used throughout the notebook.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [8]:
# Dataset-specific column names, as declared in the dataset config.
user_column: str = cfg_preprop["user_column"]
dttm_column: str = cfg_preprop["dttm_column"]
mcc_column: str = cfg_preprop["mcc_column"]
amt_column: str = cfg_preprop["amt_column"]

In [9]:
# Map the dataset-specific column names onto the fixed names used downstream.
canonical_names = {
    user_column: "user_id",
    mcc_column: "mcc_code",
    amt_column: "amount",
    dttm_column: "timestamp",
}
df.rename(columns=canonical_names, inplace=True)

In [11]:
# Turn the flat transaction table into per-user records.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="none",
    cols_category=["mcc_code"],
    # keep column with local targets
    cols_numerical=["amount", "local_label"],
    return_records=True
)

dataset = preprocessor.fit_transform(df)

# Fix the seed so the train/validation split -- and therefore every metric
# reported below -- is reproducible across kernel restarts.
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [13]:
# Build the original CoLES datasets (used for CoLES training) from both splits.
coles_dataset_cfg = cfg_model["dataset"]
train_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=train)
val_data: CustomColesDataset = instantiate(coles_dataset_cfg, data=val)

# Wrap both datasets into a single datamodule configured by the model config.
train_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"], train_data=train_data, valid_data=val_data
)

# Initialize and train CoLES model

In [66]:
# Instantiate the CoLES model described by the `model` node of the model config.
model: CustomCoLES = instantiate(cfg_model["model"])

In [17]:
# Everything the CoLES trainer needs lives under the `trainer_coles` config node.
trainer_cfg = cfg_model["trainer_coles"]

# Both callbacks monitor the model's own metric, where larger values are better.
model_checkpoint: ModelCheckpoint = instantiate(
    trainer_cfg["checkpoint_callback"], monitor=model.metric_name, mode="max"
)
early_stopping: EarlyStopping = instantiate(
    trainer_cfg["early_stopping"], monitor=model.metric_name, mode="max"
)

logger: TensorBoardLogger = instantiate(trainer_cfg["logger"])

trainer: Trainer = instantiate(
    trainer_cfg["trainer"],
    callbacks=[model_checkpoint, early_stopping],
    logger=logger,
)

# Train the CoLES model on the datamodule built above.
trainer.fit(model, train_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 18.5 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
18.5 K    Trainable params
0         Non-trainable params
18.5 K    Total params
0.074     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved. New best score: 0.033


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.034 >= min_delta = 0.01. New best score: 0.067


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.026 >= min_delta = 0.01. New best score: 0.094


Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.017 >= min_delta = 0.01. New best score: 0.110


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.012 >= min_delta = 0.01. New best score: 0.122


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric recall_top_k improved by 0.010 >= min_delta = 0.01. New best score: 0.133


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric recall_top_k did not improve in the last 5 records. Best score: 0.133. Signaling Trainer to stop.


# Local validation pipeline

In [49]:
# Custom datasets for local validation, built from the same train/val splits.
local_dataset_cfg = cfg_model["validation_dataset"]
train_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=train)
val_data_local: CustomColesValidationDataset = instantiate(local_dataset_cfg, data=val)

# Keep valid_batch_size = 1 (all slices of one user in one batch);
# a batch size > 1 may be used while training the validation model.
val_datamodule: PtlsDataModule = instantiate(
    cfg_model["datamodule"],
    train_data=train_data_local,
    valid_data=val_data_local,
    train_batch_size=64,
    valid_batch_size=1,
)

In [42]:
# Pull one (batch, local labels) pair from the validation dataloader.
valid_batch, local_labels = next(iter(val_datamodule.val_dataloader()))

In [46]:
# Inspect the `timestamp` feature of the sampled validation batch.
valid_batch.payload["timestamp"]

torch.Size([1688, 40])

In [26]:
# Original CoLES batch format plus local targets; the local targets are
# presumably not passed to the CoLES model itself (TODO: check).
# Each payload tensor has shape (num_slices, seq_len); per the original note,
# num_slices = num_timesteps - seq_len.

valid_batch.payload['timestamp'].shape, valid_batch.payload

(torch.Size([1005, 40]),
 {'timestamp': tensor([[  0,   3,   3,  ...,  25,  25,  25],
          [  3,   3,   5,  ...,  25,  25,  26],
          [  3,   5,   6,  ...,  25,  26,  29],
          ...,
          [696, 698, 704,  ..., 727, 727, 728],
          [698, 704, 704,  ..., 727, 728, 728],
          [704, 704, 705,  ..., 728, 728, 729]]),
  'event_time': tensor([[  0,   3,   3,  ...,  25,  25,  25],
          [  3,   3,   5,  ...,  25,  25,  26],
          [  3,   5,   6,  ...,  25,  26,  29],
          ...,
          [696, 698, 704,  ..., 727, 727, 728],
          [698, 704, 704,  ..., 727, 728, 728],
          [704, 704, 705,  ..., 728, 728, 729]]),
  'mcc_code': tensor([[ 8,  6,  8,  ...,  5,  6,  2],
          [ 6,  8,  8,  ...,  6,  2,  6],
          [ 8,  8, 21,  ...,  2,  6,  6],
          ...,
          [ 2, 42,  1,  ...,  6,  2, 38],
          [42,  1,  2,  ...,  2, 38,  2],
          [ 1,  2,  1,  ..., 38,  2, 22]]),
  'amount': tensor([[  18.8410,  303.1650,   57.4410,  ..

In [28]:
# Local labels returned together with `valid_batch` (fake, constant labels for Age).
local_labels.shape

torch.Size([1005])

## Validation module

In [67]:
from torchmetrics.classification import BinaryF1Score

class LocalValidationModel(pl.LightningModule):
    """Binary classifier trained on top of a frozen backbone encoder.

    The backbone turns a batch into embeddings; only the small prediction head
    defined here is optimised, using binary cross-entropy against the labels
    that accompany every batch.

    Args:
        backbone: module producing embeddings whose last dimension is
            ``backbone_embd_size``; all of its parameters are frozen here.
        backbone_embd_size: width of the embeddings returned by ``backbone``.
        hidden_size: width of the hidden layer of the prediction head.
        learning_rate: learning rate of the Adam optimiser.
    """

    def __init__(
        self,
        backbone: nn.Module,
        backbone_embd_size: int,
        hidden_size: int,
        learning_rate: float = 1e-3,
    ) -> None:
        super().__init__()

        self.backbone = backbone
        self.backbone.to(self.device)

        # Freeze the backbone: only the prediction head is trained.
        # NOTE(review): freezing parameters does not switch the backbone to
        # eval mode; if it contains dropout / batch-norm layers, consider
        # calling `.eval()` on it as well -- confirm against the backbone.
        for param in self.backbone.parameters():
            param.requires_grad = False

        self.pred_head = nn.Sequential(
            nn.Linear(backbone_embd_size, hidden_size),
            nn.ReLU(),
            # The second layer consumes the hidden representation, so its input
            # width is `hidden_size` (not the backbone embedding width).
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

        # BCE loss for seq2seq binary classification
        self.loss = nn.BCELoss()

        self.lr = learning_rate

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return one probability per row of the backbone output.

        ``inputs`` is whatever batch object the backbone accepts.
        """
        out = self.backbone(inputs)
        # Drop only the trailing unit dimension so that a batch with a single
        # row stays 1-D and still matches the shape of its labels.
        return self.pred_head(out).squeeze(-1)

    @staticmethod
    def _accuracy(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Share of predictions equal to ``labels`` after thresholding at 0.5."""
        return ((pred > 0.5).long() == labels).float().mean()

    @staticmethod
    def _f1(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Binary F1 score of probabilities ``pred`` against ``labels``."""
        metric = BinaryF1Score().to(pred.device)
        return metric(pred, labels.long())

    def _loss_and_accuracy(self, batch):
        """Run the model on an ``(inputs, labels)`` batch; return ``(loss, accuracy)``."""
        inputs, labels = batch
        pred = self(inputs)
        return self.loss(pred, labels.float()), self._accuracy(pred, labels)

    def training_step(self, batch: torch.Tensor, batch_idx: int):
        """Compute and log the training loss and accuracy for one batch.

        ``batch`` is an ``(inputs, labels)`` pair.
        """
        train_loss, train_accuracy = self._loss_and_accuracy(batch)

        self.log("train_loss", train_loss, prog_bar=True)
        self.log("train_acc", train_accuracy, prog_bar=True)

        return {"loss": train_loss, "acc": train_accuracy}

    def validation_step(self, batch: torch.Tensor, batch_idx: int):
        """Compute and log the validation loss and accuracy for one batch.

        ``batch`` is an ``(inputs, labels)`` pair.
        """
        val_loss, val_accuracy = self._loss_and_accuracy(batch)

        self.log("val_loss", val_loss, prog_bar=True)
        self.log("val_acc", val_accuracy, prog_bar=True)

        return {"loss": val_loss, "acc": val_accuracy}

    def test_step(self, batch: torch.Tensor, batch_idx: int):
        """Predict one test batch and return predictions with per-batch metrics.

        Nothing is logged here; the metrics over the whole test set are logged
        in ``test_epoch_end``.
        """
        inputs, labels = batch
        pred = self(inputs)

        acc_score = self._accuracy(pred, labels).item()
        f1_score = self._f1(pred, labels).item()

        return {"pred": pred, "labels": labels, "acc": acc_score, "f1": f1_score}

    def test_epoch_end(self, outputs):
        """Log accuracy and F1 computed over all test predictions at once."""
        if not outputs:
            return

        # Batches may differ in length, so concatenate rather than stack.
        all_preds = torch.cat([batch_out["pred"].reshape(-1) for batch_out in outputs])
        all_labels = torch.cat([batch_out["labels"].reshape(-1) for batch_out in outputs])

        self.log("Test acc", self._accuracy(all_preds, all_labels).item())
        self.log("Test f1_score", self._f1(all_preds, all_labels).item())

    def configure_optimizers(self) -> torch.optim.Optimizer:
        """Adam over the prediction head only (the backbone is frozen)."""
        return torch.optim.Adam(self.pred_head.parameters(), lr=self.lr)

In [68]:
# Wrap the CoLES model as the frozen backbone of the local validation model.
# 32 matches the width of the CoLES embeddings printed by the shape check below.
valid_model = LocalValidationModel(
    backbone=model,
    backbone_embd_size=32,
    hidden_size=32,
)

In [69]:
# Shape check: backbone embeddings -> head predictions -> true local labels.
backbone_out = valid_model.backbone(valid_batch)
print("CoLES embeddings:", backbone_out.shape)

# The full model should return one prediction per row of the embeddings.
pred_out = valid_model(valid_batch)
print("Predicted labels:", pred_out.shape)

print("True local labels:", local_labels.shape)

CoLES embeddings: torch.Size([1688, 32])
Predicted labels: torch.Size([1688])
True local labels: torch.Size([1688])


In [None]:
# Show the structure of the validation model (backbone + prediction head).
valid_model

In [70]:
# Train the prediction head for a single epoch on one GPU.
val_trainer = Trainer(accelerator="gpu", devices=1, max_epochs=1)

val_trainer.fit(valid_model, val_datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 18.5 K
1 | pred_head | Sequential  | 1.1 K 
2 | loss      | BCELoss     | 0     
------------------------------------------
1.1 K     Trainable params
18.5 K    Non-trainable params
19.6 K    Total params
0.078     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

KeyboardInterrupt: 