In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from pathlib import Path

import pandas as pd

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger, CometLogger

from sklearn.model_selection import train_test_split

from src.local_validation import LocalValidationModel

from src.utils.logging_utils import get_logger
from src.utils.data_utils.prepare_dataset import prepare_dataset


In [3]:
from src.pooling import PoolingModel

# Example usage on the churn dataset (attention pooling model)


In [4]:
# Name of the dataset; selects the Hydra config file "config_<DATASET>".
DATASET = "churn"

# Compose the configuration for the selected dataset.
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name=f"config_{DATASET}")

# Shortcuts to the three top-level config sections used throughout the notebook.
cfg_preprop, cfg_model, cfg_validation = (
    cfg[section] for section in ("dataset", "model", "validation")
)

In [5]:
# Directory holding the preprocessed data (cfg_preprop is the "dataset" config section).
cfg_preprop["dir_path"]

'data/preprocessed_new'

In [6]:
# Load the training table referenced by the dataset config and display it.
data_dir = Path(cfg["dataset"]["dir_path"])
train_file_name = cfg["dataset"]["train_file_name"]
df = pd.read_parquet(data_dir / train_file_name)
df

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
...,...,...,...,...,...,...,...,...
490508,10215,37,2016-12-17 00:00:00,2110.9,0,0,1,0
490509,10215,1,2016-12-16 00:00:00,31.0,0,0,0,0
490510,10215,1,2016-12-06 00:00:00,182.0,0,0,0,0
490511,10215,2,2016-12-06 13:39:49,5000.0,0,0,0,0


In [7]:
# Preprocess the data, then split it into train / validation / test parts.

logger = get_logger(name=__name__)
dataset = prepare_dataset(cfg_preprop, logger)

coles_cfg = cfg_preprop["coles"]
valid_size = coles_cfg["valid_size"]
test_size = coles_cfg["test_size"]
holdout_size = valid_size + test_size

# First split: hold out the combined validation + test share.
train, val_test = train_test_split(
    dataset,
    test_size=holdout_size,
    random_state=coles_cfg["random_state"],
)

# Second split: divide the hold-out so that the test part keeps (roughly)
# its configured share of the full dataset.
val, test = train_test_split(
    val_test,
    test_size=test_size / holdout_size,
    random_state=coles_cfg["random_state"],
)

In [8]:
# Inspect which fields a single validation record carries.
first_val_record = val[0]
first_val_record.keys()

dict_keys(['user_id', 'global_target', 'holiday_target', 'weekend_target', 'churn_target', 'event_time', 'mcc_code', 'amount'])

In [9]:
# Init the backbone CoLES sequence encoder and load its pretrained weights.

sequence_encoder = instantiate(cfg_validation["sequence_encoder"])

# map_location="cpu" lets the checkpoint load even when it was saved from a GPU
# that is not available on this machine. load_state_dict copies the values into
# the module's existing parameters, so this does not change where the model lives.
backbone_state_dict = torch.load(cfg_validation["path_to_state_dict"], map_location="cpu")
sequence_encoder.load_state_dict(backbone_state_dict)

<All keys matched successfully>

In [10]:
# Wrap each split for the validation procedure (no additional sub-splitting here).

# Value handed to SeqLenFilter as its first positional argument.
# NOTE(review): presumably the minimum sequence length — confirm against ptls.
seq_len_threshold = cfg_validation["model"]["seq_len"]
dataset_cfg = cfg_validation["dataset"]

# One in-memory dataset per split, each with its own length filter instance.
data_train, data_val, data_test = (
    MemoryMapDataset(part, [SeqLenFilter(seq_len_threshold)])
    for part in (train, val, test)
)

train_dataset: ColesDataset = instantiate(dataset_cfg, data=data_train)
val_dataset: ColesDataset = instantiate(dataset_cfg, data=data_val)
test_dataset: ColesDataset = instantiate(dataset_cfg, data=data_test)

# Bundle the three datasets into a single data module configured by Hydra.
datamodule: PtlsDataModule = instantiate(
    cfg_validation["datamodule"],
    train_data=train_dataset,
    valid_data=val_dataset,
    test_data=test_dataset,
)

In [11]:
# Build the pooling model (attention pooling) on top of the loaded backbone.
# Construction is slow: the saved output shows roughly four minutes for 1000 iterations.

pooling_kwargs = dict(
    train_data=train,
    backbone=sequence_encoder,
    backbone_embd_size=1024,  # matches the backbone output width printed in the shape check below
    max_users_in_train_dataloader=1000,
    pooling_type="attention",
    min_seq_length=cfg_model["dataset"]["min_len"],
)
pooling_model = PoolingModel(**pooling_kwargs)

1000it [04:22,  3.81it/s]


In [13]:
# Sanity check: compare embedding shapes on a single training batch.
train_loader = datamodule.train_dataloader()
batch, labels = next(iter(train_loader))

# Raw backbone (CoLES) embeddings.
backbone_out = pooling_model.backbone(batch)
print(f"COLES embeddings shape: {backbone_out.shape}")

# Embeddings produced by the full pooling model.
pred_out = pooling_model(batch)
print(f"Pooling embeddings shape: {pred_out.shape}")

print(f"True local labels shape: {labels.shape}")

COLES embeddings shape: torch.Size([4, 1024])
Pooling embeddings shape: torch.Size([4, 2048])
True local labels shape: torch.Size([4])


In [15]:
# Validation: build the downstream validation model on top of the pooling backbone.

seed_everything(42)

# The validation model needs the width of the pooled embedding.
pooled_emb_dim = pooling_model.get_emb_dim()
valid_model: LocalValidationModel = instantiate(
    cfg_validation["model"],
    backbone=pooling_model,
    backbone_embd_size=pooled_emb_dim,
    val_mode="downstream",
)

Global seed set to 42


In [16]:
# Sanity check: shapes of the inputs, predictions, mask and targets for one batch.
event_time = batch.payload["event_time"]
print(f"inputs event time: {event_time.shape}")

preds, mask = valid_model(batch)
# NOTE(review): relies on a private helper of LocalValidationModel.
target = valid_model._get_validation_labels(batch)

print(f"preds: {preds.shape}")
print(f"mask: {mask.shape}")
print(f"target: {target.shape}")

inputs event time: torch.Size([4, 188])
preds: torch.Size([4, 157, 1])
mask: torch.Size([4, 157])
target: torch.Size([4, 157])


In [17]:
# Train the validation model, then report its metrics on the test split.
val_trainer: Trainer = instantiate(cfg_validation["trainer"])

val_trainer.fit(valid_model, datamodule)

# Keep the metrics in a variable and display them as the cell result.
test_metrics = val_trainer.test(valid_model, datamodule)
test_metrics

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | backbone  | PoolingModel | 4.3 M 
1 | pred_head | Sequential   | 65.6 K
2 | loss      | BCELoss      | 0     
-------------------------------------------
65.6 K    Trainable params
4.3 M     Non-trainable params
4.4 M     Total params
17.519    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.6702052354812622
        Accuracy            0.7136972546577454
         F1Score            0.16452693939208984
         PR-AUC             0.44333720207214355
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.6702052354812622,
  'PR-AUC': 0.44333720207214355,
  'Accuracy': 0.7136972546577454,
  'F1Score': 0.16452693939208984}]

# Compare with the sequence encoder alone (no pooling)

In [18]:
# Baseline: the same validation procedure with the bare sequence encoder as backbone.
# backbone_embd_size is not overridden here, so the value from the config is used.
seed_everything(42)

valid_model: LocalValidationModel = instantiate(
    cfg_validation["model"], backbone=sequence_encoder, val_mode="downstream"
)
val_trainer: Trainer = instantiate(cfg_validation["trainer"])

val_trainer.fit(valid_model, datamodule)

# Keep the metrics in a variable and display them as the cell result.
test_metrics = val_trainer.test(valid_model, datamodule)
test_metrics

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type          | Params
--------------------------------------------
0 | backbone  | RnnSeqEncoder | 4.3 M 
1 | pred_head | Sequential    | 32.8 K
2 | loss      | BCELoss       | 0     
--------------------------------------------
32.8 K    Trainable params
4.3 M     Non-trainable params
4.3 M     Total params
17.388    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.5971648693084717
        Accuracy            0.7087092995643616
         F1Score                    0.0
         PR-AUC             0.37126386165618896
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.5971648693084717,
  'PR-AUC': 0.37126386165618896,
  'Accuracy': 0.7087092995643616,
  'F1Score': 0.0}]

# Compare with mean pooling

In [19]:
# Switch the existing pooling model to mean pooling by changing its attribute in place
# (the model is not rebuilt).
# NOTE(review): this mutates the object used by earlier cells; re-running those cells
# afterwards will no longer use "attention" — confirm this is intended.
pooling_model.pooling_type = "mean"

In [20]:
# Validation with the pooling model as backbone, using whatever pooling_type is currently set on it.

seed_everything(42)

# Re-query the embedding width from the pooling model before building the validation model.
pooled_emb_dim = pooling_model.get_emb_dim()
valid_model: LocalValidationModel = instantiate(
    cfg_validation["model"],
    backbone=pooling_model,
    backbone_embd_size=pooled_emb_dim,
    val_mode="downstream",
)
val_trainer: Trainer = instantiate(cfg_validation["trainer"])

val_trainer.fit(valid_model, datamodule)

# Keep the metrics in a variable and display them as the cell result.
test_metrics = val_trainer.test(valid_model, datamodule)
test_metrics

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | backbone  | PoolingModel | 4.3 M 
1 | pred_head | Sequential   | 65.6 K
2 | loss      | BCELoss      | 0     
-------------------------------------------
65.6 K    Trainable params
4.3 M     Non-trainable params
4.4 M     Total params
17.519    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.6738234758377075
        Accuracy            0.7121840715408325
         F1Score           0.044647008180618286
         PR-AUC             0.45130330324172974
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.6738234758377075,
  'PR-AUC': 0.45130330324172974,
  'Accuracy': 0.7121840715408325,
  'F1Score': 0.044647008180618286}]

# Compare with max pooling

In [21]:
# Switch the existing pooling model to max pooling by changing its attribute in place
# (the model is not rebuilt).
# NOTE(review): this mutates the object used by earlier cells; re-running those cells
# afterwards will use "max" instead of their original pooling type — confirm this is intended.
pooling_model.pooling_type = "max"

In [22]:
# Validation with the pooling model as backbone, using whatever pooling_type is currently set on it.

seed_everything(42)

# Re-query the embedding width from the pooling model before building the validation model.
pooled_emb_dim = pooling_model.get_emb_dim()
valid_model: LocalValidationModel = instantiate(
    cfg_validation["model"],
    backbone=pooling_model,
    backbone_embd_size=pooled_emb_dim,
    val_mode="downstream",
)
val_trainer: Trainer = instantiate(cfg_validation["trainer"])

val_trainer.fit(valid_model, datamodule)

# Keep the metrics in a variable and display them as the cell result.
test_metrics = val_trainer.test(valid_model, datamodule)
test_metrics

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | backbone  | PoolingModel | 4.3 M 
1 | pred_head | Sequential   | 65.6 K
2 | loss      | BCELoss      | 0     
-------------------------------------------
65.6 K    Trainable params
4.3 M     Non-trainable params
4.4 M     Total params
17.519    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.6859035491943359
        Accuracy            0.7147060632705688
         F1Score            0.06997350603342056
         PR-AUC             0.47345176339149475
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.6859035491943359,
  'PR-AUC': 0.47345176339149475,
  'Accuracy': 0.7147060632705688,
  'F1Score': 0.06997350603342056}]