In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

# Resolve the parent of the current working directory only once per kernel.
# Recomputing it after the chdir below would climb one directory higher on
# every re-run of this cell and add a wrong entry to sys.path.
if "dir1" not in globals():
    dir2 = os.path.abspath('')    # working directory when this cell first runs
    dir1 = os.path.dirname(dir2)  # its parent directory

# Put the parent directory on the import path (presumably so that the
# project-local `src` package resolves -- confirm against the repo layout).
if dir1 not in sys.path:
    sys.path.append(dir1)

# Switch to the parent directory. An absolute target makes this idempotent,
# unlike a relative os.chdir('..').
os.chdir(dir1)

%load_ext autoreload
%autoreload

In [3]:
from pathlib import Path

import pandas as pd

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule
from ptls.frames.coles.split_strategy import NoSplit
from ptls.frames.coles import ColesDataset

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger, CometLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomCoLES
from src.local_validation import LocalValidationModel
from src.custom_preprocessing import CustomDatetimeNormalization


In [4]:
# Dataset name; it selects which Hydra config file gets composed.
DATASET = "churn"

# Compose the config named "config_<DATASET>" from the ../config directory.
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name=f"config_{DATASET}")

# Shortcuts to the two config sections used further down.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [7]:
# Read the training table from the parquet file named in the dataset config.
dataset_cfg = cfg["dataset"]
train_path = Path(dataset_cfg["dir_path"]) / dataset_cfg["train_file_name"]
df = pd.read_parquet(train_path)
df

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
...,...,...,...,...,...,...,...,...
490508,10215,37,2016-12-17 00:00:00,2110.9,0,0,1,0
490509,10215,1,2016-12-16 00:00:00,31.0,0,0,0,0
490510,10215,1,2016-12-06 00:00:00,182.0,0,0,0,0
490511,10215,2,2016-12-06 13:39:49,5000.0,0,0,0,0


In [8]:
# Earliest timestamp in the table, as integer epoch seconds; passed to the
# custom event-time transformer below.
min_timestamp = int(df["timestamp"].min().timestamp())

datetime_transformer = CustomDatetimeNormalization(
    min_timestamp=min_timestamp, col_name_original="timestamp"
)

# Per-user preprocessing: "mcc_code" is encoded as a categorical feature,
# "amount" is kept numerical, and "global_target" is taken from the first item.
# NOTE(review): return_records=True presumably yields a list of per-user
# records rather than a DataFrame -- confirm against the ptls docs.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time=datetime_transformer,
    cols_category=["mcc_code"],
    category_transformation="frequency",
    cols_numerical=["amount"],
    cols_first_item=["global_target"],
    return_records=True,
)

dataset = preprocessor.fit_transform(df)

# Pin the split seed: without random_state the train/val partition changes on
# every run (seed_everything is only called in a later cell), which makes the
# validation metrics non-reproducible.
train, val = train_test_split(dataset, test_size=.2, random_state=42)

In [18]:
# Seed all RNGs before the backbone is built. With the checkpoint loading below
# commented out, the backbone keeps its random initialisation, so an unseeded
# init would give different weights (and different metrics) on every run.
seed_everything(42)

# Build the backbone from the "model" entry of the model config section.
model: CustomCoLES = instantiate(cfg_model["model"])
# NOTE(review): presumably switches the sequence encoder to return one reduced
# embedding per sequence -- confirm against the ptls seq_encoder docs.
model.seq_encoder.is_reduce_sequence = True

# Optional: restore pretrained weights instead of evaluating a randomly
# initialised backbone.
#model.seq_encoder.load_state_dict(torch.load("saved_models/churn/coles/coles_best_state_dict_100_v2.pth"))
#model.load_state_dict(torch.load("coles_best_state_dict.pth"))

# Validation

## Use datasets with no splits for the new validation procedure

In [19]:
# Minimum sequence length passed to SeqLenFilter, and the name of the time column.
min_len = 32
col_time = "event_time"

# Wrap the train/val records in memory-mapped datasets, each with its own
# length filter.
data_train, data_val = (
    MemoryMapDataset(records, [SeqLenFilter(min_len)]) for records in (train, val)
)

# CoLES datasets that use the NoSplit strategy.
train_data_no_split, val_data_no_split = (
    ColesDataset(data, NoSplit(), col_time) for data in (data_train, data_val)
)

# Note: the test loader reuses the validation dataset.
datamodule_no_split = PtlsDataModule(
    train_data=train_data_no_split,
    valid_data=val_data_no_split,
    test_data=val_data_no_split,
    train_batch_size=4,
    valid_batch_size=4,
    test_batch_size=4,
)

# New validation model

In [20]:
seed_everything(42)

# Size of the backbone embedding and of the prediction head's hidden layer,
# as passed to LocalValidationModel below.
EMBD_SIZE = 1024
HIDDEN_SIZE = 32

# 'val_mode' options:
#     * 'downstream' - using local targets (e.g. 'churn_target' or 'default_target')
#     * 'return_time' - predicting return time (COTIC-style) - NOT READY YET
#     * 'event_type' - predicting next event type (COTIC-style)

valid_model = LocalValidationModel(
    backbone=model,
    backbone_embd_size=EMBD_SIZE,
    hidden_size=HIDDEN_SIZE,
    val_mode="downstream",
    num_types=345, # = num_types in config + 1 (add 0 as padding type)
    learning_rate=1e-3,
    backbone_output_type="tensor",
    backbone_embd_mode="seq2vec",
    # NOTE(review): equals min_len used for SeqLenFilter in the dataset cell;
    # presumably the window must not exceed the shortest sequence -- confirm.
    seq_len=32,
    mask_col="mcc_code",
    local_label_col="churn_target"
)

Global seed set to 42


In [21]:
# Smoke test: take one training batch and push it through the validation model.
train_loader = datamodule_no_split.train_dataloader()
batch, labels = next(iter(train_loader))

print("inputs event time:", batch.payload["event_time"].shape)

preds, mask = valid_model(batch)
target = valid_model._get_validation_labels(batch)

# Print the shapes of predictions, mask and targets for comparison.
for name, tensor in (("preds", preds), ("mask", mask), ("target", target)):
    print(f"{name}:", tensor.shape)

inputs event time: torch.Size([4, 100])
preds: torch.Size([4, 69, 1])
mask: torch.Size([4, 69])
target: torch.Size([4, 69])


In [22]:
# Optional Comet logging. The API key must come from the environment and never
# be committed with the notebook; the key that was previously hardcoded here
# should be treated as leaked and revoked.
#comet_logger = CometLogger(
#    api_key=os.environ["COMET_API_KEY"],
#    project_name="macro-micro-coles",
#    workspace="stalex2902",
#    experiment_name="New validation CoLES Churn event_type, 100 types",
#    display_summary_level=0
#)

# Fit on the first GPU for 10 epochs. The printed model summary shows only the
# prediction head as trainable; the backbone parameters are non-trainable.
val_trainer = Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=10,
    #logger=comet_logger
)

val_trainer.fit(valid_model, datamodule_no_split)
# The datamodule's test loader reuses the validation dataset, so this reports
# metrics on the same data that was used for validation during fitting.
val_trainer.test(valid_model, datamodule_no_split)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /app/macro_micro_coles-lgbm_validation/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | backbone  | CustomCoLES | 4.3 M 
1 | pred_head | Sequential  | 32.8 K
2 | loss      | BCELoss     | 0     
------------------------------------------
32.8 K    Trainable params
4.3 M     Non-trainable params
4.3 M     Total params
17.388    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]