In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('/app')

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger, CometLogger

from sklearn.model_selection import train_test_split

from src.local_validation import LocalValidationModel

from src.utils.logging_utils import get_logger
from src.preprocessing import preprocess


## Read data

In [4]:
# Dataset key; selects the Hydra config named "config_<DATASET>".
DATASET = "churn"

# Compose the configuration inside the `initialize` context.
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name=f"config_{DATASET}")

# Shortcuts to the top-level config sections used below.
cfg_preprop, cfg_dataset, cfg_validation = (
    cfg["preprocessing"],
    cfg["dataset"],
    cfg["validation"],
)
# cfg_model = cfg["model"]

In [5]:
# Load the raw source table whose path is stored in the preprocessing config
# section, then display it as the cell output.
df = pd.read_parquet(Path(cfg["preprocessing"]["source"]))
df

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
...,...,...,...,...,...,...,...,...
490508,10215,37,2016-12-17 00:00:00,2110.9,0,0,1,0
490509,10215,1,2016-12-16 00:00:00,31.0,0,0,0,0
490510,10215,1,2016-12-06 00:00:00,182.0,0,0,0,0
490511,10215,2,2016-12-06 13:39:49,5000.0,0,0,0,0


## Preprocess and split data

In [6]:
# Run the project preprocessing step on the preprocessing config section;
# it returns three partitions, unpacked here as train / val / test.
train, val, test = preprocess(cfg_preprop)

[Memory]0.0s, 0.0min    : Loading _preprocess...
__________________________________________preprocess cache loaded - 5.6s, 0.1min


# Validation

## Use datasets with no splits for the new validation procedure

In [7]:
import numpy as np
from ptls.frames.coles.split_strategy import AbsSplit
class SlidingSampler(AbsSplit):
    """Split a sequence into fixed-length sliding windows of positional indices.

    Args:
        slide_len: number of positions in every window.
        stride: step between the start positions of consecutive windows.
        max_seq_len: optional cap; when set (truthy), only the first
            ``max_seq_len`` positions of a longer sequence are windowed.
        **kwargs: accepted and ignored.

    Raises:
        ValueError: if ``slide_len`` or ``stride`` is smaller than 1.
    """

    def __init__(self, slide_len, stride=1, max_seq_len=None, **kwargs):
        # Fail early: a zero stride only breaks later inside slicing, and a
        # negative stride or non-positive window length silently yields
        # empty results.
        if slide_len < 1:
            raise ValueError(f"slide_len must be >= 1, got {slide_len}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.slide_len = slide_len
        self.stride = stride
        self.max_seq_len = max_seq_len

    def split(self, dates):
        """Return a list of index arrays, one per window.

        Args:
            dates: array-like object; only ``dates.shape[0]`` (the sequence
                length) is read.

        Returns:
            A list of ``np.ndarray`` index arrays. A sequence that is not
            longer than ``slide_len`` (before or after the ``max_seq_len``
            cap is applied) yields a single array with all of its positions.
        """
        date_len = dates.shape[0]
        date_range = np.arange(date_len)

        # A short sequence is returned whole as its only window.
        if date_len <= self.slide_len:
            return [date_range]

        if self.max_seq_len and date_len > self.max_seq_len:
            # TODO (kept from the original): sample a random sub-sequence of
            # length max_seq_len instead of always taking the prefix.
            # Keep exactly max_seq_len positions so that date_range and
            # date_len stay consistent.
            date_range = date_range[:self.max_seq_len]
            date_len = self.max_seq_len

            # The cap may be no longer than one window. Without this guard the
            # stop of the slice below becomes non-positive and produces either
            # no windows at all or windows shorter than slide_len.
            if date_len <= self.slide_len:
                return [date_range]

        # Starting positions for all the windows.
        start_pos = date_range[0 : date_len - self.slide_len + 1 : self.stride]
        return [date_range[s : s + self.slide_len] for s in start_pos]

In [8]:
# Sliding windows of 32 positions with the default stride of 1; for sequences
# longer than 500 positions only the leading 500 are covered by windows.
splitter = SlidingSampler(slide_len=32, max_seq_len=500)

In [9]:
# One MemoryMapDataset per partition, each with its own SeqLenFilter built
# from the `seq_len` entry of the validation model config.
data_train, data_val, data_test = (
    MemoryMapDataset(part, [SeqLenFilter(cfg_validation["model"]["seq_len"])])
    for part in (train, val, test)
)

# Wrap every partition with the dataset described in the validation config;
# all three share the same splitter instance.
train_dataset: ColesDataset
val_dataset: ColesDataset
test_dataset: ColesDataset
train_dataset, val_dataset, test_dataset = (
    instantiate(cfg_validation["dataset"], data=part, splitter=splitter)
    for part in (data_train, data_val, data_test)
)


# Bundle the three datasets into the datamodule described in the config.
datamodule: PtlsDataModule = instantiate(
    cfg_validation["datamodule"],
    train_data=train_dataset,
    valid_data=val_dataset,
    test_data=test_dataset,
)

# New validation model

In [10]:
# Pull a single item from the training dataloader; it is unpacked as a pair
# and the second element is discarded.
batch, _ = next(iter(datamodule.train_dataloader()))

In [11]:
from ptls.nn.seq_step import LastStepEncoder
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
import torch.nn as nn

In [12]:
from ptls.data_load.padded_batch import PaddedBatch

class IdentitySeq(AbsSeqEncoder):
    """Sequence encoder that applies no transformation of its own.

    When ``is_reduce_sequence`` is set, the input batch is passed through
    ``LastStepEncoder``; otherwise the input is returned unchanged.
    ``input_size`` is accepted but not used.
    """

    def __init__(self, input_size=None, is_reduce_sequence=True):
        super().__init__(is_reduce_sequence=is_reduce_sequence)
        self.reducer = LastStepEncoder()

    def forward(self, x: PaddedBatch):
        """Return the reduced batch when reduction is enabled, else ``x`` itself."""
        return self.reducer(x) if self.is_reduce_sequence else x

class IdentitySeqEncoder(torch.nn.Module):
    """Container that applies ``trx_encoder`` and then an ``IdentitySeq``.

    Args:
        trx_encoder: module called on the raw input batch in ``forward``.
        is_reduce_sequence: forwarded to the inner ``IdentitySeq``; selects
            whether its output is reduced or returned as a full sequence.
    """

    def __init__(self,
                 trx_encoder=None,
                 is_reduce_sequence=True):
        super().__init__()
        self.trx_encoder = trx_encoder
        # Pass the flag by keyword: the first positional parameter of
        # IdentitySeq is `input_size`, so a positional argument never reaches
        # `is_reduce_sequence` and the inner encoder would always reduce.
        self.seq_encoder = IdentitySeq(is_reduce_sequence=is_reduce_sequence)

    def forward(self, x):
        """Encode ``x`` with ``trx_encoder`` and pass the result to ``seq_encoder``."""
        z_trx = self.trx_encoder(x)
        out = self.seq_encoder(z_trx)
        return out

    @property
    def is_reduce_sequence(self):
        """Reduction flag of the inner sequence encoder (read-only here)."""
        return self.seq_encoder.is_reduce_sequence



In [13]:
# Seed the global random number generators before the model is built.
seed_everything(42)

# Build the backbone from the `sequence_encoder` entry of the validation config.
sequence_encoder = instantiate(cfg_validation["sequence_encoder"])
#sequence_encoder = sequence_encoder.trx_encoder
# NOTE(review): loading the saved state dict is commented out, so the backbone
# keeps whatever weights `instantiate` produced — confirm this is intended.
#sequence_encoder.load_state_dict(torch.load(cfg_validation["path_to_state_dict"]))


#my_seq_enc = IdentitySeqEncoder(sequence_encoder.trx_encoder)
# NOTE(review): 'donwstream' in the note below looks like a misspelling of
# 'downstream' — check the spelling LocalValidationModel expects before editing.
"""
'val_mode' options:
    * 'donwstream' - using local targets (e.g. 'churn_target' or 'default_target')
    * 'return_time' - predicting return time (COTIC-style) - NOT READY YET
    * 'event_type' - predicting next event type (COTIC-style)
"""

# Build the validation model from the config, injecting the backbone created above.
valid_model: LocalValidationModel = instantiate(
    cfg_validation["model"],
    backbone=sequence_encoder 
)

Global seed set to 42


In [14]:
# Display the module tree of the assembled validation model.
valid_model

LocalValidationModel(
  (backbone): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (mcc_code): NoisyEmbedding(
          344, 24, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (numeric_values): ModuleDict(
        (amount): IdentityScaler()
      )
      (numerical_batch_norm): RBatchNormWithLens(
        (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
      )
    )
    (seq_encoder): RnnEncoder(
      (rnn): LSTM(25, 1024, batch_first=True)
      (reducer): LastStepEncoder()
    )
  )
  (pred_head): Linear(in_features=1024, out_features=344, bias=True)
  (loss): CrossEntropyLoss()
  (train_metric): MulticlassAccuracy()
  (test_metrics): MetricCollection(
    (AUROC): MulticlassAUROC()
    (Accuracy): MulticlassAccuracy()
    (F1Score): MulticlassF1Score()
    (PR-AUC): MulticlassAveragePrecision()
  )
  (val_metric): MulticlassAccuracy()
)

In [15]:
# Override the epoch budget in the trainer config (mutates cfg_validation in place).
cfg_validation["trainer"]["max_epochs"] = 500

# cfg_validation["trainer"]["accelerator"] = "cpu"
# cfg_validation["trainer"]["devices"] = 1

# Build the trainer without a logger. fast_dev_run=10 makes this a smoke run:
# the recorded trainer log reports a train/val/test/predict loop over 10 batches.
# NOTE(review): with fast_dev_run set, the 500-epoch budget above presumably
# has no effect — confirm against the Lightning Trainer documentation.
val_trainer: Trainer = instantiate(cfg_validation["trainer"], logger=False, fast_dev_run=10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 10 batch(es).
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [16]:
# Train the validation model on the datamodule, then evaluate it with the
# trainer's test loop; the value returned by `test` is the cell output.
val_trainer.fit(valid_model, datamodule)
val_trainer.test(valid_model, datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name         | Type               | Params
----------------------------------------------------
0 | backbone     | RnnSeqEncoder      | 4.3 M 
1 | pred_head    | Linear             | 352 K 
2 | loss         | CrossEntropyLoss   | 0     
3 | train_metric | MulticlassAccuracy | 0     
4 | test_metrics | MetricCollection   | 0     
5 | val_metric   | MulticlassAccuracy | 0     
----------------------------------------------------
352 K     Trainable params
4.3 M     Non-trainable params
4.7 M     Total params
18.667    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          AUROC             0.12531328201293945
        Accuracy           0.0051452782936394215
         F1Score           0.0051452782936394215
         PR-AUC            0.013722222298383713
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'AUROC': 0.12531328201293945,
  'Accuracy': 0.0051452782936394215,
  'F1Score': 0.0051452782936394215,
  'PR-AUC': 0.013722222298383713}]