In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from hydra import initialize, compose
from hydra.utils import instantiate
import torch
import numpy as np
import pandas as pd

from src.preprocessing import preprocess
from src.pooling import PoolingModel

from src.global_validation.global_validation_pipeline import embed_data, eval_embeddings

In [3]:
# Compose the churn-validation Hydra config from the "../config" directory.
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name="config_validation_churn")

# Sub-configs used by the cells below: encoder (backbone), global-target
# validation settings and preprocessing settings.
cfg_encoder = cfg["backbone"]["encoder"]
cfg_validation = cfg["validation"]["global_target"]
cfg_preprop = cfg["preprocessing"]


In [4]:
# Override the "in" entry of the `mcc_code` embedding config before the encoder
# is instantiated from `cfg_encoder`.
# NOTE(review): 345 is a hard-coded magic number — presumably the MCC vocabulary
# size expected by the saved checkpoint; confirm and consider moving it into the config.
cfg_encoder["trx_encoder"]["embeddings"]["mcc_code"]["in"] = 345

In [5]:
# Run the project preprocessing with the preprocessing sub-config; the result
# is unpacked into the train / val / test parts used throughout the notebook.
train, val, test = preprocess(cfg_preprop)

In [6]:
# Name of the saved backbone; the weights are read from
# "saved_models/<encoder_name>.pth".
encoder_name = "coles_churn"

# Build the encoder from the Hydra config, overriding `is_reduce_sequence`.
sequence_encoder = instantiate(cfg_encoder, is_reduce_sequence=True)

# map_location="cpu" lets a checkpoint that was saved from GPU tensors be read
# on a machine without CUDA; `load_state_dict` then copies the values into the
# model's existing parameters, so the model's own device is unaffected.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
encoder_state = torch.load(f"saved_models/{encoder_name}.pth", map_location="cpu")

# Last expression of the cell: displays the key-matching report.
sequence_encoder.load_state_dict(encoder_state)


<All keys matched successfully>

In [7]:
# Display the hidden size of the encoder's RNN; the same value is passed below
# as `backbone_embd_size` when the pooling model is built.
sequence_encoder.seq_encoder.rnn.hidden_size

1024

# Attention pooling

In [8]:
# Wrap the loaded backbone in a PoolingModel configured for attention pooling.
# The embedding size is taken from the backbone's RNN hidden size.
pooling_kwargs = dict(
    train_data=train,
    backbone=sequence_encoder,
    backbone_embd_size=sequence_encoder.seq_encoder.rnn.hidden_size,
    max_users_in_train_dataloader=500,
    pooling_type="attention",
    min_seq_length=15,
    max_seq_length=100,
    max_embs_per_user=100,
)
pooling_model = PoolingModel(**pooling_kwargs)

 15%|█▍        | 590/4000 [01:13<07:04,  8.04it/s]
100%|██████████| 4210/4210 [00:09<00:00, 467.13it/s]


In [9]:
# get representations of sequences from train + val part
# (computed once; every bootstrap run below resamples from this pool)
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# get representations of sequences from test part (reused unchanged in every run)
embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

# Dedicated generator with a fixed seed: re-running this cell reproduces the
# same bootstrap samples instead of depending on the global NumPy RNG state.
bootstrap_rng = np.random.default_rng(0)

results = []
for run_idx in range(cfg_validation["n_runs"]):

    # bootstrap sample: N indices drawn with replacement
    bootstrap_inds = bootstrap_rng.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # fit the downstream model on the resampled embeddings and score it on the test part
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

# one row of metrics per bootstrap run
res = pd.DataFrame(results)


In [10]:
# Per-run metrics for attention pooling (one row per bootstrap run).
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.751386,0.801715,0.698,0.743633
1,0.762242,0.805629,0.706,0.752108
2,0.742452,0.786899,0.682,0.735441
3,0.741768,0.789662,0.708,0.749141
4,0.7537,0.803366,0.694,0.747941
5,0.757629,0.800113,0.724,0.766892
6,0.752527,0.794137,0.696,0.739726
7,0.744914,0.801099,0.684,0.733108
8,0.737432,0.789842,0.68,0.726962
9,0.753684,0.790736,0.708,0.751701


In [11]:
# Mean and standard deviation of each metric over the bootstrap runs (attention pooling).
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.749773,0.79632,0.698,0.744665
std,0.007829,0.006779,0.013888,0.011485


# Mean pooling

In [12]:
# Switch the existing pooling model to mean pooling by setting the attribute directly.
# NOTE(review): the "Learnable attention" section switches modes through
# `pooling_model.change_pooling_type(...)` instead — confirm that a plain
# attribute assignment is sufficient for "mean".
pooling_model.pooling_type = "mean"

In [13]:
# Representations of the train + val part under the current pooling type
# (computed once; every bootstrap run below resamples from this pool).
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# Representations of the test part (reused unchanged in every run).
embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

# Dedicated generator with a fixed seed: re-running this cell reproduces the
# same bootstrap samples instead of depending on the global NumPy RNG state.
bootstrap_rng = np.random.default_rng(0)

results = []
for run_idx in range(cfg_validation["n_runs"]):

    # bootstrap sample: N indices drawn with replacement
    bootstrap_inds = bootstrap_rng.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # fit the downstream model on the resampled embeddings and score it on the test part
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

# one row of metrics per bootstrap run
res = pd.DataFrame(results)


In [14]:
# Per-run metrics for mean pooling (one row per bootstrap run).
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.759993,0.807043,0.694,0.742857
1,0.764785,0.821024,0.702,0.747885
2,0.763921,0.819725,0.7,0.744898
3,0.762683,0.80955,0.694,0.744574
4,0.778984,0.830819,0.71,0.752981
5,0.75582,0.804151,0.678,0.729412
6,0.766465,0.81501,0.704,0.753333
7,0.773001,0.831362,0.702,0.750419
8,0.756537,0.803116,0.68,0.732441
9,0.760629,0.813666,0.68,0.731544


In [15]:
# Mean and standard deviation of each metric over the bootstrap runs (mean pooling).
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.764282,0.815547,0.6944,0.743034
std,0.007174,0.01015,0.011384,0.008937


# Max pooling

In [16]:
# Switch the existing pooling model to max pooling by setting the attribute directly.
# NOTE(review): the "Learnable attention" section switches modes through
# `pooling_model.change_pooling_type(...)` instead — confirm that a plain
# attribute assignment is sufficient for "max".
pooling_model.pooling_type = "max"

In [17]:
# Representations of the train + val part under the current pooling type
# (computed once; every bootstrap run below resamples from this pool).
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# Representations of the test part (reused unchanged in every run).
embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

# Dedicated generator with a fixed seed: re-running this cell reproduces the
# same bootstrap samples instead of depending on the global NumPy RNG state.
bootstrap_rng = np.random.default_rng(0)

results = []
for run_idx in range(cfg_validation["n_runs"]):

    # bootstrap sample: N indices drawn with replacement
    bootstrap_inds = bootstrap_rng.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # fit the downstream model on the resampled embeddings and score it on the test part
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

# one row of metrics per bootstrap run
res = pd.DataFrame(results)


In [18]:
# Per-run metrics for max pooling (one row per bootstrap run).
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.785065,0.833013,0.724,0.771523
1,0.767231,0.819703,0.704,0.752508
2,0.766579,0.824111,0.698,0.745363
3,0.775773,0.827008,0.712,0.755932
4,0.780386,0.837252,0.696,0.743243
5,0.760221,0.818162,0.674,0.72326
6,0.763432,0.821092,0.706,0.751269
7,0.772121,0.816919,0.704,0.74744
8,0.771029,0.824572,0.69,0.734134
9,0.759324,0.809436,0.69,0.735043


In [19]:
# Mean and standard deviation of each metric over the bootstrap runs (max pooling).
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.770116,0.823127,0.6998,0.745972
std,0.00847,0.008041,0.013677,0.013368


# Without pooling

In [20]:
# Baseline without pooling: the backbone encoder itself is passed to
# `embed_data` instead of the pooling model.
# Representations of the train + val part (computed once; every bootstrap run
# below resamples from this pool).
embeddings, targets = embed_data(sequence_encoder, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# Representations of the test part (reused unchanged in every run).
embeddings_test, targets_test = embed_data(
    sequence_encoder,
    test,
    **cfg_validation["embed_data"]
)

# Dedicated generator with a fixed seed: re-running this cell reproduces the
# same bootstrap samples instead of depending on the global NumPy RNG state.
bootstrap_rng = np.random.default_rng(0)

results = []
for run_idx in range(cfg_validation["n_runs"]):

    # bootstrap sample: N indices drawn with replacement
    bootstrap_inds = bootstrap_rng.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # fit the downstream model on the resampled embeddings and score it on the test part
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

# one row of metrics per bootstrap run
res = pd.DataFrame(results)


In [21]:
# Per-run metrics for the backbone without pooling (one row per bootstrap run).
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.733038,0.770301,0.692,0.738095
1,0.738483,0.797514,0.662,0.712095
2,0.712075,0.773563,0.658,0.712605
3,0.73779,0.787723,0.692,0.745033
4,0.72902,0.778552,0.658,0.715474
5,0.72589,0.775334,0.67,0.716007
6,0.74004,0.773981,0.694,0.738462
7,0.734163,0.778188,0.68,0.726027
8,0.7396,0.786991,0.68,0.733333
9,0.74414,0.792434,0.678,0.728499


In [22]:
# Mean and standard deviation of each metric over the bootstrap runs (no pooling).
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.733424,0.781458,0.6764,0.726563
std,0.009274,0.009113,0.013946,0.012038


# Learnable attention


In [23]:
# Switch the pooling model to learnable-attention pooling via its dedicated method.
# NOTE(review): presumably this also sets up `learnable_attention_matrix`, whose
# weights are loaded a few cells below — confirm in PoolingModel.
pooling_model.change_pooling_type("learnable_attention")

In [24]:
# with initialize(config_path="../config/model", version_base=None):
#     cfg_model = compose(config_name="coles_churn")

In [25]:
# from src.coles import CustomColesDataset, CustomCoLES

# from pytorch_lightning import Trainer
# from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
# from pytorch_lightning.loggers import TensorBoardLogger

# from ptls.frames import PtlsDataModule

In [26]:
# model: CustomCoLES = instantiate(cfg_model["model"],
#                                  sequence_encoder = pooling_model)

In [27]:
# # initialize original CoLES dataset - for CoLES training
# train_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=train)
# val_data: CustomColesDataset = instantiate(cfg_model["dataset"], data=val)
    
# train_datamodule: PtlsDataModule = instantiate(
#     cfg_model["datamodule"],
#     train_data=train_data,
#     valid_data=val_data
# )

In [28]:
# model_checkpoint: ModelCheckpoint = instantiate(
#     cfg_model["trainer_coles"]["checkpoint_callback"],
#     monitor=model.metric_name,
#     mode="max"
# )
    
# early_stopping: EarlyStopping = instantiate(
#     cfg_model["trainer_coles"]["early_stopping"],
#     monitor=model.metric_name,
#     mode="max"
# )
    
# logger: TensorBoardLogger = instantiate(cfg_model["trainer_coles"]["logger"])
    
# trainer: Trainer = instantiate(
#     cfg_model["trainer_coles"]["trainer"],
#     callbacks=[model_checkpoint, early_stopping],
#     logger=logger
# )
    
# trainer.fit(model, train_datamodule)

In [29]:
# torch.save(model.seq_encoder.learnable_attention_matrix.state_dict(), "saved_models/coles_churn_learnable_attention_matrix.pth")

In [30]:
# Load the saved learnable-attention weights into the pooling model.
# Alternative checkpoint kept for reference:
#   "saved_models/coles_default_learnable_attention_matrix.pth"
#
# map_location="cpu" lets a checkpoint that was saved from GPU tensors be read
# on a machine without CUDA; `load_state_dict` then copies the values into the
# module's existing parameters, so the module's own device is unaffected.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
attention_state = torch.load(
    "saved_models/coles_churn_learnable_attention_matrix.pth",
    map_location="cpu",
)

# Last expression of the cell: displays the key-matching report.
pooling_model.learnable_attention_matrix.load_state_dict(attention_state)

<All keys matched successfully>

In [31]:
# Sanity check: display the pooling type currently set on the pooling model.
pooling_model.pooling_type

'learnable_attention'

In [32]:
# Representations of the train + val part under the current pooling type
# (computed once; every bootstrap run below resamples from this pool).
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# Representations of the test part (reused unchanged in every run).
embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

# Dedicated generator with a fixed seed: re-running this cell reproduces the
# same bootstrap samples instead of depending on the global NumPy RNG state.
bootstrap_rng = np.random.default_rng(0)

results = []
for run_idx in range(cfg_validation["n_runs"]):

    # bootstrap sample: N indices drawn with replacement
    bootstrap_inds = bootstrap_rng.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # fit the downstream model on the resampled embeddings and score it on the test part
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

# one row of metrics per bootstrap run
res = pd.DataFrame(results)


In [33]:
# Per-run metrics for learnable-attention pooling (one row per bootstrap run).
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.762683,0.803368,0.708,0.754209
1,0.760319,0.809697,0.712,0.762376
2,0.758363,0.814696,0.69,0.740368
3,0.758428,0.81321,0.692,0.749186
4,0.763025,0.811941,0.7,0.744898
5,0.763547,0.81994,0.704,0.74744
6,0.755331,0.813412,0.684,0.734899
7,0.767084,0.827133,0.712,0.754266
8,0.750864,0.80702,0.672,0.722973
9,0.757091,0.802768,0.704,0.745704


In [34]:
# Mean and standard deviation of each metric over the bootstrap runs (learnable attention).
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.759673,0.812318,0.6978,0.745632
std,0.004679,0.007391,0.013079,0.011062
