In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from hydra import initialize, compose
from hydra.utils import instantiate
import torch
import numpy as np
import pandas as pd

from src.preprocessing import preprocess
from src.pooling import PoolingModel

from src.global_validation.global_validation_pipeline import embed_data, eval_embeddings

In [3]:
config_name = "pooling_global_validation_default"

backbone_path = "saved_models/default/coles_default/coles_default_4.pth"
path_for_attention = "saved_models/coles_default_learnable_attention_matrix_4.pth"
SEED = 46

In [4]:
with initialize(config_path="../config", version_base=None):
    cfg = compose(config_name=config_name)

cfg_preprop = cfg["preprocessing"]
cfg_validation = cfg["validation"]["global_target"]
cfg_encoder = cfg["backbone"]["encoder"]


In [5]:
train, val, test = preprocess(cfg_preprop)

In [6]:
encoder_name = "coles_churn"
sequence_encoder = instantiate(cfg_encoder, is_reduce_sequence=True)
sequence_encoder.load_state_dict(torch.load(backbone_path, map_location = "cuda:0"))


<All keys matched successfully>

In [7]:
sequence_encoder.seq_encoder.rnn.hidden_size

800

# Attention pooling

In [8]:
pooling_model = PoolingModel(train_data = train,
        backbone = sequence_encoder,
        backbone_embd_size = sequence_encoder.seq_encoder.rnn.hidden_size,
        max_users_in_train_dataloader=150,
        pooling_type = "attention",
        min_seq_length = 15,
        max_seq_length = 100,
        max_embs_per_user = 100)

  3%|▎         | 149/5664 [00:01<01:09, 78.94it/s]
100%|██████████| 14982/14982 [00:17<00:00, 868.74it/s]


In [9]:
# get representations of sequences from train + val part
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

# get representations of sequences from test part
embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

results = []
for i in range(cfg_validation["n_runs"]):

    # bootstrap sample
    bootstrap_inds = np.random.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    # evaluate trained model
    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

res = pd.DataFrame(results)


In [10]:
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.570838,0.048173,0.961864,0.0
1,0.557078,0.045904,0.961864,0.0
2,0.599445,0.070714,0.961864,0.0
3,0.561592,0.048587,0.961864,0.0
4,0.571056,0.048549,0.961864,0.0
5,0.549084,0.065539,0.961864,0.0
6,0.559308,0.049514,0.961864,0.0
7,0.574754,0.050801,0.961864,0.0
8,0.584163,0.060445,0.961864,0.0
9,0.556371,0.046602,0.961864,0.0


In [11]:
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.568369,0.053483,0.961864,0.0
std,0.015058,0.008788,0.0,0.0


# Mean pooling

In [12]:
pooling_model.pooling_type = "mean"

In [13]:
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

results = []
for i in range(cfg_validation["n_runs"]):

    bootstrap_inds = np.random.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

res = pd.DataFrame(results)


In [14]:
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.541633,0.050235,0.961864,0.0
1,0.528961,0.049167,0.961864,0.0
2,0.570729,0.057783,0.961864,0.0
3,0.563713,0.046203,0.961864,0.0
4,0.578289,0.055,0.961864,0.0
5,0.586175,0.057063,0.961864,0.0
6,0.571491,0.058906,0.961864,0.0
7,0.562735,0.05097,0.961864,0.0
8,0.588405,0.053806,0.961864,0.0
9,0.529233,0.053033,0.961864,0.0


In [15]:
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.562136,0.053217,0.961864,0.0
std,0.02185,0.004098,0.0,0.0


# Max pooling

In [16]:
pooling_model.pooling_type = "max"

In [17]:
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

results = []
for i in range(cfg_validation["n_runs"]):

    bootstrap_inds = np.random.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

res = pd.DataFrame(results)


In [18]:
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.548431,0.046817,0.961864,0.0
1,0.586773,0.060893,0.961864,0.0
2,0.549301,0.058623,0.961864,0.0
3,0.56801,0.045772,0.961864,0.0
4,0.550878,0.043549,0.961864,0.0
5,0.593245,0.049622,0.961864,0.0
6,0.568935,0.055731,0.961864,0.0
7,0.576114,0.052792,0.961864,0.0
8,0.625986,0.065832,0.961864,0.0
9,0.552945,0.049759,0.960452,0.0


In [19]:
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.572062,0.052939,0.961723,0.0
std,0.024735,0.007209,0.000447,0.0


# Without pooling

In [20]:
embeddings, targets = embed_data(sequence_encoder, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

embeddings_test, targets_test = embed_data(
    sequence_encoder,
    test,
    **cfg_validation["embed_data"]
)

results = []
for i in range(cfg_validation["n_runs"]):

    bootstrap_inds = np.random.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

res = pd.DataFrame(results)


In [21]:
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.553054,0.044606,0.961864,0.0
1,0.517268,0.042626,0.961864,0.0
2,0.559199,0.061615,0.961864,0.0
3,0.547343,0.047181,0.961864,0.0
4,0.502311,0.045332,0.961864,0.0
5,0.57421,0.058544,0.961864,0.0
6,0.596508,0.068455,0.961864,0.0
7,0.603687,0.05914,0.961864,0.0
8,0.548812,0.052576,0.961864,0.0
9,0.577201,0.056188,0.961864,0.0


In [22]:
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.557959,0.053626,0.961864,0.0
std,0.031948,0.008555,0.0,0.0


# Learnable attention


In [23]:
pooling_model.set_pooling_type("learnable_attention")

In [24]:
pooling_model.learnable_attention_matrix.load_state_dict(torch.load(path_for_attention))

<All keys matched successfully>

In [25]:
pooling_model.pooling_type

'learnable_attention'

In [26]:
embeddings, targets = embed_data(pooling_model, train + val, **cfg_validation["embed_data"])
N = len(embeddings)
indices = np.arange(N)

embeddings_test, targets_test = embed_data(
    pooling_model,
    test,
    **cfg_validation["embed_data"]
)

results = []
for i in range(cfg_validation["n_runs"]):

    bootstrap_inds = np.random.choice(indices, size=N, replace=True)
    embeddings_train, targets_train = embeddings[bootstrap_inds], targets[bootstrap_inds]

    metrics = eval_embeddings(
        embeddings_train,
        targets_train,
        embeddings_test,
        targets_test,
        cfg_validation["model"]
    )

    results.append(metrics)

res = pd.DataFrame(results)


In [27]:
res

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
0,0.601784,0.086931,0.961864,0.0
1,0.586556,0.070996,0.961864,0.0
2,0.559199,0.052149,0.961864,0.0
3,0.5964,0.057053,0.961864,0.0
4,0.577201,0.054195,0.961864,0.0
5,0.581008,0.070987,0.961864,0.0
6,0.573884,0.049603,0.961864,0.0
7,0.558329,0.047269,0.961864,0.0
8,0.61228,0.059264,0.961864,0.0
9,0.587045,0.054716,0.961864,0.0


In [28]:
res.agg(["mean", "std"])

Unnamed: 0,AUROC,PR-AUC,Accuracy,F1Score
mean,0.583369,0.060316,0.961864,0.0
std,0.017363,0.012331,0.0,0.0
