## Task 2: Transformer Encoder Model

In [None]:
## Enable IPython's autoreload extension. Mode 2 re-imports all modules
## (except those excluded via %aimport) before each cell is executed, so
## edits to the project's .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import test
import sts_data
from importlib import reload

In [None]:
## Re-import the data module so edits to it are picked up, then take the
## dataset class from the freshly reloaded module.
reload(sts_data)
from sts_data import STSData

## Batch size handed to the data loaders (also passed to the model constructor
## later in the notebook).
batch_size = 64
dataset_name = "sick"

## Column mapping handed to STSData: sent1 / sent2 / label -> dataset columns.
columns_mapping = dict(
    sent1="sentence_A",
    sent2="sentence_B",
    label="relatedness_score",
)

## NOTE(review): label normalisation is requested with a constant of 5.0 --
## presumably the maximum relatedness score; confirm in sts_data.
sick_data = STSData(
    columns_mapping=columns_mapping,
    dataset_name=dataset_name,
    normalization_const=5.0,
    normalize_labels=True,
)
sick_dataloaders = sick_data.get_data_loader(batch_size=batch_size)

INFO:root:loading and preprocessing data...
INFO:root:reading and preprocessing data completed...
INFO:root:creating vocabulary...
INFO:torchtext.vocab:Loading vectors from .vector_cache/wiki.simple.vec.pt
INFO:root:creating vocabulary completed...
INFO:root:creating STSDataset completed...
INFO:root:creating dataloaders completed...


In [None]:
from siamese_lstm_attention import SiameseBiLSTMAttention
from train import train_model
from tuning import tune_model
from test import evaluate_test_set

In [None]:
## Run the hyper-parameter search on the SICK data, then print the best value
## found for every tuned parameter, one "name: value" pair per line.
results = tune_model(sick_data, sick_dataloaders)
for key, value in results.best_params.items():
    print(f"{key}: {value}")

[32m[I 2022-03-11 13:57:23,144][0m A new study created in memory with name: no-name-a6a30359-462b-45aa-9b5d-31b24f8f074f[0m
100%|██████████| 20/20 [13:19<00:00, 39.97s/it]
[32m[I 2022-03-11 14:10:42,691][0m Trial 0 finished with value: 0.5669891668969257 and parameters: {'N': 4}. Best is trial 0 with value: 0.5669891668969257.[0m
100%|██████████| 20/20 [17:59<00:00, 53.95s/it]
[32m[I 2022-03-11 14:28:41,908][0m Trial 1 finished with value: 0.551887203308708 and parameters: {'N': 6}. Best is trial 0 with value: 0.5669891668969257.[0m
100%|██████████| 20/20 [10:12<00:00, 30.60s/it]
[32m[I 2022-03-11 14:38:54,100][0m Trial 2 finished with value: 0.5803525042134443 and parameters: {'N': 3}. Best is trial 2 with value: 0.5803525042134443.[0m
100%|██████████| 20/20 [23:44<00:00, 71.23s/it]
[32m[I 2022-03-11 15:02:39,018][0m Trial 3 finished with value: 0.5570875830373602 and parameters: {'N': 8}. Best is trial 2 with value: 0.5803525042134443.[0m
100%|██████████| 20/20 [12:49<

In [None]:
## Hyper-parameters carried over from Task 1, plus the encoder depth selected
## by the tuning run above.

## embedding / vocabulary (taken from the prepared dataset)
vocab_size = len(sick_data.vocab)
embedding_weights = sick_data.vocab.vectors
embedding_size = 300

## recurrent part and fully connected head
hidden_size = 32
lstm_layers = 4
bidirectional = True
fc_hidden_size = 128
output_size = 1

## training schedule
max_epochs = 20
## NOTE(review): learning_rate is not passed to the optimizer in the cells
## visible here -- confirm whether it is still needed.
learning_rate = 1.0

## run on the first GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## self attention config
self_attention_config = dict(
    hidden_size=100,  ## refers to variable 'da' in the ICLR paper
    output_size=10,  ## refers to variable 'r' in the ICLR paper
    penalty=0.4,  ## refers to penalty coefficient term in the ICLR paper
)

## number of encoder layers: best value of 'N' found by the tuning run
encoder_layers = results.best_params["N"]

In [None]:
## Build the siamese model from the configuration defined above.
## The tuned encoder depth is passed as the single positional argument.
siamese_lstm_attention = SiameseBiLSTMAttention(
    encoder_layers,
    ## embedding / vocabulary
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    embedding_weights=embedding_weights,
    ## recurrent part
    hidden_size=hidden_size,
    lstm_layers=lstm_layers,
    bidirectional=bidirectional,
    ## attention and output head
    self_attention_config=self_attention_config,
    fc_hidden_size=fc_hidden_size,
    output_size=output_size,
    ## runtime settings
    batch_size=batch_size,
    device=device,
)

## Place the model on the selected device before creating the optimizer.
siamese_lstm_attention.to(device)

## Adam with its default settings.
optimizer = torch.optim.Adam(siamese_lstm_attention.parameters())

In [None]:
## Train for max_epochs epochs and keep whatever train_model returns.
## NOTE(review): the captured log shows a "new model saved" message during
## training -- presumably the checkpoint that is loaded for evaluation
## afterwards; confirm in train.py.
tot_val_acc = train_model(
    model=siamese_lstm_attention,
    data=sick_data,
    dataloader=sick_dataloaders,
    optimizer=optimizer,
    max_epochs=max_epochs,
    config_dict=dict(
        device=device,
        model_name="siamese_lstm_attention",
        self_attention_config=self_attention_config,
    ),
)

  0%|          | 0/20 [00:00<?, ?it/s]INFO:root:Epoch 0:
INFO:root:Accuracy: 0.001103888090164734 Training Loss: 7.366766929626465
INFO:root:Evaluating accuracy on dev set
INFO:root:Train loss: 7.366766929626465 - acc: 0.001103888090164734 -- Validation loss: 0.5511758923530579 - acc: 0.031582661854148646
  5%|▌         | 1/20 [00:40<12:55, 40.81s/it]INFO:root:Epoch 1:
INFO:root:Accuracy: 0.13296299591552482 Training Loss: 4.561308860778809
INFO:root:Evaluating accuracy on dev set
INFO:root:Train loss: 4.561308860778809 - acc: 0.13296299591552482 -- Validation loss: 0.5034217238426208 - acc: 0.16804986274541814
 10%|█         | 2/20 [01:19<12:04, 40.25s/it]INFO:root:Epoch 2:
INFO:root:Accuracy: 0.30089416385245066 Training Loss: 4.2091217041015625
INFO:root:Evaluating accuracy on dev set
INFO:root:new model saved
INFO:root:Train loss: 4.2091217041015625 - acc: 0.30089416385245066 -- Validation loss: 0.46320369839668274 - acc: 0.33953204208010485
 15%|█▌        | 3/20 [01:59<11:19, 39.9

In [None]:
## Restore the saved weights and evaluate on the test split.
## map_location remaps the stored tensors onto the device selected for this
## session; without it, a checkpoint written on a CUDA machine fails to load
## on a CPU-only one even though `device` already falls back to CPU.
siamese_lstm_attention.load_state_dict(
    torch.load("siamese_lstm_attention.pth", map_location=device)
)
## Switch to evaluation mode before running the test pass.
siamese_lstm_attention.eval()
evaluate_test_set(
    model=siamese_lstm_attention,
    data_loader=sick_dataloaders,
    config_dict={
        "device": device,
        "model_name": "siamese_lstm_attention",
        "self_attention_config": self_attention_config,
    },
)

INFO:root:Evaluating accuracy on test set
Accuracy: 0.5413802089274864 Test Loss: 3.8165409564971924
