## PyTorch Lightning Sequence Classification


In [None]:
# Directory with the notebook's input files: it is passed as --data_dir (SST-2 splits and
# vocabulary JSON) and is where the model classes look for "glove.small.300d.txt".
DATA_DIR = "./data"

In [None]:
import logging
# Configure the root logger: each record shows timestamp, level, logger name and message;
# INFO and above are emitted.
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import numpy as np
import scipy
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data.dataset import Dataset
import argparse
import os
from pathlib import Path
from torch.optim import SGD, Adam
import pytorch_lightning as pl
from torchmetrics import Accuracy
from datetime import datetime
from pathlib import Path
from pytorch_lightning import loggers as pl_loggers
import time
from argparse import Namespace
import json
import shutil
logger = logging.getLogger(__name__)

class BaseModel(pl.LightningModule):
    """Shared LightningModule scaffolding for the classifiers in this notebook.

    Subclasses provide three hooks:

    * ``_load_model()`` -- build and return the wrapped network,
    * ``batch2input(batch)`` -- turn a collated batch into the keyword
      arguments of ``forward`` (must contain a ``'labels'`` entry),
    * ``get_dataloader(type_path, batch_size, shuffle)`` -- build the
      DataLoader for the ``"train"``, ``"dev"`` or ``"test"`` split.

    The wrapped network is expected to return ``(loss, predicted_labels, extra)``.
    """

    def __init__(
        self,
        **config_kwargs
    ):
        """Store the hyperparameters and build the wrapped model.

        Args:
            **config_kwargs: Hyperparameters, saved to ``self.hparams`` (and
                therefore to checkpoints). ``output_dir`` is read here; the
                optimizer and dataloader hooks read ``learning_rate``,
                ``train_batch_size`` and ``eval_batch_size``.
        """
        logger.info("Initilazing BaseModel")
        super().__init__()
        self.save_hyperparameters() #save hyperparameters to checkpoint
        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.model = self._load_model()

        self.accuracy = Accuracy()

    def _load_model(self):
        """Return the network to train; must be implemented by subclasses."""
        raise NotImplementedError

    def forward(self, **inputs):
        """Delegate to the wrapped network with the given keyword arguments."""
        return self.model(**inputs)

    def batch2input(self, batch):
        """Map a collated batch to ``forward`` kwargs; must be implemented by subclasses."""
        raise NotImplementedError

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Return the DataLoader for one split; must be implemented by subclasses.

        Declared here because ``setup``, ``val_dataloader`` and
        ``test_dataloader`` call it, so a subclass that forgets it fails with
        the same NotImplementedError as the other hooks.
        """
        raise NotImplementedError

    def _shared_step(self, batch, stage, prog_bar=False):
        """Run one forward pass and log ``<stage>_loss`` and ``<stage>_acc``.

        Args:
            batch: Collated batch as produced by the split's DataLoader.
            stage: Metric-name prefix (``"train"``, ``"val"`` or ``"test"``).
            prog_bar: Whether the two metrics are shown in the progress bar.

        Returns:
            The loss returned by the wrapped network.
        """
        model_inputs = self.batch2input(batch)
        labels = model_inputs['labels']
        loss, pred_labels, _ = self(**model_inputs)
        batch_acc = self.accuracy(pred_labels.view(-1), labels.view(-1).int())

        self.log(f'{stage}_loss', loss, prog_bar=prog_bar)
        self.log(f'{stage}_acc', batch_acc, prog_bar=prog_bar)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss/accuracy; return ``{"loss": loss}``."""
        return {"loss": self._shared_step(batch, "train", prog_bar=True)}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_acc`` for one validation batch."""
        self._shared_step(batch, "val")

    def test_step(self, batch, batch_nb):
        """Log ``test_loss`` and ``test_acc`` for one test batch."""
        self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Return a single Adam optimizer over the wrapped model (no LR schedule)."""
        model = self.model
        optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)

        self.opt = optimizer
        return [optimizer]

    def setup(self, stage=None):
        """Build the training DataLoader once when fitting starts."""
        if stage == "fit":
            self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)

    def train_dataloader(self):
        """Return the loader prepared by ``setup("fit")``."""
        return self.train_loader

    def val_dataloader(self):
        """Return an unshuffled loader over the ``"dev"`` split."""
        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        """Return an unshuffled loader over the ``"test"`` split."""
        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)

    @staticmethod
    def add_generic_args(parser, root_dir) -> None:
        """Register the model-independent command-line arguments on ``parser``.

        Args:
            parser: ``argparse.ArgumentParser`` to extend in place.
            root_dir: Unused; kept so existing callers keep working.
        """
        # One action with two spellings. Previously "--num_train_epochs" was a
        # second action on the same dest whose default (3) could never apply,
        # because argparse uses the default of the first action per dest.
        parser.add_argument(
            "--max_epochs",
            "--num_train_epochs",
            dest="max_epochs",
            default=10,
            type=int,
            help="The number of epochs to train your model.",
        )
        parser.add_argument(
            "--gpus",
            default=1,
            type=int,
            help="The number of GPUs allocated for this, it is by default 1. Set to 0 for no GPU.",
        )
        parser.add_argument(
            "--output_dir",
            default=None,
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        # NOTE: store_true combined with default=True means do_train is always True.
        parser.add_argument("--do_train", action="store_true", default=True, help="Whether to run training.")
        parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
        parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
        parser.add_argument(
            "--data_dir",
            default="./",
            type=str,
            help="The input data dir. Should contain the training files.",
        )
        parser.add_argument("--learning_rate", default=1e-2, type=float, help="The initial learning rate for training.")
        parser.add_argument("--num_workers", default=16, type=int, help="kwarg passed to DataLoader")
        parser.add_argument("--train_batch_size", default=32, type=int)
        parser.add_argument("--eval_batch_size", default=32, type=int)

def generic_train(
    model: BaseModel,
    args: argparse.Namespace,
    early_stopping_callback=False,
    extra_callbacks=None,
    checkpoint_callback=None,
    logging_callback=None,
    **extra_train_kwargs
):
    """Train ``model`` with a Lightning Trainer and optionally evaluate on the test set.

    Args:
        model: Module to train; ``model.hparams.output_dir`` is where logs go.
        args: Parsed arguments. Reads ``output_dir``, ``max_epochs``, ``gpus``,
            ``do_train`` and ``do_predict``; the namespace is also handed to
            ``Trainer.from_argparse_args``.
        early_stopping_callback: Accepted for interface compatibility; unused.
        extra_callbacks: Optional list of additional Trainer callbacks.
        checkpoint_callback: Optional checkpoint callback. By default the best
            epoch by ``val_acc`` is kept under
            ``<output_dir>/<version>/checkpoints``.
        logging_callback: Accepted for interface compatibility; unused.
        **extra_train_kwargs: Extra keyword arguments forwarded to the Trainer.

    Returns:
        The result of ``trainer.test`` when ``args.do_predict`` is set,
        otherwise the Trainer.
    """
    # Copy so the caller's list (or a shared default) is never aliased.
    additional_callbacks = list(extra_callbacks) if extra_callbacks else []

    # parents=True: a nested output_dir must not fail on a missing parent.
    output_dir = Path(model.hparams.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = output_dir / 'logs'
    log_dir.mkdir(exist_ok=True)

    # Tensorboard logger; the timestamped version also names the checkpoint folder.
    pl_logger = pl_loggers.TensorBoardLogger(
        save_dir=log_dir,
        version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"),
        name="",
        default_hp_metric=True
    )

    ckpt_path = os.path.join(
        args.output_dir, pl_logger.version, "checkpoints",
    )
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=ckpt_path, filename="{epoch}-{val_acc:.2f}", monitor="val_acc", mode="max", save_top_k=1, verbose=True
        )

    train_params = {"enable_model_summary": False}
    if args.gpus > 1:
        # `strategy` replaces the older `distributed_backend` Trainer argument.
        train_params["strategy"] = "ddp"
    # Caller-supplied Trainer options override the defaults above ...
    train_params.update(extra_train_kwargs)
    # ... but the pieces this function manages itself always win.
    train_params["max_epochs"] = args.max_epochs
    train_params["callbacks"] = [checkpoint_callback] + additional_callbacks
    train_params["logger"] = pl_logger

    trainer = pl.Trainer.from_argparse_args(args, **train_params)

    target_path = os.path.join(ckpt_path, 'best_model.ckpt')

    if args.do_train:
        trainer.fit(model)

        # track model performance under differnt hparams settings in "Hparams" of TensorBoard
        best_score = checkpoint_callback.best_model_score
        if best_score is not None:
            pl_logger.log_hyperparams(params=model.hparams, metrics={'hp_metric': best_score.item()})
            pl_logger.save()
        else:
            logger.warning("Checkpoint callback reported no best score; skipping hp_metric logging.")

        # save best model to `best_model.ckpt`
        if checkpoint_callback.best_model_path:
            # A custom checkpoint callback may write elsewhere, so make sure the target folder exists.
            os.makedirs(ckpt_path, exist_ok=True)
            logger.info(f"Copy best model from {checkpoint_callback.best_model_path} to {target_path}.")
            shutil.copy(checkpoint_callback.best_model_path, target_path)
        else:
            logger.warning("No checkpoint was saved during training; best_model.ckpt was not written.")

    # Optionally, predict on test set and write to output_dir
    if args.do_predict:
        if os.path.exists(target_path):
            model = type(model).load_from_checkpoint(target_path)
        else:
            # Happens when training was skipped or saved nothing: evaluate the model as given.
            logger.warning(f"{target_path} not found; testing the in-memory model instead.")
        return trainer.test(model)

    return trainer


In [None]:
from nltk.tokenize import WordPunctTokenizer
# Module-level tokenizer instance used by SST2Dataset.__getitem__.
tokenizer = WordPunctTokenizer()

class SST2Dataset(Dataset):
    """Map ``[label, text]`` rows to fixed-length token-id sequences on the fly.

    Each item is tokenized with the module-level ``tokenizer``, mapped through
    ``vocab`` (unknown words become id 1) and truncated or zero-padded to
    ``max_len`` ids.
    """

    def __init__(self, vocab, data, max_len=50):
        """
        Args:
            vocab: Mapping from token to integer id; ``"<pad>"`` must map to 0
                and ``"<unk>"`` to 1.
            data: Indexable collection of ``[label, text]`` pairs.
            max_len: Sequence length after truncation/padding. The default of
                50 was chosen from a length analysis of the training set.

        Raises:
            ValueError: If the reserved ids of ``<pad>``/``<unk>`` are not 0/1.
        """
        # Checked once here rather than on every item: padding uses id 0 and
        # unknown words use id 1, so any other layout corrupts every sample.
        if vocab.get("<pad>") != 0 or vocab.get("<unk>") != 1:
            raise ValueError('vocab must map "<pad>" to 0 and "<unk>" to 1')
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(padded_token_ids, label, length, mask)`` for one row.

        ``length`` is the number of real tokens kept; ``mask`` holds 1 where
        the padded id is non-zero and 0 elsewhere.
        """
        label, text = int(self.data[index][0]), self.data[index][1]
        tokens = tokenizer.tokenize(text.lower())
        # Truncate with max_len (the original sliced with a hard-coded 50,
        # which only matched by coincidence of the default).
        token_ids = [self.vocab.get(token, 1) for token in tokens][:self.max_len]
        length = len(token_ids)
        padded_token_ids = token_ids + [0] * (self.max_len - length)
        mask = [1 if token_id != 0 else 0 for token_id in padded_token_ids]
        return padded_token_ids, label, length, mask

    def collate_fn(self, batch_data):
        """Stack items into tensors.

        Returns:
            ``(ids, labels, lengths, masks)`` with shapes ``(B, max_len)``,
            ``(B, 1)``, ``(B, 1)`` and ``(B, max_len)``; ids and lengths are
            long tensors, labels and masks are float tensors.
        """
        padded_token_ids, labels, lengths, masks = list(zip(*batch_data))
        return (torch.LongTensor(padded_token_ids).view(-1, self.max_len),
                torch.FloatTensor(labels).view(-1, 1),
                torch.LongTensor(lengths).view(-1, 1),
                torch.FloatTensor(masks).view(-1, self.max_len)
                )

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

class LSTM_PL(BaseModel):
    """Lightning wrapper around the LSTM / LSTM-with-attention SST-2 classifiers."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _load_model(self):
        """Load the vocabulary JSON and build the network selected by ``hparams.attention``.

        Side effect: stores the vocabulary and its size on ``self.hparams``
        (``vocab``, ``vocab_size``); ``get_dataloader`` reads ``hparams.vocab``.
        """
        vocab_path = os.path.join(self.hparams.data_dir, self.hparams.vocab_filename)
        # Context manager so the handle is closed.
        with open(vocab_path, encoding="utf-8") as fin:
            self.hparams.vocab = json.load(fin)
        self.hparams.vocab_size = len(self.hparams.vocab)

        # Only the selected class name is evaluated, so the other one need not be defined yet.
        model_cls = LSTM_Attention if self.hparams.attention else LSTM
        return model_cls(
            self.hparams.vocab,
            self.hparams.vocab_size,
            self.hparams.word_embedding_size,
            self.hparams.use_glove,
        )

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Build a DataLoader over ``<data_dir>/sst2.<type_path>``.

        Each non-blank line is split at the first space into ``[label, text]``.
        """
        # dataset path (change if necessary)
        datapath = os.path.join(self.hparams.data_dir, f"sst2.{type_path}")
        with open(datapath, encoding="utf-8") as fin:
            # Blank lines carry no label and would fail later in int(); skip them.
            data = [line.strip().split(" ", maxsplit=1) for line in fin if line.strip()]
        dataset = SST2Dataset(self.hparams.vocab, data)

        logger.info(f"Loading {type_path} data and labels from {datapath}")
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.num_workers,
            collate_fn=dataset.collate_fn
        )

        return data_loader

    def configure_optimizers(self):
        """Return the optimizer chosen by ``--optimizer``.

        ``"sgd"`` (case-insensitive) selects SGD; every other value, including
        a missing one, keeps the previous behaviour of using Adam.
        """
        optimizer_name = str(self.hparams.get("optimizer", "adam")).lower()
        optimizer_cls = SGD if optimizer_name == "sgd" else Adam
        optimizer = optimizer_cls(self.model.parameters(), lr=self.hparams.learning_rate)
        self.opt = optimizer
        return [optimizer]

    def batch2input(self, batch):
        """Name the four tensors produced by ``SST2Dataset.collate_fn``."""
        return {"input_ids": batch[0], "labels": batch[1], "lengths": batch[2], "masks": batch[3]}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        """Register the LSTM-specific arguments on ``parser`` and return it.

        Args:
            parser: ``argparse.ArgumentParser`` to extend in place.
            root_dir: Unused; kept so existing callers keep working.
        """
        parser.add_argument(
            "--vocab_filename",
            default=None,
            type=str,
            required=True,
            help="Name of the vocabulary JSON file inside --data_dir",
        )
        # Not marked required: it has a usable default.
        parser.add_argument(
            "--optimizer",
            default="adam",
            type=str,
            help="Optimizer to use: 'sgd' selects SGD, anything else Adam",
        )
        parser.add_argument(
            "--word_embedding_size",
            default=300,
            type=int,
            help="Dimension of the word embeddings",
        )
        parser.add_argument(
            "--attention",
            action="store_true",
            help="Use attention or not",
        )
        parser.add_argument("--use_glove", action="store_true", help="Whether to use vector representaion from GloVe")

        return parser

In [None]:
class LSTM(torch.nn.Module):
    """Single-layer LSTM binary classifier.

    The LSTM output at each sequence's last real token is fed to a linear
    layer followed by a sigmoid.
    """

    def __init__(self, vocab, vocab_size, word_embedding_size, use_glove=None):
        """
        Args:
            vocab: Mapping from token to id; only read when ``use_glove`` is truthy.
            vocab_size: Number of rows of the embedding table.
            word_embedding_size: Embedding dimension, also used as the LSTM hidden size.
            use_glove: If truthy, initialise the embeddings from the GloVe file.
        """
        super(LSTM, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, word_embedding_size, padding_idx=0)
        if use_glove:
            self._load_glove(vocab, word_embedding_size)

        # Kept as an attribute for existing readers; forward() follows the input's device instead.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.lstm   = nn.LSTM(input_size=word_embedding_size, hidden_size=word_embedding_size, num_layers=1)
        self.output = nn.Sequential(nn.Linear(word_embedding_size, 1),
                                    nn.Sigmoid())

    def _load_glove(self, vocab, word_embedding_size):
        """Overwrite the embedding table with vectors from ``DATA_DIR/glove.small.300d.txt``.

        Words found in the file get their vector, ``<pad>`` gets zeros and
        every other word a standard-normal random vector.
        """
        logger.info("Load glove pretrained word embeddings")
        vectors = {}
        with open(os.path.join(DATA_DIR, "glove.small.300d.txt"), encoding="utf-8") as fin:
            for line in fin:
                parts = line.split()
                if not parts:  # tolerate blank lines
                    continue
                vectors[parts[0]] = np.array([float(v) for v in parts[1:]])

        id2word = {idx: word for word, idx in vocab.items()}
        rows = []
        for idx in range(len(vocab)):
            word = id2word[idx]
            if word in vectors:
                # Cast per row so every stacked tensor is float32.
                rows.append(torch.from_numpy(vectors[word]).float())
            elif word == "<pad>":
                rows.append(torch.zeros((word_embedding_size,)))
            else:
                rows.append(torch.randn((word_embedding_size,)))
        weights = torch.stack(rows).float()
        self.embedding.load_state_dict({"weight": weights})

    def forward(self, input_ids, labels, lengths, masks):
        """Classify a batch.

        Args:
            input_ids: Long tensor ``(batch, seq_len)`` of token ids.
            labels: Float tensor ``(batch, 1)`` of 0/1 targets.
            lengths: Long tensor ``(batch, 1)`` with the number of real tokens per row.
            masks: Unused by this model; accepted so both models share one signature.

        Returns:
            ``(loss, predicted_labels, [])``: the binary cross-entropy loss, a
            long tensor ``(batch, 1)`` of 0/1 predictions, and an empty list
            standing in for the attention weights of ``LSTM_Attention``.
        """
        # One batched lookup in (seq_len, batch, dim) layout instead of a Python loop over rows.
        x = self.embedding(input_ids.permute(1, 0))
        state_shape = (1, x.size(dim=1), x.size(dim=2))
        # NOTE(review): the initial state is random, so repeated evaluation of
        # the same input is not deterministic; kept to match existing checkpoints.
        h = torch.randn(*state_shape).to(x.device)
        c = torch.randn(*state_shape).to(x.device)  # was `self.evice`, an AttributeError
        z, (h, c) = self.lstm(x, (h, c))

        # view(-1) instead of squeeze so a batch of one keeps its batch dimension.
        last_step = lengths.view(-1) - 1
        batch_index = torch.arange(z.size(1), device=z.device)
        probs = self.output(z[last_step, batch_index])
        loss = F.binary_cross_entropy(probs, labels)
        predicted_labels = (probs > 0.5).long()

        return loss, predicted_labels, [] # use empty list to keep number of return tensors consistant with lstm attention

In [None]:
import logging
# Same logging setup as the first code cell. If the root logger already has handlers
# (e.g. that cell was run), basicConfig leaves the existing configuration untouched.
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    """Train and test the plain LSTM classifier with a fixed set of arguments."""
    # Explicit token list (instead of splitting one long string) so a DATA_DIR
    # containing spaces stays a single argument.
    mock_args = [
        "--word_embedding_size", "300",
        "--data_dir", DATA_DIR,
        "--output_dir", "lstm",
        "--optimizer", "adam",
        "--vocab_filename", "unigram_vocab.json",
        "--learning_rate", "0.001",
        "--max_epochs", "10",
        "--do_predict",
        "--train_batch_size", "16",
        "--use_glove",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd.
    # The previous fallback read `args.task`, which no parser defines.
    if args.output_dir is None:
        args.output_dir = os.path.join(
            "./results",
            f"lstm_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir, exist_ok=True)

    model = LSTM_PL(**vars(args))
    generic_train(model, args)


if __name__ == "__main__":
    main()


02/18/2022 06:51:07 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
02/18/2022 06:51:07 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 06:51:08 - INFO - __main__ -   Load glove pretrained word embeddings


Namespace(attention=False, data_dir='./data', do_predict=True, do_train=True, eval_batch_size=32, gpus=1, learning_rate=0.001, max_epochs=10, num_workers=16, optimizer='adam', output_dir='lstm', seed=42, train_batch_size=16, use_glove=True, vocab_filename='unigram_vocab.json', word_embedding_size=300)


02/18/2022 06:51:10 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
02/18/2022 06:51:10 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
02/18/2022 06:51:10 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
02/18/2022 06:51:10 - INFO - __main__ -   Loading train data and labels from ./data/sst2.train
  cpuset_checked))
02/18/2022 06:51:10 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

02/18/2022 06:51:10 - INFO - __main__ -   Loading dev data and labels from ./data/sst2.dev
02/18/2022 06:51:10 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

02/18/2022 06:51:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 432: val_acc reached 0.79128 (best 0.79128), saving model to "/content/lstm/version_18-02-2022--06-51-10/checkpoints/epoch=0-val_acc=0.79.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:51:50 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 865: val_acc reached 0.82339 (best 0.82339), saving model to "/content/lstm/version_18-02-2022--06-51-10/checkpoints/epoch=1-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:52:09 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 1298: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:52:29 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 1731: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:52:48 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 2164: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:53:07 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 2597: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:53:25 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 3030: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:53:45 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 3463: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:54:04 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 3896: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:54:23 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 4329: val_acc was not in top 1
02/18/2022 06:54:24 - INFO - __main__ -   Copy best model from /content/lstm/version_18-02-2022--06-51-10/checkpoints/epoch=1-val_acc=0.82.ckpt to lstm/version_18-02-2022--06-51-10/checkpoints/best_model.ckpt.
02/18/2022 06:54:24 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 06:54:24 - INFO - __main__ -   Load glove pretrained word embeddings
02/18/2022 06:54:26 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
02/18/2022 06:54:26 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8319604396820068, 'test_loss': 0.393383651971817}
--------------------------------------------------------------------------------


In [None]:
class LSTM_Attention(torch.nn.Module):
    """LSTM binary classifier that pools the LSTM outputs with learned attention.

    A linear layer scores every time step; the scores are divided by
    ``self.lam``, padded positions are masked out, and the softmax over time
    weights the LSTM outputs before the sigmoid classifier.
    """

    def __init__(self, vocab, vocab_size, word_embedding_size, use_glove=None):
        """
        Args:
            vocab: Mapping from token to id; only read when ``use_glove`` is truthy.
            vocab_size: Number of rows of the embedding table.
            word_embedding_size: Embedding dimension, also used as the LSTM hidden size.
            use_glove: If truthy, initialise the embeddings from the GloVe file.
        """
        super(LSTM_Attention, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, word_embedding_size, padding_idx=0)
        if use_glove:
            self._load_glove(vocab, word_embedding_size)

        # attention: per-step score, divided by `lam` before the softmax
        self.lam     = 3
        self.alpha   = nn.Linear(word_embedding_size, 1, bias=True)

        # lstm (`num_hidden` is passed as the LSTM's num_layers)
        self.num_hidden = 1
        self.lstm   = nn.LSTM(input_size=word_embedding_size, hidden_size=word_embedding_size, num_layers=self.num_hidden)
        self.output = nn.Sequential(nn.Linear(word_embedding_size, 1), nn.Sigmoid())

    def _load_glove(self, vocab, word_embedding_size):
        """Overwrite the embedding table with vectors from ``DATA_DIR/glove.small.300d.txt``.

        Words found in the file get their vector, ``<pad>`` gets zeros and
        every other word a standard-normal random vector.
        """
        logger.info("Load glove pretrained word embeddings")
        vectors = {}
        with open(os.path.join(DATA_DIR, "glove.small.300d.txt"), encoding="utf-8") as fin:
            for line in fin:
                parts = line.split()
                if not parts:  # tolerate blank lines
                    continue
                vectors[parts[0]] = np.array([float(v) for v in parts[1:]])

        id2word = {idx: word for word, idx in vocab.items()}
        rows = []
        for idx in range(len(vocab)):
            word = id2word[idx]
            if word in vectors:
                # Cast per row so every stacked tensor is float32.
                rows.append(torch.from_numpy(vectors[word]).float())
            elif word == "<pad>":
                rows.append(torch.zeros((word_embedding_size,)))
            else:
                rows.append(torch.randn((word_embedding_size,)))
        weights = torch.stack(rows).float()
        self.embedding.load_state_dict({"weight": weights})

    def forward(self, input_ids, labels, lengths, masks):
        """Classify a batch and expose the attention weights.

        Args:
            input_ids: Long tensor ``(batch, seq_len)`` of token ids.
            labels: Float tensor ``(batch, 1)`` of 0/1 targets.
            lengths: Unused by this model; accepted so both models share one signature.
            masks: Tensor ``(batch, seq_len)`` with 1 at real tokens and 0 at padding.

        Returns:
            ``(loss, predicted_labels, weights)``: the binary cross-entropy
            loss, a long tensor ``(batch, 1)`` of 0/1 predictions, and the
            attention weights as a nested list of shape ``batch x seq_len``.
        """
        x = self.embedding(input_ids.permute(1, 0))  # (seq_len, batch, dim)
        state_shape = (self.num_hidden, x.size(dim=1), x.size(dim=2))
        # Follow the input's device rather than calling .cuda(), so CPU runs work too.
        # NOTE(review): the initial state is random, so repeated evaluation of
        # the same input is not deterministic; kept to match existing checkpoints.
        h = torch.randn(*state_shape).to(x.device)
        c = torch.randn(*state_shape).to(x.device)
        z, (h, c) = self.lstm(x, (h, c))

        alpha = self.alpha(z) / self.lam
        pad   = torch.full_like(alpha, -1e9, dtype=torch.float)
        masks = masks.permute(1, 0).unsqueeze(-1)
        # Padded steps get a score of -1e9, so their softmax weight is ~0.
        att   = F.softmax(torch.where(masks == 1, alpha, pad), dim=0)
        z     = torch.sum(att * z, dim=0)

        probs   = self.output(z)
        loss    = F.binary_cross_entropy(probs, labels)
        predicted_labels = (probs > 0.5).long()
        # squeeze(-1) removes only the score axis; a bare squeeze() would also
        # drop a batch dimension of size one and make permute() fail.
        weights = att.squeeze(-1).permute(1, 0).tolist()

        return loss, predicted_labels, weights

In [None]:
import logging
# Same logging setup as the first code cell. If the root logger already has handlers
# (e.g. that cell was run), basicConfig leaves the existing configuration untouched.
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    """Train and test the LSTM-with-attention classifier with a fixed set of arguments."""
    # Explicit token list (instead of splitting one long string) so a DATA_DIR
    # containing spaces stays a single argument.
    mock_args = [
        "--word_embedding_size", "300",
        "--data_dir", DATA_DIR,
        "--output_dir", "lstm-att",
        "--optimizer", "adam",
        "--vocab_filename", "unigram_vocab.json",
        "--learning_rate", "0.001",
        "--max_epochs", "10",
        "--do_predict",
        "--attention",
        "--use_glove",
        "--train_batch_size", "16",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd.
    # The previous fallback read `args.task`, which no parser defines.
    if args.output_dir is None:
        args.output_dir = os.path.join(
            "./results",
            f"lstm-att_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir, exist_ok=True)

    model = LSTM_PL(**vars(args))
    generic_train(model, args)


if __name__ == "__main__":
    main()


02/18/2022 06:56:02 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
02/18/2022 06:56:02 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 06:56:02 - INFO - __main__ -   Load glove pretrained word embeddings


Namespace(attention=True, data_dir='./data', do_predict=True, do_train=True, eval_batch_size=32, gpus=1, learning_rate=0.001, max_epochs=10, num_workers=16, optimizer='adam', output_dir='lstm-att', seed=42, train_batch_size=16, use_glove=True, vocab_filename='unigram_vocab.json', word_embedding_size=300)


02/18/2022 06:56:04 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
02/18/2022 06:56:04 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
02/18/2022 06:56:04 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
02/18/2022 06:56:04 - INFO - __main__ -   Loading train data and labels from ./data/sst2.train
  cpuset_checked))
02/18/2022 06:56:04 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

02/18/2022 06:56:05 - INFO - __main__ -   Loading dev data and labels from ./data/sst2.dev
02/18/2022 06:56:06 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

02/18/2022 06:56:26 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 432: val_acc reached 0.81307 (best 0.81307), saving model to "/content/lstm-att/version_18-02-2022--06-56-04/checkpoints/epoch=0-val_acc=0.81.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:56:46 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 865: val_acc reached 0.83372 (best 0.83372), saving model to "/content/lstm-att/version_18-02-2022--06-56-04/checkpoints/epoch=1-val_acc=0.83.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:57:05 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 1298: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:57:24 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 1731: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:57:43 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 2164: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:58:03 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 2597: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:58:21 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 3030: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:58:40 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 3463: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:58:58 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 3896: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 06:59:16 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 4329: val_acc was not in top 1
02/18/2022 06:59:17 - INFO - __main__ -   Copy best model from /content/lstm-att/version_18-02-2022--06-56-04/checkpoints/epoch=1-val_acc=0.83.ckpt to lstm-att/version_18-02-2022--06-56-04/checkpoints/best_model.ckpt.
02/18/2022 06:59:17 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 06:59:18 - INFO - __main__ -   Load glove pretrained word embeddings
02/18/2022 06:59:19 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
02/18/2022 06:59:20 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8456891775131226, 'test_loss': 0.3673805892467499}
--------------------------------------------------------------------------------


In [None]:
%reload_ext  tensorboard
%tensorboard --logdir lstm/

# Visualize Attention Weights

In [None]:
from IPython.display import HTML, display
def visualize_attention_weights(tokens, att_weights):
    """Render tokens inline, shaded from white (weight 0) to red (weight 1).

    Args:
        tokens: Sequence of token strings.
        att_weights: Sequence of weights, paired with ``tokens`` position by
            position; the longer of the two sequences is truncated.
    """
    # Function-scope import keeps this cell self-contained.
    from html import escape

    html_template = """<span style="background-color:rgb(255, {}, {})">{}</span>"""
    spans = []
    for token, weight in zip(tokens, att_weights):
        # Clamp to [0, 1] and round so the CSS channel is an integer in 0..255.
        weight = min(max(float(weight), 0.0), 1.0)
        channel = round(255 - weight * 255)
        # Escape the token: characters such as "<" or "&" would otherwise be read as markup.
        spans.append(html_template.format(channel, channel, escape(str(token))))
    display(HTML(" ".join(spans)), metadata=dict(isolated=True))

# Quick visual check of the renderer with three hand-made tokens and weights.
tokens = ["this", "is", "good"]
att_weights = [0.1, 0.2, 0.7]
visualize_attention_weights(tokens, att_weights)

In [None]:
### Basic LSTM: Error Analysis
# Fall back to CPU when no GPU is present instead of failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = LSTM_PL.load_from_checkpoint(DATA_DIR + "/../lstm/version_18-02-2022--06-51-10/checkpoints/best_model.ckpt")
model.to(device)
model.eval()
test_loader = model.test_dataloader()

# Raw test lines, in the same order the (unshuffled) test loader yields them.
vocab_filepath = Path(DATA_DIR).joinpath('sst2.test')
with open(vocab_filepath, encoding="utf-8") as fin:
    test_corpus = [line for line in fin if line.strip()]
test_data = list(test_corpus)  # name kept in case other cells read it
tk = WordPunctTokenizer()

correct_text, correct_label, incorrect_text, incorrect_label = [], [], [], []
offset = 0  # index of the first corpus line belonging to the current batch
with torch.no_grad():
    for batch in test_loader:
        # Number of rows in this batch. `len(batch)` is the number of tensors
        # in the tuple (4), which misaligned texts and predictions.
        batch_size = batch[0].size(0)
        texts, labels = [], []
        for line in test_corpus[offset:offset + batch_size]:
            tokens = tk.tokenize(line)
            labels.append(int(tokens[0]))  # first token of each line is the label
            texts.append(tokens[1:])
        offset += batch_size

        batch = model.transfer_batch_to_device(batch, device=device, dataloader_idx=0)
        model_inputs = model.batch2input(batch)

        loss, pred, att_weights = model(**model_inputs)
        pred = pred.view(-1).tolist()  # view(-1) keeps a list even for a batch of one

        for k, (prediction, label) in enumerate(zip(pred, labels)):
            if int(prediction) == label:
                correct_text.append(texts[k])
                correct_label.append((prediction, label))
            else:
                incorrect_text.append(texts[k])
                incorrect_label.append((prediction, label))

print("\n--- Displaying a set of correct classifications ---")
for text, label in zip(correct_text, correct_label):
    print("Prediction:", label[0], "Label:", label[1], "Text:", " ".join(text))

print("\n\n--- Displaying a set of incorrect classifications ---")
for text, label in zip(incorrect_text, incorrect_label):
    print("Prediction:", label[0], "Label:", label[1], "Text:", " ".join(text))

02/18/2022 07:13:34 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 07:13:35 - INFO - __main__ -   Load glove pretrained word embeddings
02/18/2022 07:13:37 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test
  cpuset_checked))



--- Displaying a set of correct classifications ---
Prediction: 0 Label: 0 Text: no movement , no yuks , not much of anything .
Prediction: 0 Label: 0 Text: a gob of drivel so sickly sweet , even the eager consumers of moore ' s pasteurized ditties will retch it up like rancid crème brûlée .
Prediction: 0 Label: 0 Text: gangs of new york is an unapologetic mess , whose only saving grace is that it ends by blowing just about everything up .
Prediction: 0 Label: 0 Text: we never really feel involved with the story , as all of its ideas remain just that : abstract ideas .
Prediction: 1 Label: 1 Text: this is one of polanski ' s best films .
Prediction: 1 Label: 1 Text: take care of my cat offers a refreshingly different slice of asian cinema .
Prediction: 1 Label: 1 Text: the movie exists for its soccer action and its fine acting .
Prediction: 1 Label: 1 Text: jason x has cheesy effects and a hoary plot , but its macabre , self - deprecating sense of humor makes up for a lot .
Prediction

In [None]:
## LSTM Attention: Error Analysis & Weights Visualization
# Fall back to CPU when no GPU is present instead of failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = LSTM_PL.load_from_checkpoint(DATA_DIR + "/../lstm-att/version_18-02-2022--06-56-04/checkpoints/best_model.ckpt")
model.to(device)
model.eval()
test_loader = model.test_dataloader()

# Raw test lines, in the same order the (unshuffled) test loader yields them.
vocab_filepath = Path(DATA_DIR).joinpath('sst2.test')
with open(vocab_filepath, encoding="utf-8") as fin:
    test_corpus = [line for line in fin if line.strip()]
test_data = list(test_corpus)  # name kept in case other cells read it
tk = WordPunctTokenizer()
print(len(test_corpus))

max_examples = 20   # examples to collect per category
min_peak = 0.05     # correct examples are kept only if their largest weight exceeds this

correct_text, correct_label, incorrect_text, incorrect_label = [], [], [], []
correct_att, incorrect_att = [], []
offset = 0  # index of the first corpus line belonging to the current batch
i = 0
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        # Number of rows in this batch. `len(batch)` is the number of tensors
        # in the tuple (4), which misaligned texts and predictions.
        batch_size = batch[0].size(0)
        texts, labels = [], []
        for line in test_corpus[offset:offset + batch_size]:
            tokens = tk.tokenize(line)
            labels.append(int(tokens[0]))  # first token of each line is the label
            texts.append(tokens[1:])
        offset += batch_size

        batch = model.transfer_batch_to_device(batch, device=device, dataloader_idx=0)
        model_inputs = model.batch2input(batch)
        loss, pred, att_weights = model(**model_inputs)
        pred = pred.view(-1).tolist()  # view(-1) keeps a list even for a batch of one

        for k, (prediction, label) in enumerate(zip(pred, labels)):
            if len(correct_text) >= max_examples and len(incorrect_text) >= max_examples:
                break
            is_correct = int(prediction) == label
            if is_correct and len(correct_text) < max_examples and max(att_weights[k]) > min_peak:
                correct_text.append(texts[k])
                correct_label.append((prediction, label))
                correct_att.append(att_weights[k])
            # Cap on the incorrect list itself; it was previously tied to the
            # correct list, which stopped collecting errors too early.
            elif not is_correct and len(incorrect_text) < max_examples:
                incorrect_text.append(texts[k])
                incorrect_label.append((prediction, label))
                incorrect_att.append(att_weights[k])
        if len(correct_text) >= max_examples and len(incorrect_text) >= max_examples:
            break
print(i)
print(len(incorrect_text))

print("\n--- Displaying a set of correct classifications ---")
for text, label, att in zip(correct_text, correct_label, correct_att):
    print("Prediction:", label[0], "Label:", label[1], "Max Weight:", max(att), "Text: ", end='')
    visualize_attention_weights(text, att)

print("\n--- Displaying a set of incorrect classifications ---")
for text, label, att in zip(incorrect_text, incorrect_label, incorrect_att):
    print("Prediction:", label[0], "Label:", label[1], "Max Weight:", max(att), "Text: ", end='')
    visualize_attention_weights(text, att)

02/18/2022 07:30:31 - INFO - __main__ -   Initilazing BaseModel
02/18/2022 07:30:31 - INFO - __main__ -   Load glove pretrained word embeddings
02/18/2022 07:30:33 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test


1821


  cpuset_checked))


56
10

--- Displaying a set of correct classifications ---
Prediction: 0 Label: 0 Max Weight: 0.11975805461406708 Text: 

Prediction: 0 Label: 0 Max Weight: 0.08515149354934692 Text: 

Prediction: 0 Label: 0 Max Weight: 0.15054580569267273 Text: 

Prediction: 0 Label: 0 Max Weight: 0.06014563515782356 Text: 

Prediction: 1 Label: 1 Max Weight: 0.06562557816505432 Text: 

Prediction: 1 Label: 1 Max Weight: 0.12106537073850632 Text: 

Prediction: 1 Label: 1 Max Weight: 0.09839078783988953 Text: 

Prediction: 0 Label: 0 Max Weight: 0.1800827980041504 Text: 

Prediction: 0 Label: 0 Max Weight: 0.08049792051315308 Text: 

Prediction: 0 Label: 0 Max Weight: 0.09254102408885956 Text: 

Prediction: 1 Label: 1 Max Weight: 0.10488513112068176 Text: 

Prediction: 0 Label: 0 Max Weight: 0.06319135427474976 Text: 

Prediction: 0 Label: 0 Max Weight: 0.1626814603805542 Text: 

Prediction: 0 Label: 0 Max Weight: 0.07029739767313004 Text: 

Prediction: 1 Label: 1 Max Weight: 0.23784023523330688 Text: 

Prediction: 1 Label: 1 Max Weight: 0.1549735814332962 Text: 

Prediction: 0 Label: 0 Max Weight: 0.14054380357265472 Text: 

Prediction: 1 Label: 1 Max Weight: 0.10787223279476166 Text: 

Prediction: 1 Label: 1 Max Weight: 0.0704021081328392 Text: 

Prediction: 1 Label: 1 Max Weight: 0.15215419232845306 Text: 


--- Displaying a set of incorrect classifications ---
Prediction: 1 Label: 0 Max Weight: 0.171137273311615 Text: 

Prediction: 1 Label: 0 Max Weight: 0.08551198989152908 Text: 

Prediction: 1 Label: 0 Max Weight: 0.060952041298151016 Text: 

Prediction: 0 Label: 1 Max Weight: 0.06618083268404007 Text: 

Prediction: 0 Label: 1 Max Weight: 0.0944661870598793 Text: 

Prediction: 0 Label: 1 Max Weight: 0.08965706825256348 Text: 

Prediction: 1 Label: 0 Max Weight: 0.07010600715875626 Text: 

Prediction: 1 Label: 0 Max Weight: 0.2295592725276947 Text: 

Prediction: 1 Label: 0 Max Weight: 0.14127984642982483 Text: 

Prediction: 1 Label: 0 Max Weight: 0.12827670574188232 Text: 

# BERT

https://github.com/huggingface/transformers/tree/master/examples/distillation .

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

class BERTSST2Dataset(Dataset):
    """
    Dataset of ``[label, text]`` pairs that tokenizes text on-the-fly.

    Items are returned as raw ``(text, label)`` pairs; tokenization is deferred
    to :meth:`collate_fn`, which encodes a whole batch with a single tokenizer
    call. Pass ``collate_fn`` to the ``DataLoader`` that wraps this dataset.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, max_length=max_len, return_tensors='pt')``. Its
            result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        data: Indexable collection of ``[label, text]`` pairs. The label must
            be convertible with ``int()``.
        max_len: Maximum sequence length handed to the tokenizer. The default
            of 50 was assigned based on length analysis of the training set.
    """
    def __init__(self, tokenizer, data, max_len=50):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair at ``index`` with an ``int`` label."""
        row = self.data[index]
        label = int(row[0])
        text = row[1]
        return text, label

    def collate_fn(self, batch_data):
        """Tokenize a list of ``(text, label)`` items into batch tensors.

        Returns:
            A 3-tuple ``(input_ids, attention_mask, labels)`` where ``labels``
            is a ``torch.LongTensor`` reshaped to ``(batch_size, 1)``.
        """
        texts, labels = zip(*batch_data)
        encodings = self.tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return (
            encodings['input_ids'],
            encodings['attention_mask'],
            torch.LongTensor(labels).view(-1, 1),
        )

    def __len__(self):
        """Return the number of ``[label, text]`` pairs."""
        return len(self.data)

class BERT_PL(BaseModel):
    """Lightning module fine-tuning a pretrained transformer as a 2-class classifier.

    Hyperparameters are read from ``self.hparams``: ``model_name``,
    ``data_dir``, ``num_workers``, ``learning_rate`` and, optionally,
    ``optimizer``.
    """

    def __init__(self, *args, **kwargs):
        # BaseModel.__init__ saves the hyperparameters and calls _load_model();
        # the tokenizer is loaded afterwards from the same model name.
        super().__init__(*args, **kwargs)
        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_name)

    def _load_model(self):
        """Load ``hparams.model_name`` with a two-label classification config."""
        model_config = AutoConfig.from_pretrained(
            self.hparams.model_name,
            num_labels=2,
        )
        return AutoModelForSequenceClassification.from_pretrained(
            self.hparams.model_name, config=model_config
        )

    def forward(self, **args):
        """Run the wrapped model and return ``(loss, predicted_labels, [])``.

        ``args`` are forwarded unchanged to the model. The first two model
        outputs are read as loss and logits.
        NOTE(review): this presumably only holds when ``labels`` is among
        ``args`` (as ``batch2input`` provides) -- confirm before calling
        without labels.

        The trailing empty list keeps the 3-tuple shape that the training and
        validation steps unpack.
        """
        outputs = self.model(**args)
        loss, logits = outputs[0], outputs[1]
        predicted_labels = torch.argmax(logits, dim=1)
        return loss, predicted_labels, []

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Build a ``DataLoader`` over ``<data_dir>/sst2.<type_path>``.

        Each non-blank line of the file is split on its first space into a
        ``[label, text]`` pair.

        Args:
            type_path: File suffix selecting the split (the file is
                ``sst2.<type_path>``).
            batch_size: Number of examples per batch.
            shuffle: Whether the loader shuffles the data.

        Returns:
            A ``torch.utils.data.DataLoader`` whose batches are produced by
            ``BERTSST2Dataset.collate_fn``.
        """
        datapath = os.path.join(self.hparams.data_dir, f"sst2.{type_path}")
        # Context manager closes the file handle deterministically. Blank lines
        # are skipped because they cannot yield a [label, text] pair.
        with open(datapath, encoding="utf-8") as data_file:
            data = [
                line.strip().split(" ", maxsplit=1)
                for line in data_file
                if line.strip()
            ]  # list of [label, text] pair
        dataset = BERTSST2Dataset(self.tokenizer, data)

        logger.info(f"Loading {type_path} data and labels from {datapath}")
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.num_workers,
            collate_fn=dataset.collate_fn
        )

        return data_loader

    def configure_optimizers(self):
        """Create the optimizer over the model parameters (no LR schedule).

        ``hparams.optimizer`` equal to ``"sgd"`` (case-insensitive) selects
        ``SGD``; any other value, or a missing one, selects ``Adam``. Both use
        ``hparams.learning_rate``. The optimizer is also stored on ``self.opt``.
        """
        optimizer_name = str(getattr(self.hparams, "optimizer", "adam")).lower()
        optimizer_cls = SGD if optimizer_name == "sgd" else Adam
        optimizer = optimizer_cls(
            self.model.parameters(), lr=self.hparams.learning_rate
        )
        self.opt = optimizer
        return [optimizer]

    def batch2input(self, batch):
        """Map a ``(input_ids, attention_mask, labels)`` batch to model kwargs."""
        return {"input_ids": batch[0], "labels": batch[2], "attention_mask": batch[1]}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        """Register ``--model_name`` and ``--optimizer`` on ``parser``.

        Args:
            parser: ``argparse.ArgumentParser`` to extend.
            root_dir: Unused here; kept for signature compatibility.

        Returns:
            The same ``parser`` instance.
        """
        parser.add_argument(
            "--model_name",
            default=None,
            type=str,
            required=True,
            help="Pretrained model/tokenizer name or path",
        )
        # Not required: the "adam" default was unreachable while the flag was
        # mandatory. Callers that already pass --optimizer are unaffected.
        parser.add_argument(
            "--optimizer",
            default="adam",
            type=str,
            required=False,
            help="Optimizer to use: 'sgd' selects SGD, anything else selects Adam",
        )
        return parser

In [None]:
# Logging setup and imports for the training cell below.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, e.g. if an earlier cell in this notebook configured logging.
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
# Module-level logger; in a notebook cell __name__ is "__main__".
logger = logging.getLogger(__name__)

def main():
    """Fine-tune ``BERT_PL`` using a fixed, notebook-defined argument list.

    Builds the argument parser from ``BaseModel.add_generic_args`` and
    ``BERT_PL.add_model_specific_args``, parses the mock arguments below,
    seeds the RNGs, instantiates the model and hands it to ``generic_train``.
    """
    # Built as a list rather than a whitespace-split string so that a DATA_DIR
    # containing spaces is still passed as a single argument.
    mock_args = [
        "--data_dir", str(DATA_DIR),
        "--output_dir", "bert",
        "--optimizer", "adam",
        "--model_name", "distilbert-base-uncased",  # change model_name here
        "--learning_rate", "0.00005",
        "--max_epochs", "3",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = BERT_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # NOTE(review): `task` does not appear among the parsed arguments, so
        # fall back to a fixed prefix instead of raising AttributeError.
        run_name = getattr(args, "task", "bert")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        # exist_ok avoids a crash if the directory is already present.
        os.makedirs(args.output_dir, exist_ok=True)
    model = BERT_PL(**vars(args))
    generic_train(model, args)


# Run the training pipeline when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main()

02/18/2022 07:33:02 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
02/18/2022 07:33:02 - INFO - __main__ -   Initilazing BaseModel


Namespace(data_dir='./data', do_predict=True, do_train=True, eval_batch_size=32, gpus=1, learning_rate=5e-05, max_epochs=3, model_name='distilbert-base-uncased', num_workers=16, optimizer='adam', output_dir='bert', seed=42, train_batch_size=32)


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

02/18/2022 07:33:25 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
02/18/2022 07:33:25 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
02/18/2022 07:33:25 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
02/18/2022 07:33:25 - INFO - __main__ -   Loading train data and labels from ./data/sst2.train
  cpuset_checked))
02/18/2022 07:33:25 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

02/18/2022 07:33:25 - INFO - __main__ -   Loading dev data and labels from ./data/sst2.dev
02/18/2022 07:33:26 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

02/18/2022 07:34:32 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 216: val_acc reached 0.89794 (best 0.89794), saving model to "/content/bert/version_18-02-2022--07-33-25/checkpoints/epoch=0-val_acc=0.90.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 07:35:44 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 433: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/18/2022 07:36:51 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 650: val_acc was not in top 1
02/18/2022 07:36:52 - INFO - __main__ -   Copy best model from /content/bert/version_18-02-2022--07-33-25/checkpoints/epoch=0-val_acc=0.90.ckpt to bert/version_18-02-2022--07-33-25/checkpoints/best_model.ckpt.
02/18/2022 07:36:55 - INFO - __main__ -   Initilazing BaseModel
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSe

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8857770562171936, 'test_loss': 0.2956537902355194}
--------------------------------------------------------------------------------


In [None]:
%reload_ext  tensorboard
%tensorboard --logdir bert/