<a href="https://colab.research.google.com/github/satyajitghana/TSAI-DeepNLP-END2.0/blob/main/07_Seq2Seq/SST_Redo/SSTModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! nvidia-smi

Thu Jun 24 20:07:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! pip install pytorch-lightning --quiet
! pip install OmegaConf --quiet
! pip install nlpaug --quiet
! pip install gdown==3.13.0
! pip install spacy==3.0.6 --quiet

In [None]:
! pip install torch==1.8.1+cu102 torchtext==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
! python -m spacy download en_core_web_sm

In [None]:
import copy

import torch
import torchtext
import pytorch_lightning as pl

from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from pytorch_lightning.metrics.functional import accuracy

from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, ngrams_func, totensor, vocab_func
from torchtext.vocab import build_vocab_from_iterator

import torchtext.experimental.functional as text_f

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

import random
import gdown

import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from pathlib import Path
from omegaconf import OmegaConf
from zipfile import ZipFile

from typing import Optional, Tuple, Any, Dict, List

import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
plt.style.use("dark_background")

  "`pytorch_lightning.metrics.*` module has been renamed to `torchmetrics.*` and split off to its own package"


In [None]:
class StanfordSentimentTreeBank(Dataset):
    """The Stanford Sentiment Treebank dataset (V1.0) as ``(label, token_ids)`` pairs.

    This is the dataset of the paper:

    Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
    Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts
    Conference on Empirical Methods in Natural Language Processing (EMNLP 2013)

    If you use this dataset in your research, please cite the above paper.

    @incollection{SocherEtAl2013:RNTN,
    title = {{Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank}},
    author = {Richard Socher and Alex Perelygin and Jean Wu and Jason Chuang and Christopher Manning and Andrew Ng and Christopher Potts},
    booktitle = {{EMNLP}},
    year = {2013}
    }
    """

    ORIG_URL = "http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip"
    DATASET_NAME = "StanfordSentimentTreeBank"
    URL = 'https://drive.google.com/uc?id=1urNi0Rtp9XkvkxxeKytjl1WoYNYUEoPI'
    OUTPUT = 'sst_dataset.zip'

    def __init__(self, root, vocab=None, text_transforms=None, label_transforms=None, split='train', ngrams=1, use_augmented_dataset=False):
        """Download (if not cached) and load one split of the dataset.

        Args:
            root: directory in which the dataset archive (``OUTPUT``) is cached.
            vocab: vocabulary used to map tokens to ids. When ``None`` the
                vocabulary is built from the training phrases, whatever
                ``split`` is requested.
            text_transforms: optional extra transform applied to the token
                list after tokenization / n-grams and before vocabulary lookup.
            label_transforms: accepted for interface compatibility; not used.
            split: ``'train'`` or ``'test'``.
            ngrams: n-gram size handed to ``ngrams_func``.
            use_augmented_dataset: read the augmented CSV from the archive
                instead of the cleaned one.

        Raises:
            ValueError: if ``split`` is neither ``'train'`` nor ``'test'``.
        """
        # Zero-argument super(): ``super(self.__class__, self)`` recurses
        # forever as soon as this class gets subclassed.
        super().__init__()

        self.use_augmented = use_augmented_dataset

        if split not in ['train', 'test']:
            raise ValueError(f'split must be either ["train", "test"] unknown split {split}')

        self.vocab = vocab

        archive_path = Path(root) / self.OUTPUT
        gdown.cached_download(self.URL, archive_path)
        self.generate_sst_dataset(split, archive_path)

        tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

        # Sentence-level part of the pipeline: raw string -> tokens -> n-grams.
        # Vocabulary lookup and tensor conversion are appended below.
        self.text_transform = sequential_transforms(tokenizer, text_f.ngrams_func(ngrams))

        def build_vocab(data, transforms):
            def apply_transforms(data):
                for line in data:
                    yield transforms(line)
            return build_vocab_from_iterator(apply_transforms(data), len(data))

        if self.vocab is None:
            # vocab is always built on the train dataset
            self.vocab = build_vocab(self.dataset_train["phrase"], self.text_transform)

        pipeline = [self.text_transform]
        if text_transforms is not None:
            pipeline.append(text_transforms)
        pipeline.extend([text_f.vocab_func(self.vocab), text_f.totensor(dtype=torch.long)])
        self.text_transform = sequential_transforms(*pipeline)

        self.label_transform = sequential_transforms(text_f.totensor(dtype=torch.long))

    def generate_sst_dataset(self, split, dataset_file):
        """Read the CSV from the archive and populate the dataset frames.

        Sets ``dataset_orig`` (the full CSV), ``dataset_train`` (rows whose
        ``splitset_label`` is 1 or 3) and ``dataset`` (the requested split;
        rows with ``splitset_label`` 2 for ``'test'``). The selected frames
        keep only ``phrase`` and ``sentiment_values``.

        Args:
            split: ``'train'`` selects the training frame, anything else the test rows.
            dataset_file: path to the zip archive holding the CSV files.
        """
        if self.use_augmented:
            csv_name = 'sst_dataset/sst_dataset_augmented.csv'
        else:
            csv_name = 'sst_dataset/sst_dataset_cleaned.csv'

        with ZipFile(dataset_file) as datasetzip:
            with datasetzip.open(csv_name) as f:
                dataset = pd.read_csv(f, index_col=0)

        self.dataset_orig = dataset.copy()

        def select(split_ids):
            # Rows of the given split ids, reduced to (phrase, sentiment_values)
            # with a fresh 0..n-1 index so positional and label lookups agree.
            rows = dataset[dataset['splitset_label'].isin(split_ids)]
            return rows[['phrase_cleaned', 'sentiment_values']] \
                .rename(columns={"phrase_cleaned": 'phrase'}) \
                .reset_index(drop=True)

        # presumably 1 = train, 2 = test, 3 = dev in the SST split file — TODO confirm
        self.dataset_train = select([1, 3])

        if split == 'train':
            self.dataset = self.dataset_train.copy()
        else:
            self.dataset = select([2])

    @staticmethod
    def discretize_label(label):
        """Map a sentiment score to a class id 0-4 using 0.2-wide bins (upper bound inclusive).

        Not called anywhere inside this class.
        """
        if label <= 0.2: return 0
        if label <= 0.4: return 1
        if label <= 0.6: return 2
        if label <= 0.8: return 3
        return 4

    def __getitem__(self, idx):
        """Return ``(label, text)`` for row ``idx``, both as ``torch.long`` tensors."""
        text = self.text_transform(self.dataset['phrase'].iloc[idx])
        label = self.label_transform(self.dataset['sentiment_values'].iloc[idx])
        return label, text

    def __len__(self):
        """Number of examples in the selected split."""
        return len(self.dataset)

    @staticmethod
    def get_labels():
        """Human-readable class names, indexed by class id."""
        return ['very negative', 'negative', 'neutral', 'positive', 'very positive']

    def get_vocab(self):
        """Return the vocabulary used to encode tokens."""
        return self.vocab

    @property
    def collator_fn(self):
        """Collate function for ``DataLoader``.

        The returned callable turns a list of ``(label, sequence)`` pairs into
        ``(labels, sequences, lengths)``: labels stacked into one tensor,
        sequences right-padded with the ``<pad>`` index to the longest one
        (batch first), and the original length of every sequence.
        """
        def collate_fn(batch):
            pad_idx = self.get_vocab()['<pad>']

            labels, sequences = zip(*batch)

            labels = torch.stack(labels)

            # Lengths are taken before padding; the model needs them to pack the batch.
            lengths = torch.LongTensor([len(sequence) for sequence in sequences])

            sequences = torch.nn.utils.rnn.pad_sequence(
                sequences,
                padding_value=pad_idx,
                batch_first=True,
            )

            return labels, sequences, lengths

        return collate_fn

In [None]:
class SSTDataModule(pl.LightningDataModule):
    """
    DataModule for the Stanford Sentiment Treebank: builds the train and test
    datasets and exposes their dataloaders.
    """

    name = "stanford_sentiment_treebank"

    def __init__(
        self,
        data_dir: str = '.',
        val_split: int = 1000,
        num_workers: int = 2,
        batch_size: int = 64,
        *args,
        **kwargs,
    ):
        """
        Args:
            data_dir: where to save/load the data
            val_split: number of training examples intended for a validation
                split. Stored but currently unused: no validation subset is
                carved out of the training data.
            num_workers: how many workers to use for loading data
            batch_size: desired batch size.
        """
        super().__init__(*args, **kwargs)

        self.data_dir = data_dir
        self.val_split = val_split
        self.num_workers = num_workers
        self.batch_size = batch_size

        # Placeholders until setup() creates the real datasets.
        self.dataset_train = ...
        self.dataset_val = ...
        self.dataset_test = ...

        self.SST = StanfordSentimentTreeBank

    def prepare_data(self):
        """Instantiate the dataset once so the SST archive is downloaded to `data_dir`."""
        self.SST(self.data_dir)

    def setup(self, stage: Optional[str] = None):
        """Create the train and test datasets (no validation subset is split off)."""
        train_dataset = self.SST(self.data_dir, split='train')
        # Share the training vocabulary with the test split. Without it the
        # test dataset rebuilds an identical vocabulary from the same training
        # phrases, doubling the tokenization work.
        test_dataset = self.SST(self.data_dir, vocab=train_dataset.get_vocab(), split='test')

        self.raw_dataset_train = train_dataset
        self.raw_dataset_test = test_dataset

        self.dataset_train = train_dataset
        self.dataset_test = test_dataset

    def _make_loader(self, dataset, shuffle: bool):
        """Build a DataLoader over `dataset` with this module's shared settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=True,
            collate_fn=self.collator_fn
        )

    def train_dataloader(self):
        """Shuffled loader over the full SST training set."""
        return self._make_loader(self.dataset_train, shuffle=True)

    def val_dataloader(self):
        """Loader used for validation.

        NOTE(review): this iterates the *test* split, so validation metrics
        (and anything selected on them, e.g. checkpoints) see the test data.
        """
        return self._make_loader(self.dataset_test, shuffle=False)

    def test_dataloader(self):
        """Loader over the SST test split."""
        return self._make_loader(self.dataset_test, shuffle=False)

    def get_vocab(self):
        """Vocabulary of the training dataset (available after setup())."""
        return self.raw_dataset_train.get_vocab()

    @property
    def collator_fn(self):
        """Collate function of the training dataset (pads with its `<pad>` index)."""
        return self.raw_dataset_train.collator_fn

In [None]:
class SSTModel(pl.LightningModule):
    """LSTM sentence classifier for the five SST sentiment classes.

    Pipeline: embedding -> (multi-layer) LSTM over the packed batch ->
    projection block on the last layer's final hidden state -> linear
    classifier producing ``output_dim`` logits.
    """

    def __init__(self, hparams, *args, **kwargs):
        """
        Args:
            hparams: configuration providing ``input_dim`` (vocabulary size),
                ``embedding_dim``, ``hidden_dim``, ``num_layers``, ``dropout``,
                ``output_dim`` (number of classes), ``pad_idx`` (vocabulary
                index of the padding token) and ``lr``.
        """
        super().__init__()

        self.save_hyperparameters(hparams)

        self.num_classes = self.hparams.output_dim

        # pad_idx is a *vocabulary* index, so it belongs to the embedding:
        # the padding row stays zero and receives no gradient.
        self.embedding = nn.Embedding(
            self.hparams.input_dim,
            self.hparams.embedding_dim,
            padding_idx=self.hparams.pad_idx,
        )

        self.lstm = nn.LSTM(
            self.hparams.embedding_dim,
            self.hparams.hidden_dim,
            num_layers=self.hparams.num_layers,
            dropout=self.hparams.dropout,
            batch_first=True
        )

        self.proj_layer = nn.Sequential(
            nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim),
            nn.BatchNorm1d(self.hparams.hidden_dim),
            nn.ReLU(),
            nn.Dropout(self.hparams.dropout),
        )

        self.fc = nn.Linear(self.hparams.hidden_dim, self.num_classes)

        # The targets are sentence-level class ids (0..num_classes-1), not
        # token ids. Passing ``ignore_index=pad_idx`` here silently dropped
        # every example whose class id equals the padding token's vocabulary
        # index from the loss, so that class was never trained.
        self.loss = nn.CrossEntropyLoss()

    def init_state(self, sequence_length):
        """Return zero ``(h_0, c_0)`` tensors for the LSTM.

        Each has shape ``[num_layers, sequence_length, hidden_dim]``. nn.LSTM
        expects the second dimension of its state to be the batch size, so
        despite its name ``sequence_length`` should be the batch size (the
        name is kept for backward compatibility).
        """
        shape = (self.hparams.num_layers, sequence_length, self.hparams.hidden_dim)
        return (torch.zeros(*shape, device=self.device),
                torch.zeros(*shape, device=self.device))

    def forward(self, text, text_length, prev_state=None):
        """Classify a padded batch of token-id sequences.

        Args:
            text: ``[batch, seq len]`` token ids, padded (batch first).
            text_length: ``[batch]`` true (unpadded) length of every sequence.
            prev_state: optional ``(h_0, c_0)`` for the LSTM; zeros when ``None``.

        Returns:
            ``(logits, (h_n, c_n))`` with logits of shape ``[batch, num classes]``.
        """
        # [batch, seq len] => [batch, seq len, embedding dim]
        embedded = self.embedding(text)

        # Pack so the LSTM skips padded positions; lengths must live on the CPU.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, text_length.to('cpu'),
            enforce_sorted=False,
            batch_first=True
        )

        # hidden_state / cell_state: [num layers, batch, hidden dim]
        packed_output, curr_state = self.lstm(packed, prev_state)

        hidden_state, cell_state = curr_state

        # Final hidden state of the last layer. Because the input is packed it
        # is taken at each sequence's last real token, not at a padded step.
        # [batch, hidden dim] => [batch, hidden dim]
        output = self.proj_layer(hidden_state[-1])

        # [batch, hidden dim] => [batch, num classes]
        output = self.fc(output)

        return output, curr_state

    def shared_step(self, batch, batch_idx):
        """Compute loss and accuracy for one ``(label, text, text_length)`` batch."""
        label, text, text_length = batch

        logits, in_state = self(text, text_length)

        loss = self.loss(logits, label)

        # softmax is monotonic, so the argmax of the raw logits is the prediction
        pred = torch.argmax(logits, dim=1)
        acc = accuracy(pred, label)

        metric = {'loss': loss, 'acc': acc}

        return metric

    def training_step(self, batch, batch_idx):
        """Run a training batch and log ``train_loss`` / ``train_acc``."""
        metrics = self.shared_step(batch, batch_idx)

        log_metrics = {'train_loss': metrics['loss'], 'train_acc': metrics['acc']}

        self.log_dict(log_metrics, prog_bar=True)

        return metrics

    def validation_step(self, batch, batch_idx):
        """Return loss and accuracy of one validation batch."""
        metrics = self.shared_step(batch, batch_idx)

        return metrics

    def validation_epoch_end(self, outputs):
        """Average the per-batch metrics and log ``val_loss`` / ``val_acc``.

        The mean is taken over batches, so a smaller final batch weighs as
        much as a full one.
        """
        acc = torch.stack([x['acc'] for x in outputs]).mean()
        loss = torch.stack([x['loss'] for x in outputs]).mean()

        log_metrics = {'val_loss': loss, 'val_acc': acc}

        print(f'Epoch: {self.current_epoch}, Test Acc: {acc}, Test Loss: {loss}')

        self.log_dict(log_metrics, prog_bar=True)

        return log_metrics

    def test_step(self, batch, batch_idx):
        """Test batches are evaluated exactly like validation batches."""
        return self.validation_step(batch, batch_idx)

    def test_epoch_end(self, outputs):
        """Log the mean per-batch test accuracy as ``hp_metric`` and ``test_acc``."""
        # Named ``test_acc`` so the imported ``accuracy`` function is not shadowed.
        test_acc = torch.stack([x['acc'] for x in outputs]).mean()

        self.log('hp_metric', test_acc)

        self.log_dict({'test_acc': test_acc}, prog_bar=True)

    def configure_optimizers(self):
        """Adam with a ReduceLROnPlateau scheduler.

        NOTE(review): the scheduler monitors ``train_loss``; monitoring
        ``val_loss`` may be what is wanted — confirm.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, verbose=True),
            'monitor': 'train_loss',
            'name': 'scheduler'
        }
        return [optimizer], [lr_scheduler]


**Sanity Checking**



In [None]:
sst_dataset = SSTDataModule(batch_size=128)
sst_dataset.setup()

File exists: sst_dataset.zip


100%|██████████| 9161/9161 [00:01<00:00, 7989.21lines/s]


File exists: sst_dataset.zip


100%|██████████| 9161/9161 [00:01<00:00, 8032.12lines/s]


In [None]:
# Report the real number of examples: `len(dataloader) * 128` hard-codes the
# batch size and counts the final partial batch as a full one.
f'Train Length: {len(sst_dataset.dataset_train)}, Test Length: {len(sst_dataset.dataset_test)}'

'Train Length: 9216, Test Length: 2176'

In [None]:
loader = sst_dataset.train_dataloader()

In [None]:
batch = next(iter(loader))

In [None]:
label, text, text_length = batch

In [None]:
text.size(0)

128

In [None]:
label.shape, text.shape, text_length.shape

(torch.Size([128]), torch.Size([128, 46]), torch.Size([128]))

In [None]:
text[0]

tensor([   15, 14469,     6, 14677,     5,  1058,    11,  4473,   686,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1])

In [None]:
hparams = OmegaConf.create({
    'input_dim': len(sst_dataset.get_vocab()),
    'embedding_dim': 128,
    'num_layers': 2,
    'hidden_dim': 64,
    'dropout': 0.5,
    'output_dim': len(StanfordSentimentTreeBank.get_labels()),
    'pad_idx': sst_dataset.get_vocab().stoi['<pad>'],
    'lr': 5e-4,
    'epochs': 30,
    'use_lr_finder': False
})

In [None]:
sst_model = SSTModel(hparams)

In [None]:
output, (h, c) = sst_model(text, text_length)

In [None]:
output.shape

torch.Size([128, 5])

In [None]:
sst_model = SSTModel(hparams)

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    save_top_k=3,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

In [None]:
trainer = pl.Trainer(gpus=1, max_epochs=hparams.epochs, callbacks=[lr_monitor, checkpoint_callback], progress_bar_refresh_rate=1, reload_dataloaders_every_epoch=True)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
if hparams.use_lr_finder:
    # Run learning rate finder
    lr_finder = trainer.tuner.lr_find(sst_model, sst_dataset, max_lr=5)

    # Plot with
    fig = lr_finder.plot(suggest=True)
    fig.show()

    # Pick point based on plot, or get suggestion
    new_lr = lr_finder.suggestion()

    print(f'lr finder suggested lr: {new_lr}')

    # update hparams of the model
    sst_model.hparams.lr = new_lr

In [None]:
trainer.fit(sst_model, sst_dataset)

File exists: sst_dataset.zip


100%|██████████| 9161/9161 [00:01<00:00, 8103.13lines/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | embedding  | Embedding        | 2.2 M 
1 | lstm       | LSTM             | 82.9 K
2 | proj_layer | Sequential       | 4.3 K 
3 | fc         | Linear           | 325   
4 | loss       | CrossEntropyLoss | 0     
------------------------------------------------
2.3 M     Trainable params
0         Non-trainable params
2.3 M     Total params
9.307     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Epoch: 0, Test Acc: 0.1328125, Test Loss: 1.6321697235107422


  stream(template_mgs % msg_args)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 0, Test Acc: 0.2280784547328949, Test Loss: 1.4922568798065186


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 1, Test Acc: 0.2326740324497223, Test Loss: 1.4437384605407715


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 2, Test Acc: 0.23666085302829742, Test Loss: 1.422459363937378


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 3, Test Acc: 0.24293950200080872, Test Loss: 1.404987096786499


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 4, Test Acc: 0.249832883477211, Test Loss: 1.395544171333313


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 5, Test Acc: 0.2535093426704407, Test Loss: 1.3994262218475342


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 6, Test Acc: 0.2584153115749359, Test Loss: 1.4065455198287964


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 7, Test Acc: 0.2509012222290039, Test Loss: 1.3632327318191528


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 8, Test Acc: 0.2657622694969177, Test Loss: 1.382830262184143


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 9, Test Acc: 0.2766365110874176, Test Loss: 1.3973437547683716


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 10, Test Acc: 0.2795490324497223, Test Loss: 1.5025354623794556


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 11, Test Acc: 0.29301947355270386, Test Loss: 1.3790276050567627


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 12, Test Acc: 0.2876599431037903, Test Loss: 1.445764422416687


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 13, Test Acc: 0.2718857526779175, Test Loss: 1.649567723274231


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 14, Test Acc: 0.2856665253639221, Test Loss: 1.7272247076034546


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 15, Test Acc: 0.29959654808044434, Test Loss: 1.6855326890945435


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 16, Test Acc: 0.2838282883167267, Test Loss: 1.9708813428878784


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 17, Test Acc: 0.2920944094657898, Test Loss: 1.9789706468582153


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 18, Test Acc: 0.28826871514320374, Test Loss: 2.264657735824585


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 19, Test Acc: 0.28964143991470337, Test Loss: 2.3266751766204834


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 20, Test Acc: 0.29592007398605347, Test Loss: 2.2889909744262695


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 21, Test Acc: 0.2861260771751404, Test Loss: 2.5443098545074463


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 22, Test Acc: 0.2858157455921173, Test Loss: 2.618633985519409


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 23, Test Acc: 0.29700031876564026, Test Loss: 2.751962900161743


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 24, Test Acc: 0.29087090492248535, Test Loss: 2.886303663253784


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 25, Test Acc: 0.2911752760410309, Test Loss: 2.913256883621216


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 26, Test Acc: 0.28995177149772644, Test Loss: 3.2125508785247803


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 27, Test Acc: 0.28367310762405396, Test Loss: 3.3605377674102783


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 28, Test Acc: 0.2810649871826172, Test Loss: 3.2784640789031982


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 29, Test Acc: 0.28428784012794495, Test Loss: 3.468895435333252



In [None]:
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'hp_metric': 0.2509012222290039, 'test_acc': 0.2509012222290039}
--------------------------------------------------------------------------------


[{'hp_metric': 0.2509012222290039, 'test_acc': 0.2509012222290039}]

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## Model Diagnosis

In [None]:
loader = sst_dataset.test_dataloader()
batch = next(iter(loader))

In [None]:
label, text, text_length = batch
label.shape, text.shape, text_length.shape

(torch.Size([128]), torch.Size([128, 46]), torch.Size([128]))

In [None]:
def k_missclassified(batch, model, datamodule, k=10):
    """Print up to ``k`` examples of ``batch`` that ``model`` classifies incorrectly.

    Args:
        batch: ``(label, text, text_length)`` tensors as produced by the dataloader.
        model: called as ``model(text, text_length)``; must return ``(logits, state)``.
            It is switched to eval mode and left there.
        datamodule: provides ``get_vocab()`` (with an ``itos`` list) and
            ``dataset_train.get_labels()`` (class names indexed by class id).
        k: maximum number of examples to print.
    """
    model.eval()
    label, text, text_length = batch
    with torch.no_grad():
        logits, _ = model(text, text_length)

    # softmax is monotonic, so the argmax of the raw logits is the prediction
    pred = torch.argmax(logits, dim=1)

    # Work on CPU copies so this also runs when the batch lives on the GPU.
    label, text, pred = label.cpu(), text.cpu(), pred.cpu()
    wrong = pred != label

    vocab = datamodule.get_vocab()
    class_names = datamodule.dataset_train.get_labels()
    rows = zip(text[wrong][:k].tolist(), label[wrong][:k].tolist(), pred[wrong][:k].tolist())
    for token_ids, true_cls, pred_cls in rows:
        sentence = ' '.join(vocab.itos[x] for x in token_ids).replace(" <pad>", "")
        print('sentence: ', sentence)
        print(f'label: {class_names[true_cls]}, predicted: {class_names[pred_cls]}')
        print('\n')

In [None]:
k_missclassified(batch, sst_model, sst_dataset)

sentence:  If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .
label: positive, predicted: neutral


sentence:  <unk> as something rare , an issue movie that 's so honest and <unk> observed that it does n't feel like one .
label: very positive, predicted: positive


sentence:  Offers that rare combination of entertainment and education .
label: very positive, predicted: positive


sentence:  But he somehow pulls it off .
label: positive, predicted: neutral


sentence:  Take Care of My Cat offers a refreshingly different slice of Asian cinema .
label: positive, predicted: very negative


sentence:  <unk> Wendigo is <unk> why we go to the cinema : to be <unk> through the eye , the heart , the mind .
label: positive, predicted: very positive


sentence:  <unk> if overly talky documentary .
label: neutral, predicted: positive


sentence:  The movie 's ripe , <unk> beauty will <unk> those willing to probe its <unk> mysteries .
label: positive, predicted

In [None]:
def k_correctclassified(batch, model, datamodule, k=10):
    """Print up to ``k`` examples of ``batch`` that ``model`` classifies correctly.

    Args:
        batch: ``(label, text, text_length)`` tensors as produced by the dataloader.
        model: called as ``model(text, text_length)``; must return ``(logits, state)``.
            It is switched to eval mode and left there.
        datamodule: provides ``get_vocab()`` (with an ``itos`` list) and
            ``dataset_train.get_labels()`` (class names indexed by class id).
        k: maximum number of examples to print.
    """
    model.eval()
    label, text, text_length = batch
    with torch.no_grad():
        logits, _ = model(text, text_length)

    # softmax is monotonic, so the argmax of the raw logits is the prediction
    pred = torch.argmax(logits, dim=1)

    # Work on CPU copies so this also runs when the batch lives on the GPU.
    label, text, pred = label.cpu(), text.cpu(), pred.cpu()
    correct = label == pred

    vocab = datamodule.get_vocab()
    class_names = datamodule.dataset_train.get_labels()
    rows = zip(text[correct][:k].tolist(), label[correct][:k].tolist(), pred[correct][:k].tolist())
    for token_ids, true_cls, pred_cls in rows:
        sentence = ' '.join(vocab.itos[x] for x in token_ids).replace(" <pad>", "")
        print('sentence: ', sentence)
        print(f'label: {class_names[true_cls]}, predicted: {class_names[pred_cls]}')
        print('\n')

In [None]:
k_correctclassified(batch, sst_model, sst_dataset)

sentence:  Effective but too - tepid biopic
label: neutral, predicted: neutral


sentence:  The film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .
label: neutral, predicted: neutral


sentence:  Perhaps no picture ever made has more literally <unk> that the road to hell is paved with good intentions .
label: positive, predicted: positive


sentence:  Steers turns in a snappy screenplay that <unk> at the edges ; it 's so clever you want to hate it .
label: positive, predicted: positive


sentence:  This is a film well worth seeing , talking and singing heads and all .
label: very positive, predicted: very positive


sentence:  What really surprises about Wisegirls is its low - key quality and genuine tenderness .
label: positive, predicted: positive


sentence:  One of the greatest family - oriented , fantasy - adventure movies ever .
label: very positive, predicted: very positive


sentence:  Ultimately

## Tensorboard Log: https://tensorboard.dev/experiment/h1GB1XeEQQKDGqTgXqVJgw/#scalars

## Misc Stuff

In [None]:
! ls

lightning_logs	sample_data  sst_dataset.zip


In [None]:
ls lightning_logs/version_0

[0m[01;34mcheckpoints[0m/
events.out.tfevents.1622663662.f8a649598365.58.0
events.out.tfevents.1622664824.f8a649598365.58.1
hparams.yaml


In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
! ls /gdrive/MyDrive/END2.0/05_NLP_Augment/

lightning_logs		   sst_dataset_translated.csv
sst_dataset_augmented.csv  sst_dataset_translated_gsheet.gsheet
sst_dataset_cleaned.csv    sst_dataset.zip
SST_Dataset.ipynb	   SSTModel.ipynb
sst_dataset_synonym.csv


In [None]:
# ! cp -r /gdrive/MyDrive/END2.0/05_NLP_Augment/lightning_logs .

In [None]:
# ! cp -r lightning_logs /gdrive/MyDrive/END2.0/05_NLP_Augment/

In [None]:
# drive.flush_and_unmount()

In [None]:
# ! rm -r lightning_logs

In [None]:
# ! du -sh *

838M	lightning_logs
55M	sample_data
4.9M	sst_dataset.zip


In [None]:
! tensorboard dev upload --logdir lightning_logs \
    --name "END2 07_Seq2Seq Redo of SST Model - Satyajit" \
    --description "SST Dataset No Augmentation, Small Model"
