<a href="https://colab.research.google.com/github/srirampattabiraman/Extensive-NLP-and-Deep-Learning/blob/main/session4/END2_Session_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torchtext.legacy import data

# Fixed seed so PyTorch's random number generation is repeatable between runs.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenises reviews with spaCy's small English model.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

# LABEL yields float targets.
LABEL = data.LabelField(dtype=torch.float)

In [2]:
from torchtext.legacy import datasets

# Load the IMDB train/test splits, processing review text with TEXT and
# sentiment labels with LABEL.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 55.7MB/s]


In [3]:
# Report how many examples each of the two loaded splits contains.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
# Show the attributes stored on the first training example.
print(vars(train_data.examples[0]))

{'text': ['Before', 'Cujo', ',', 'there', 'was', 'Lucky', 'the', 'devil', 'dog', '.', 'In', '1978,on', 'Halloween', 'night', 'the', 'movie"Devil', 'Dog', ',', 'The', 'Hound', 'of', 'Hell', '"', 'premiered', '.', 'A', 'story', 'of', 'a', 'family', 'getting', 'a', 'new', 'puppy', '(', 'from', 'a', 'farmer', 'who', 'just', 'happen', 'to', 'be', 'in', 'the', 'neighborhood', 'selling', 'fruits', 'and', 'vegetables', ')', 'because', 'their', 'dog', 'Skipper', 'was', 'killed', '.', 'Coencidence', '?', 'Everyone', 'loves', 'the', 'new', 'dog', ',', 'but', 'there', 'is', 'something', 'strange', 'about', 'him', '.', '<', 'br', '/><br', '/>It', 'is', "n't", 'long', 'until', 'the', 'father', 'Mike', 'Barry(Richard', 'Crenna', ',', 'First', 'Blood)starts', 'to', 'notice', '.', 'His', 'wife', 'Betty(Yvette', 'Mimieux', ',', 'Where', 'The', 'Boys', 'Are', ',', 'Jackson', 'County', 'Jail', ',', 'Snowbeast)is', 'different', 'and', 'his', 'kids', 'Charlie', 'and', 'Bonnie(Ike', 'Eisenman', ',', 'Witch',

In [5]:
import random

# Seed Python's RNG first, then hand the seeded state to split().
# random.seed() itself returns None, so its return value must not be used as
# the random_state argument.
# NOTE: this rebinds train_data, so re-running the cell splits the already
# reduced training set again; re-run from the data-loading cell instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [47]:
# Report the size of every split after carving the validation set out of train_data.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [48]:
# Upper bound passed to build_vocab as max_size for the text vocabulary.
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only, so no information
# from the validation or test reviews is used here.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [49]:
# Report the number of entries in each vocabulary.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [50]:
# freqs holds token counts; show the 20 most frequent tokens with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 203063), (',', 192343), ('.', 166129), ('and', 109794), ('a', 109526), ('of', 100808), ('to', 93971), ('is', 76428), ('in', 61581), ('I', 54306), ('it', 53609), ('that', 49177), ('"', 44610), ("'s", 43276), ('this', 42373), ('-', 36806), ('/><br', 35659), ('was', 35096), ('as', 30590), ('with', 30113)]


In [10]:
# itos is a list of token strings indexed by their numerical identifiers;
# show the first ten entries.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [12]:
# stoi is a collections.defaultdict mapping token strings to numerical
# identifiers; for LABEL it shows which class maps to which target value.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [96]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build one iterator per split, sharing the batch size and the target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [98]:
import torch.nn as nn

class LSTM(nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> two linear layers.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of raw output values (logits) per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Intermediate 128-unit layer with ReLU, followed by the output projection.
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU()
        )
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, text):
        """Compute one row of logits per sequence in the batch.

        Args:
            text: Integer token indices of shape [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output dim] holding raw (pre-sigmoid)
            outputs of the final linear layer.
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.embedding(text)

        # hidden = [1, batch size, hid dim]
        # For this single-layer, unidirectional LSTM the final hidden state is
        # the output at the last time step, so it is used directly. Comparing
        # the two tensors on every forward pass would only add a host-device
        # synchronisation per batch.
        _, (hidden, _) = self.lstm(embedded)

        return self.fc2(self.fc1(hidden.squeeze(0)))

In [103]:
# Model hyper-parameters. The embedding table needs one row per vocabulary
# entry, and a single output value is produced per review.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = LSTM(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [104]:
def count_parameters(model):
    """Return the total number of elements across all trainable parameters.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Print the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,899,817 trainable parameters


In [117]:
import torch.optim as optim

# Adam over every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [118]:
# Binary cross-entropy computed on raw logits: the model's forward pass
# returns the output of its last linear layer without applying a sigmoid.
criterion = nn.BCEWithLogitsLoss()

In [119]:
# Move the model and the loss module to the selected device (GPU if available).
model = model.to(device)
criterion = criterion.to(device)

In [120]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    `preds` are raw logits; they are passed through a sigmoid and rounded to
    0 or 1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    # Cast the boolean matches to float so they can be summed and divided.
    hits = (torch.round(probabilities) == y).float()
    return hits.sum() / len(hits)

In [121]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update `model` in place.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    model.train()

    total_loss = 0
    total_acc = 0

    for batch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The model returns one column per output unit; drop that axis so the
        # logits line up with the label vector.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    num_batches = len(iterator)
    return total_loss / num_batches, total_acc / num_batches

In [122]:
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating its weights.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss_sum += criterion(logits, batch.label).item()
            acc_sum += binary_accuracy(logits, batch.label).item()

    num_batches = len(iterator)
    return loss_sum / num_batches, acc_sum / num_batches

In [123]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds) as a pair of ints.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [124]:
# Number of full passes over the training iterator.
N_EPOCHS = 30

# Lowest validation loss seen so far; starts at +inf so the first epoch always
# writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # One pass of weight updates, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the best validation loss; the
    # file is overwritten each time the loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 28s
	Train Loss: 0.694 | Train Acc: 49.89%
	 Val. Loss: 0.693 |  Val. Acc: 49.71%
Epoch: 02 | Epoch Time: 0m 28s
	Train Loss: 0.693 | Train Acc: 49.94%
	 Val. Loss: 0.694 |  Val. Acc: 49.72%
Epoch: 03 | Epoch Time: 0m 29s
	Train Loss: 0.693 | Train Acc: 49.97%
	 Val. Loss: 0.693 |  Val. Acc: 49.19%
Epoch: 04 | Epoch Time: 0m 29s
	Train Loss: 0.693 | Train Acc: 50.25%
	 Val. Loss: 0.702 |  Val. Acc: 51.58%
Epoch: 05 | Epoch Time: 0m 28s
	Train Loss: 0.692 | Train Acc: 50.50%
	 Val. Loss: 0.697 |  Val. Acc: 51.73%
Epoch: 06 | Epoch Time: 0m 29s
	Train Loss: 0.691 | Train Acc: 50.51%
	 Val. Loss: 0.698 |  Val. Acc: 52.36%
Epoch: 07 | Epoch Time: 0m 29s
	Train Loss: 0.689 | Train Acc: 50.48%
	 Val. Loss: 0.731 |  Val. Acc: 52.94%
Epoch: 08 | Epoch Time: 0m 28s
	Train Loss: 0.689 | Train Acc: 50.81%
	 Val. Loss: 0.720 |  Val. Acc: 51.97%
Epoch: 09 | Epoch Time: 0m 29s
	Train Loss: 0.688 | Train Acc: 50.47%
	 Val. Loss: 0.714 |  Val. Acc: 52.36%
Epoch: 10 | Epoch T

In [125]:
 model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.629 | Test Acc: 75.30%


Observations:

1. With just an LSTM layer followed by fully connected layers, accuracy saturated at about 50% regardless of the number of epochs.
Results below:
====
  Epoch: 27 | Epoch Time: 0m 28s
	  Train Loss: 0.693 | Train Acc: 50.36%
	  Val. Loss: 0.693 |  Val. Acc: 50.17%
  
  Epoch: 28 | Epoch Time: 0m 28s
	  Train Loss: 0.693 | Train Acc: 50.36%
	  Val. Loss: 0.693 |  Val. Acc: 50.20%
  
  Epoch: 29 | Epoch Time: 0m 28s
	  Train Loss: 0.693 | Train Acc: 50.44%
	  Val. Loss: 0.693 |  Val. Acc: 50.16%
  
  Epoch: 30 | Epoch Time: 0m 28s
	  Train Loss: 0.693 | Train Acc: 50.40%
	  Val. Loss: 0.693 |  Val. Acc: 50.16%

2. Hence the optimizer was changed from SGD to Adam, which uses an adaptive learning rate; the loss then decreases over the epochs and a considerable increase in validation accuracy is observed.

3. Even though training accuracy improved after changing the activation function, test accuracy is much lower. Hence the model is overfitting (low bias and high variance).