In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/END/Week4

/content/drive/MyDrive/END/Week4


In [3]:
import torch
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [5]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root = 'data')

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 22.5MB/s]


In [6]:
import random

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [14]:
len(train_data), len(valid_data), len(test_data)

(17500, 7500, 25000)

In [9]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:29, 2.22MB/s]                           
100%|█████████▉| 399241/400000 [00:17<00:00, 22043.11it/s]

In [404]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

## Reversing the text
We can use torch.flip or list reversal to reverse the text. Lets try it out on one batch of the text. torch.flip is more efficient than the second method shown later


In [405]:
one_batch = list(train_iterator)[0].text[0]
one_batch

tensor([[  11,  314,   11,  ...,    0,   25,  403],
        [ 290,   12,   19,  ...,  251,  333,    2],
        [  13,  282, 2352,  ...,    3, 7151, 1684],
        ...,
        [2606,   40,   16,  ...,    1,    1,    1],
        [ 880,  235,   24,  ...,    1,    1,    1],
        [   4,   88,    4,  ...,    1,    1,    1]], device='cuda:0')

### Method 1: Using torch.flip

In [406]:
torch.flip(one_batch, (0, ))

tensor([[   4,   88,    4,  ...,    1,    1,    1],
        [ 880,  235,   24,  ...,    1,    1,    1],
        [2606,   40,   16,  ...,    1,    1,    1],
        ...,
        [  13,  282, 2352,  ...,    3, 7151, 1684],
        [ 290,   12,   19,  ...,  251,  333,    2],
        [  11,  314,   11,  ...,    0,   25,  403]], device='cuda:0')

### Method 2: Using list reversal using reversed function

**Original text**

In [407]:
" ".join([TEXT.vocab.itos[x] for x in one_batch[:, 0].detach().cpu()])

'I believe that war films should try to convey the terror of war , avoid idealism and respect some rudimentary military principles . <unk> barely does the first . <unk> being a Russian war film , I was expecting patriotism , sentimentality , beautiful poetic pictures , a lush score , Slavic <unk> and cruel Germans . What I did n\'t need was the naive love non - affair , the unrealistically silly war scenes and the abuse of the syrupy soundtrack in a film which avoided carefully all historical or political references ( <unk> , Nazism , Holocaust ) only to end on a passing but nonetheless insulting to our sense of history <unk> about " liberating Poland " . A missed opportunity as a film but not as propaganda apparently .'

**Reversed text**

In [408]:
" ".join(list(reversed([TEXT.vocab.itos[x] for x in one_batch[:, 0].detach().cpu()])))

'. apparently propaganda as not but film a as opportunity missed A . " Poland liberating " about <unk> history of sense our to insulting nonetheless but passing a on end to only ) Holocaust , Nazism , <unk> ( references political or historical all carefully avoided which film a in soundtrack syrupy the of abuse the and scenes war silly unrealistically the , affair - non love naive the was need n\'t did I What . Germans cruel and <unk> Slavic , score lush a , pictures poetic beautiful , sentimentality , patriotism expecting was I , film war Russian a being <unk> . first the does barely <unk> . principles military rudimentary some respect and idealism avoid , war of terror the convey to try should films war that believe I'

**This we have to do individually for each text using a for loop if we are using Method 2. Hence Method 1 is more efficient**

## Model Definition

Here we have to use 3 LSTM layers in a loop instead of using the layer parameter = 3 in the nn.LSTM layer.
This we can do using torch.Modulelist and passing the cell states and hidden states of one LSTM layer to other using a for loop 

In [409]:
import torch.nn as nn

In [458]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.multi_layered_rnn = nn.ModuleList([nn.LSTM(embedding_dim, hidden_dim)])
        self.multi_layered_rnn.extend([nn.Dropout(dropout), nn.LSTM(hidden_dim, hidden_dim)]*(n_layers-1))
        
        
        self.fc = nn.Linear(hidden_dim * 1, output_dim)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, enforce_sorted=False)
        for rnn in self.multi_layered_rnn:
          # print(packed_embedded)
          if not isinstance(rnn, torch.nn.modules.dropout.Dropout): 
            packed_embedded, (hidden, cell) = rnn(packed_embedded)
          else:
            packed_embedded, packed_lengths = nn.utils.rnn.pad_packed_sequence(packed_embedded)
            packed_embedded = nn.utils.rnn.pack_padded_sequence(rnn(packed_embedded), packed_lengths, enforce_sorted=False)
        
        #unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [459]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 3
# BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            DROPOUT, 
            PAD_IDX)

In [460]:
print(model)

RNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (dropout): Dropout(p=0.2, inplace=False)
  (multi_layered_rnn): ModuleList(
    (0): LSTM(100, 256)
    (1): Dropout(p=0.2, inplace=False)
    (2): LSTM(256, 256)
    (3): Dropout(p=0.2, inplace=False)
    (4): LSTM(256, 256)
  )
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [461]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,393,385 trainable parameters


In [462]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [463]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0218, -0.0350,  0.1495,  ..., -0.5508,  0.4874, -0.1341],
        [-0.6345,  0.5735, -0.3945,  ..., -0.4865,  0.1097,  0.5122],
        [ 0.6406,  0.3674,  0.8373,  ..., -0.1512, -0.0284, -0.1952]])

In [464]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0218, -0.0350,  0.1495,  ..., -0.5508,  0.4874, -0.1341],
        [-0.6345,  0.5735, -0.3945,  ..., -0.4865,  0.1097,  0.5122],
        [ 0.6406,  0.3674,  0.8373,  ..., -0.1512, -0.0284, -0.1952]])


In [465]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [466]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [467]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

**In the train function we are reversing the text online as we get the data from the batch. This was it will be less memory intensive**

In [468]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        text_lengths = text_lengths.cpu()
        #Reversing the text
        text = torch.flip(text, (0, ))

        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [469]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            text_lengths = text_lengths.cpu()
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [470]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [471]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 31s
	Train Loss: 0.679 | Train Acc: 57.02%
	 Val. Loss: 0.662 |  Val. Acc: 62.69%
Epoch: 02 | Epoch Time: 0m 32s
	Train Loss: 0.659 | Train Acc: 60.13%
	 Val. Loss: 0.672 |  Val. Acc: 65.51%
Epoch: 03 | Epoch Time: 0m 31s
	Train Loss: 0.506 | Train Acc: 75.81%
	 Val. Loss: 0.362 |  Val. Acc: 84.97%
Epoch: 04 | Epoch Time: 0m 32s
	Train Loss: 0.331 | Train Acc: 86.30%
	 Val. Loss: 0.346 |  Val. Acc: 85.90%
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.293 | Train Acc: 88.18%
	 Val. Loss: 0.314 |  Val. Acc: 87.20%
Epoch: 06 | Epoch Time: 0m 31s
	Train Loss: 0.233 | Train Acc: 91.00%
	 Val. Loss: 0.319 |  Val. Acc: 87.50%
Epoch: 07 | Epoch Time: 0m 32s
	Train Loss: 0.198 | Train Acc: 92.61%
	 Val. Loss: 0.373 |  Val. Acc: 85.64%
Epoch: 08 | Epoch Time: 0m 32s
	Train Loss: 0.180 | Train Acc: 93.31%
	 Val. Loss: 0.409 |  Val. Acc: 85.76%
Epoch: 09 | Epoch Time: 0m 32s
	Train Loss: 0.150 | Train Acc: 94.61%
	 Val. Loss: 0.306 |  Val. Acc: 88.05%
Epoch: 10 | Epoch T

In [472]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.338 | Test Acc: 85.84%


In [473]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    rev_tokenized = list(reversed(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in rev_tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [474]:
sentence = "This film is terrible"
predict_sentiment(model, sentence)

0.0022441146429628134

In [475]:
sentence = "This film is awesome"
predict_sentiment(model, sentence)

0.9864891767501831