<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Tutorials/Natural%20Language%20Processing/PyTorch%20Sentimental%20Analysis/BiDirectional_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

In [0]:
# Workaround for SSL certificate errors on HTTPS downloads (presumably the
# dataset / word-vector downloads further down -- confirm it is still needed).
# WARNING: this swaps the stdlib's default HTTPS context factory for the
# unverified one, so certificate checks are skipped for stdlib HTTPS requests
# made after this cell runs.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Seed every RNG this notebook relies on so a Restart & Run All is repeatable.
SEED = 1234
random.seed(SEED)  # Python's RNG (used when shuffling / splitting the dataset)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fe373329d50>

In [0]:
# include_lengths=True makes every batch.text a (padded_tokens, lengths) pair.
# The training loop unpacks exactly that pair, and the model needs the lengths
# to pack the padded sequences before the LSTM.
TEXT = data.Field(tokenize='spacy', include_lengths=True)
# Labels are cast to float because the loss used below (BCEWithLogitsLoss)
# expects float targets.
LABEL = data.LabelField(dtype=torch.float)

In [0]:
# Load the IMDB train / test splits, processed with the fields defined above.
trainset, testset = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [0]:
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding. Seed explicitly, then hand the resulting
# RNG state to split(), which is what random_state is documented to accept.
random.seed(SEED)
traindata, validdata = trainset.split(random_state=random.getstate())

In [18]:
MAX_VOCAB_SIZE = 25000

# Both vocabularies are built from the training split only.
# The text vocabulary is capped at MAX_VOCAB_SIZE tokens and loads the
# 'glove.6B.100d' vectors; unk_init supplies the initialiser for tokens that
# have no pretrained vector.
TEXT.build_vocab(
    traindata,
    max_size=MAX_VOCAB_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(traindata)

.vector_cache/glove.6B.zip: 862MB [00:44, 19.5MB/s]                           
100%|█████████▉| 398834/400000 [00:19<00:00, 21102.44it/s]

'Linux'

In [19]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [0]:
BATCH_SIZE = 64

# One bucketed iterator per split, all yielding batches on `device`.
# sort_within_batch=True orders the examples inside each batch by the
# dataset's sort key, which packing the padded sequences in the model relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (traindata, validdata, testset),
    device=device,
    sort_within_batch=True,
    batch_size=BATCH_SIZE)

In [0]:
class Model(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (the LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used between LSTM layers and on the
            final hidden state before the linear layer.
        bidirectional: whether the LSTM also reads the sequence backwards.
        pad_idx: vocabulary index of the padding token, passed to
            nn.Embedding as padding_idx.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, bidirectional, pad_idx):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout)
        # A bidirectional LSTM yields one final hidden state per direction, so
        # the classifier input is twice as wide only in that case.
        fc_in_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per example.

        Args:
            text: token indices shaped [seq_len, batch] (the LSTM is not
                batch_first), padded with pad_idx.
            text_lengths: true length of every sequence in the batch, sorted
                in decreasing order (pack_padded_sequence's default
                enforce_sorted=True requires this).

        Returns:
            Tensor shaped [batch, output_dim].
        """
        embedded = self.embedding(text)

        # pack_padded_sequence wants the lengths on the CPU (or as a list).
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()

        # Pack the *embeddings* (not the raw indices) so the LSTM skips padding.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        _, (hidden, _) = self.lstm(packed_embedded)

        # hidden is [n_layers * n_directions, batch, hidden_dim]. The last two
        # entries are the top layer's forward and backward final states.
        if self.lstm.bidirectional:
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]

        return self.fc(self.dropout(final_hidden))






In [0]:
# Model configuration

INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # width of the 'glove.6B.100d' vectors loaded into the vocab
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = Model(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    bidirectional=BIDIRECTIONAL,
    pad_idx=PAD_IDX,
)



In [0]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose requires_grad flag is False are left out of the count.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [44]:
# Number of trainable parameters in the freshly built model.
count_parameters(model)

4810857

In [46]:
# Vectors attached to the text vocabulary when it was built; the shape is
# displayed to check it against the model's embedding table.
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([25002, 100])

In [47]:
# Overwrite the embedding layer's weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-1.4654,  0.0499,  0.4693,  ..., -0.1296,  1.3465,  0.0561],
        [-0.3302,  0.5339,  0.7318,  ...,  0.0163,  0.1366,  0.3507],
        [ 0.3335,  0.0274,  0.1259,  ..., -0.0510,  0.3407,  0.6567]])

In [0]:
# Zero the embedding rows of the <unk> and <pad> tokens so that neither starts
# out carrying any signal for the sentiment classifier.
UNK_TOKEN_LOCATION = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[[UNK_TOKEN_LOCATION, PAD_IDX]] = 0.0

In [52]:
# Display the embedding table to confirm the <unk> / <pad> rows are now zero.
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-1.4654,  0.0499,  0.4693,  ..., -0.1296,  1.3465,  0.0561],
        [-0.3302,  0.5339,  0.7318,  ...,  0.0163,  0.1366,  0.3507],
        [ 0.3335,  0.0274,  0.1259,  ..., -0.0510,  0.3407,  0.6567]])

In [0]:
# Adam with its default hyperparameters over every parameter of the model.
optimizer = optim.Adam(model.parameters())

In [0]:
# Binary cross-entropy computed directly on raw logits (the sigmoid is applied
# inside the loss). Both the loss module and the model are placed on `device`.
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [0]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    `preds` holds raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with `y`. The result is a tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    matches = predicted_labels.eq(y).float()
    return matches.sum() / len(matches)

In [0]:
# Training the Model 

def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Each batch must expose `.text` as a (tokens, lengths) pair and `.label`
    as the target tensor.

    Args:
        model: module called as model(text, text_length).
        iterator: sized iterable of batches.
        optimizer: optimizer updating the model's parameters.
        criterion: loss called as criterion(prediction, label).

    Returns:
        (mean loss, mean accuracy) over the batches, as Python floats.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    # Put the model into train mode and enable dropout
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, text_length = batch.text
        prediction = model(text, text_length).squeeze(1)
        loss = criterion(prediction, batch.label)
        acc = binary_accuracy(prediction, batch.label)

        loss.backward()
        optimizer.step()

        # .item() detaches the scalars so the running totals do not keep every
        # batch's autograd graph alive.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    n_batches = len(iterator)
    return epoch_loss / n_batches, epoch_acc / n_batches



In [0]:
# Evaluating Function 

