In [1]:
!pip install portalocker



In [2]:
import torch
torch.manual_seed(1)

import torch.nn as nn

from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

Before we can feed the data into an RNN model, we need to apply several preprocessing steps:
1. Split the training dataset into separate training and validation partitions.
2. Identify the unique words in the training dataset
3. Map each unique word to a unique integer and encode the review text into encoded integers
(an index of each unique word)
4. Divide the dataset into mini-batches as input to the model

In [3]:
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')
test_dataset = list(test_dataset)
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [4]:
valid_dataset

<torch.utils.data.dataset.Subset at 0x7e6c2a78bdf0>

In [5]:
type(list(test_dataset))

list

In [6]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)

    # Find Emoji and replace hyphens
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', '')

    tokenized = text.split()
    return tokenized


token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [7]:
print(token_counts)



In [8]:
# Encoding each token into integers

from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(
    token_counts.items(), key=lambda x: x[1], reverse=True
)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

#print([vocab[token] for token in ['this', 'is', 'an', 'example']])


In [21]:
#  define the functions for transformation


device = torch.device("cuda:0")
#device = 'cpu'
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 2 else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [22]:
## Take a small batch

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
#print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([1., 1., 1., 0.], device='cuda:0')
tensor([165,  86, 218, 145], device='cuda:0')
torch.Size([4, 218])


In [23]:
# batching the datasets

batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [12]:
type(test_dl)

torch.utils.data.dataloader.DataLoader

In [24]:
embedding = nn.Embedding(num_embeddings=10,
                         embedding_dim=3,
                         padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-1.0055, -0.2106, -0.0075],
         [ 1.6734,  0.0103, -0.7040],
         [-0.4610, -0.5601,  0.1196],
         [-0.1631,  0.6614,  1.1899]],

        [[-0.4610, -0.5601,  0.1196],
         [-0.1853, -0.9962, -0.8313],
         [ 1.6734,  0.0103, -0.7040],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [14]:
torch.manual_seed(1)


# Fully connected neural network with one hidden layer
# class RNN(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super().__init__()
#         self.rnn = nn.RNN(input_size,
#                           hidden_size,
#                           num_layers=2,
#                           batch_first=True)
#         #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
#         #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, 1)

#     def forward(self, x):
#         _, hidden = self.rnn(x)
#         out = hidden[-1, :, :]
#         out = self.fc(out)
#         return out

# model = RNN(64, 32)

# print(model)

# model(torch.randn(5, 3, 64))

<torch._C.Generator at 0x7e6cf4521470>

In [25]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)
print(model)

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [26]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
        #print("train loader")
    #print("train")
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
    #return total_acc/500., total_loss/500.


def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


In [27]:

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    #print("epo")
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6024 val_accuracy: 0.5952
Epoch 1 accuracy: 0.7421 val_accuracy: 0.7700
Epoch 2 accuracy: 0.6656 val_accuracy: 0.7596
Epoch 3 accuracy: 0.7850 val_accuracy: 0.7280
Epoch 4 accuracy: 0.8284 val_accuracy: 0.8132
Epoch 5 accuracy: 0.8783 val_accuracy: 0.8426
Epoch 6 accuracy: 0.9082 val_accuracy: 0.8510
Epoch 7 accuracy: 0.9300 val_accuracy: 0.8586
Epoch 8 accuracy: 0.9453 val_accuracy: 0.8622
Epoch 9 accuracy: 0.9462 val_accuracy: 0.8646


In [28]:
torch.save(model,"lstm.ph")

In [29]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8526
