<a href="https://colab.research.google.com/github/prabhsuratsingh/Recurrent-Neural-Networks/blob/master/RNN_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas matplotlib



In [2]:
!pip install torch datasets



In [3]:
import torch
print(torch.__version__)

2.9.0+cu126


In [83]:
from datasets import load_dataset

dataset = load_dataset("imdb")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(train_dataset[0])


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [84]:
from torch.utils.data import random_split
torch.manual_seed(1)

train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000]
)

In [85]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)

    emoticons = re.findall(
        r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()
    )

    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()

    return tokenized

token_counts = Counter()

for item in train_dataset:
    line = item["text"]
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab size : ', len(token_counts))


Vocab size :  69006


In [87]:
class Vocab:
    def __init__(self, token_counts, min_freq=1):
        sorted_tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

        self.pad_token = "<pad>"
        self.unk_token = "<unk>"

        self.itos = [self.pad_token, self.unk_token] + [tok for tok, freq in sorted_tokens if freq >= min_freq]
        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, token):
        """Allow vocab[token] just like torchtext"""
        return self.stoi.get(token, self.stoi[self.unk_token])

    def to_index(self, token):
        return self.stoi.get(token, self.stoi[self.unk_token])

    def to_token(self, idx):
        return self.itos[idx]

vocab = Vocab(token_counts)
print(len(vocab))


69008


In [88]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]


In [89]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: float(x)

In [90]:
def collate_batch(batch):
    text_list = []
    label_list = []
    lengths = []

    for item in batch:
        text = item["text"]
        label = item["label"]

        # convert label to float (0.0 or 1.0)
        label_list.append(float(label))

        # numericalize text
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(token_ids)
        lengths.append(len(token_ids))

    # convert labels and lengths to tensors
    label_list = torch.tensor(label_list, dtype=torch.float32)
    lengths = torch.tensor(lengths)

    # pad sequences
    padded_text = nn.utils.rnn.pad_sequence(
        text_list,
        batch_first=True,
        padding_value=0
    )

    return padded_text, label_list, lengths


In [91]:
from torch.utils.data import DataLoader

dataloader = DataLoader(
    train_dataset ,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_batch
)

In [92]:
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34414,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42944,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [94]:
print(label_batch)

tensor([1., 1., 1., 0.])


In [95]:
print(length_batch)

tensor([165,  86, 218, 145])


In [96]:
print(text_batch.shape)

torch.Size([4, 218])


In [97]:
batch_size =32

train_dl = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)


valid_dl = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)


test_dl = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

In [98]:
from torch import nn

class RNN(nn.Module):
  def __init__(
      self,
      vocab_size,
      embed_dim,
      rnn_hidden_size,
      fc_hidden_size
  ):
    super().__init__()

    self.embedding = nn.Embedding(
        vocab_size,
        embed_dim,
        padding_idx=0
    )
    self.rnn = nn.LSTM(
        embed_dim,
        rnn_hidden_size,
        batch_first=True
    )
    self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(fc_hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, text, lengths):
    out = self.embedding(text)
    out = nn.utils.rnn.pack_padded_sequence(
        out,
        lengths.cpu().numpy(),
        enforce_sorted=False,
        batch_first=True
    )
    out, (hidden, cell) = self.rnn(out)
    out = hidden[-1, :, :]
    out = self.fc1(out)
    out = self.relu(out)
    out = self.fc2(out)
    out = self.sigmoid(out)

    return out

In [99]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)

model = RNN(
    vocab_size,
    embed_dim,
    rnn_hidden_size,
    fc_hidden_size
)
model

RNN(
  (embedding): Embedding(69008, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [100]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [101]:
def train(dataloader):
  model.train()
  total_acc, total_loss = 0, 0
  for text_batch, label_batch, lengths in dataloader:
    optimizer.zero_grad()
    pred = model(text_batch, lengths)[:, 0]
    loss = loss_fn(pred, label_batch)
    loss.backward()
    optimizer.step()
    total_acc += (
        (pred >= 0.5).float() == label_batch
    ).float().sum().item()
    total_loss += loss.item() * label_batch.size(0)

  return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [102]:
def evaluate(dataloader):
  model.eval()
  total_acc, total_loss = 0, 0
  with torch.no_grad():
    for text_batch, label_batch, lengths in dataloader:
      pred = model(text_batch, lengths)[:, 0]
      loss = loss_fn(pred, label_batch)
      total_acc += (
        (pred >= 0.5).float() == label_batch
      ).float().sum().item()
      total_loss += loss.item() * label_batch.size(0)

  return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [None]:
num_epochs = 10
torch.manual_seed(1)

for epoch in range(num_epochs):
  acc_train, loss_train = train(train_dl)
  acc_valid, loss_valid = evaluate(valid_dl)

  print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

In [60]:
acc_test, _ = evaluate(test_dl)
print(f"Test Accuraccy: {acc_test:.4f}")

Test Accuraccy: 1.0000
