# RNN for Text Classification

In this task, we will use torchtext for text processing and an LSTM model for text classification.

In [None]:
# Environment setup: remove the preinstalled PyTorch stack, then reinstall torch/torchdata and torchtext.
# NOTE(review): none of the installs below pin a version, so re-running later may resolve different releases.
!pip uninstall -y torch torchdata torchvision torchtext torchaudio fastai
!pip install portalocker
# --pre pulls pre-release builds of torch/torchdata from the nightly CPU wheel index.
!pip install --pre torch torchdata -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
!pip install torchtext

Found existing installation: torch 1.13.1+cu116
Uninstalling torch-1.13.1+cu116:
  Successfully uninstalled torch-1.13.1+cu116
[0mFound existing installation: torchvision 0.14.1+cu116
Uninstalling torchvision-0.14.1+cu116:
  Successfully uninstalled torchvision-0.14.1+cu116
Found existing installation: torchtext 0.14.1
Uninstalling torchtext-0.14.1:
  Successfully uninstalled torchtext-0.14.1
Found existing installation: torchaudio 0.13.1+cu116
Uninstalling torchaudio-0.13.1+cu116:
  Successfully uninstalled torchaudio-0.13.1+cu116
Found existing installation: fastai 2.7.11
Uninstalling fastai-2.7.11:
  Successfully uninstalled fastai-2.7.11
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext
  Downloading torchtext-0.15.1-cp39-cp39-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.0.0
  Downloading torch-2.0.0-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchdata==0.6.0
  Downloading torchdata-0.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting triton==2.0.0
  Downloading triton-2.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m19.4 MB/s[0m eta 

In [None]:
import torch
# AG_NEWS is a news classification dataset with 4 labels
# 1 : World 2 : Sports 3 : Business 4 : Sci/Tec
from torchtext.datasets import AG_NEWS


In [None]:
# Each AG_NEWS example is a (label, text) tuple; display the first training example.
train_iter = iter(AG_NEWS(split='train'))
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [None]:
# Reproducibility settings: a fixed seed for PyTorch's RNG plus the cuDNN
# flags that disable auto-tuning and request deterministic behaviour.
SEED = 42

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

## Prepare data processing pipelines
Tokenize the text and build a vocabulary that includes the special symbols `<unk>` and `<pad>`.

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's 'basic_english' tokenizer; the sample output later in this notebook
# shows it lower-casing the text and splitting punctuation into separate tokens.
tokenizer = get_tokenizer('basic_english')
# Training split only: the vocabulary below is built from it.
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Lazily produce the token list of every (label, text) pair in ``data_iter``."""
    yield from (tokenizer(raw_text) for _label, raw_text in data_iter)

# The two special symbols are passed explicitly; the indices printed in this
# notebook show <unk> -> 0 and <pad> -> 1.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Use <unk>'s index as the vocabulary's default index
# (presumably what out-of-vocabulary tokens resolve to — confirm in the torchtext docs).
vocab.set_default_index(vocab["<unk>"])

In [None]:
# Calling the vocabulary on a list of tokens returns the matching list of integer indices.
print(vocab(['here', 'is', 'an', 'example']))
# Index of the unknown-token symbol
print('unk', vocab['<unk>'])
# Index of the padding symbol
print('pad', vocab['<pad>'])

[476, 22, 31, 5298]
unk 0
pad 1


In [None]:
# Tokenize the first training example to inspect what the tokenizer produces.
train_iter = iter(AG_NEWS(split='train'))
label, text = next(train_iter)
tokenizer(text)

['wall',
 'st',
 '.',
 'bears',
 'claw',
 'back',
 'into',
 'the',
 'black',
 '(',
 'reuters',
 ')',
 'reuters',
 '-',
 'short-sellers',
 ',',
 'wall',
 'street',
 "'",
 's',
 'dwindling\\band',
 'of',
 'ultra-cynics',
 ',',
 'are',
 'seeing',
 'green',
 'again',
 '.']

In [None]:
# Text and label processing pipelines built on the tokenizer and the vocabulary.
# Both turn raw dataset fields into the integers the model works with.
def text_pipeline(x):
    """Tokenize the raw string ``x`` and look every token up in the vocabulary."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based label into a 0-based class index."""
    return int(x) - 1


## Generate data batch and iterator
We use the PyTorch `DataLoader` to generate batches.
The `collate_batch` function processes each batch before it is sent to the model.

In [None]:
from torch.utils.data import DataLoader
import numpy as np
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device', device)

def collate_batch(batch):
    """Turn a list of raw ``(label, text)`` samples into padded tensors.

    Args:
        batch: sequence of ``(label, text)`` pairs taken from the dataset.

    Returns:
        A tuple ``(labels, padded_texts, lengths)`` where ``labels`` is an
        int64 tensor of shape ``[batch]`` on ``device``, ``padded_texts`` is an
        int64 tensor of shape ``[batch, max_len]`` on ``device`` that is
        right-padded with the ``<pad>`` index, and ``lengths`` is a plain
        Python list with each sample's unpadded token count.
    """
    label_list, text_list, len_list = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        len_list.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # pad_sequence right-pads every sequence to the longest one in the batch,
    # which replaces the hand-written pad-and-stack loop.
    txt_pad_list = torch.nn.utils.rnn.pad_sequence(
        text_list, batch_first=True, padding_value=vocab['<pad>'])
    return label_list.to(device), txt_pad_list.to(device), len_list

# Example: collate the first three training samples and inspect the resulting batch.
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=3, shuffle=False, collate_fn=collate_batch)
for idx, (label, text, length_list) in enumerate(dataloader):
  print('label', label)
  print('text', text)
  print('length', length_list)
  # Only the first batch is needed for the demonstration.
  break

device cpu
label tensor([2, 2, 2])
text tensor([[  432,   426,     2,  1606, 14839,   114,    67,     3,   849,    14,
            28,    15,    28,    16, 50726,     4,   432,   375,    17,    10,
         67508,     7, 52259,     4,    43,  4010,   784,   326,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [15875,  1073,   855,  1311,  4251,    14,    28,    15,    28,    16,
           930,   798,   321, 15875,    99,     4, 27658,    29,     6,  4460,
            12,   565, 52791,     9, 80618,  2126,     8,     3,   526,   242,
             4,    29,  3891, 82815,  6575,    11,   207,   360,     7,     3,
           127,     2],
        [   59,     9,   348,  4583,   152,    17,   739,    14,    28,    15,
            28,    16,  2385,   453,    93,  2060, 27361,     3,   348,     9,
             3,   739,    12,   272,    43,   241, 51954,    39,     3,   295,
           127,   113,    86,   221,     3,

## Define the LSTM model


In [None]:
import torch.nn as nn
class LSTM_net(nn.Module):
  """Embedding -> single-layer (optionally bidirectional) LSTM -> two-layer MLP classifier.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: LSTM hidden size per direction.
    num_class: number of output classes (logits per sample).
    pad_idx: vocabulary index of the padding token, passed to nn.Embedding as padding_idx.
    bidirectional: whether the LSTM runs in both directions.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, pad_idx, bidirectional):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=1,
                       bidirectional=bidirectional, batch_first=True)

    # A bidirectional LSTM contributes one final hidden state per direction.
    out_hid = hidden_dim * 2 if bidirectional else hidden_dim
    self.fc = nn.Sequential(
        nn.Linear(out_hid, out_hid), nn.ReLU(), nn.Linear(out_hid, num_class))

  def forward(self, text, text_lengths):
    """Compute class logits for a padded batch.

    Args:
      text: integer token indices, [batch size, sent len].
      text_lengths: unpadded length of every sequence in the batch.

    Returns:
      Logits of shape [batch size, num_class].
    """
    # text = [batch size, sent len]
    embedded = self.embedding(text)
    # embedded = [batch size, sent len, emb dim]

    # pack sequence to handle sequences with varied length
    packed_embedded = nn.utils.rnn.pack_padded_sequence(
        embedded, text_lengths, batch_first=True, enforce_sorted=False)

    _, (hidden, _) = self.rnn(packed_embedded)
    # utilize the last time step representation to represent the sequence.
    # The direction flag is read from the LSTM itself, not from a notebook
    # global, so the model always agrees with how it was constructed.
    if self.rnn.bidirectional:
      out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
    else:
      out = hidden[-1, :, :]

    return self.fc(out)

In [None]:
# Build the classifier.
# - num_class: number of distinct labels found in the training split.
# - vocab_size: number of entries in the vocabulary.
# - emsize / hidden_dim: embedding dimension (200) and LSTM hidden size (256).
train_iter = AG_NEWS(split='train')
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)
emsize = 200
hidden_dim = 256
bidirectional = False
model = LSTM_net(
    vocab_size, emsize, hidden_dim, num_class,
    vocab['<pad>'], bidirectional=bidirectional,
).to(device)

## Define functions to train the model and evaluate results.

In [None]:
import time

def train(dataloader):
    """Run one training pass over ``dataloader``.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``. The
    global ``epoch`` counter is read only when a progress line is printed.

    Args:
        dataloader: iterable of ``(label, text, lengths)`` batches. ``len()``
            is called on it when a progress line is printed.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, lengths) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, lengths)
        loss = criterion(predicted_label, label)
        loss.backward()
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Accuracy is reported per logging window, so the counters restart here.
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the classification accuracy of the global ``model`` on ``dataloader``.

    The model is switched to eval mode and gradients are disabled. No loss is
    computed because only the accuracy is returned.

    Args:
        dataloader: iterable of ``(label, text, lengths)`` batches.

    Returns:
        Fraction of correctly classified samples, as a float.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, lengths in dataloader:
            predicted_label = model(text, lengths)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

## Split the dataset and run the model

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 1 # number of passes over the training split
LR = 0.001  # learning rate
BATCH_SIZE = 64 # batch size used by the train, validation and test DataLoaders

# define loss function
criterion = torch.nn.CrossEntropyLoss()

# define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# NOTE(review): total_accu is not read anywhere in the visible notebook — confirm before removing.
total_accu = None
train_iter, test_iter = AG_NEWS()

# Convert iterable-style dataset to map-style dataset
# Reason: random_split and DataLoader shuffling work with a map-style dataset, which implements
# the __getitem__() and __len__() protocols and maps indices/keys to data samples.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Keep 95% of the training data for training and hold out the remaining 5% for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# NOTE(review): shuffle=True on the validation/test loaders is not needed — evaluate() only
# counts correct predictions, so batch order does not affect the reported accuracy.
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

# Train for EPOCHS epochs, reporting validation accuracy and wall-clock time after each one.
# The loop variable `epoch` is also read as a global inside train() for its progress lines.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1782 batches | accuracy    0.681
| epoch   1 |  1000/ 1782 batches | accuracy    0.855
| epoch   1 |  1500/ 1782 batches | accuracy    0.879
-----------------------------------------------------------
| end of epoch   1 | time: 1078.96s | valid accuracy    0.898 
-----------------------------------------------------------


## Evaluate the model with test dataset

In [None]:
# Final check: accuracy on the test split, which is not used in the training loop.
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.896


## Test on a random news article


In [None]:
# Human-readable names for the four 1-based AG_NEWS labels.
ag_news_label = dict(enumerate(("World", "Sports", "Business", "Sci/Tec"), start=1))

def predict(text, text_pipeline):
    """Classify a single raw news string with the global ``model``.

    Args:
        text: raw input string.
        text_pipeline: callable mapping a string to a list of token indices.

    Returns:
        The predicted label as a 1-based integer (the key space of ``ag_news_label``).

    Raises:
        ValueError: if ``text_pipeline`` yields no tokens for ``text``.
    """
    token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64)
    if token_ids.numel() == 0:
        raise ValueError("text produced no tokens; cannot run the model on an empty sequence")
    with torch.no_grad():
        # Add a batch dimension: [seq_len] -> [1, seq_len].
        batch = token_ids.unsqueeze(0)
        # The length handed to the model must be the token count, not the batch
        # size; otherwise only the first token would be fed to the LSTM.
        output = model(batch, [token_ids.size(0)])
        return output.argmax(1).item() + 1

# Sample golf report used to sanity-check the trained model on unseen text.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# predict() creates its input tensor without a device argument, so move the model to the CPU to match.
model = model.to("cpu")

# predict() returns a 1-based label, which is the key space of ag_news_label.
print("This is a %s news" %ag_news_label[predict(ex_text_str, text_pipeline)])

This is a Sports news
