## Dataset loading

In [2]:
from torchtext.datasets import AG_NEWS

train_iter, test_iter = AG_NEWS(split=('train', 'test'))
for label, text in train_iter:
    print(label, text)
    break

OSError: /usr/local/lib/python3.11/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch6detail10class_baseC2ERKSsS3_SsRKSt9type_infoS6_

## Since it is showing incompatibility, we must unistall the current versions, and download the compatible versions. torch 2.2.0 and torchtext 0.17.0 In addition, we will need to install portalocker too.

In [2]:
!pip show torch torchtext portalocker

[0mName: torch
Version: 2.6.0+cu124
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-cusparselt-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, fastai, peft, sentence-transformers, timm, torchaudio, torchdata, torchvision


In [3]:
!pip uninstall torch torchtext -y
!pip install torch==2.2.0 torchtext==0.17.0

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
[0mCollecting torch==2.2.0
  Downloading torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12

In [4]:
!pip uninstall -y portalocker
!pip install portalocker

[0mCollecting portalocker
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker
Successfully installed portalocker-3.1.1


In [5]:
from torchtext.datasets import AG_NEWS


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

In [6]:
train_iter, test_iter = AG_NEWS(split=('train', 'test'))
for label, text in train_iter:
    print(label, text)
    break

3 Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


## Preprocessing

In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])




## Text and label pipeline

In [8]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1  # Labels start from 1

## Dataloader

In [9]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_batch(batch):
    label_list, text_list = [], []
    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, batch_first=True)

train_iter, test_iter = AG_NEWS(split=('train', 'test'))
train_dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)

In [14]:
for batch in train_dataloader:
    print(batch[0], batch[1])
    ## convert batch[1] to normal text
    for i in range(len(batch[1])):
        print(vocab.lookup_tokens(batch[1][i].tolist()))
    break

tensor([2, 0, 2, 0, 1, 2, 2, 2]) tensor([[  452,  1241,   268,   945,  4648,    34,  4812,   823,  8846,   452,
            58,  1241,   324,     4,   801,     5,   261,     6,  1487,     5,
           715,    55,    34,     5,   710,  6667,  2022,   135,     5,   266,
          1792,    58,   719,  1694,     8,    19,  1807,  4279,   760,    68,
          1841,    58,  1309,    63,     2,  2767,    17,  6640,    39,    81,
          5506,     7,   549,   394,     1],
        [ 9265,  1864,    58,  1810,     7,   537,    70,   570,   670,   140,
            53,    58,  1810,    29,   537,    70,    39,  5132, 22146,   458,
             5,   234,     6,  2846,   405,    24,  9265,     1,    48,   673,
           147,    29,     2,   211,    58,    54,     3,  4525,    10,  3002,
             6,  9444,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [   51,     1,     9,     1,  1083,     4,   382,  1946,    58,  3563,
        

## Model deifiniton

In [15]:
import torch.nn as nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text).mean(1)  # average pooling
        return self.fc(embedded)

num_class = 4
vocab_size = len(vocab)
embed_dim = 64
model = TextClassificationModel(vocab_size, embed_dim, num_class)

In [16]:
import torch.optim as optim

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=4.0)

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    for labels, texts in dataloader:
        optimizer.zero_grad()
        output = model(texts)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

train(train_dataloader)

In [18]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for labels, texts in dataloader:
            output = model(texts)
            total_acc += (output.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
    return total_acc / total_count

test_dataloader = DataLoader(list(test_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)
test_accuracy = evaluate(test_dataloader)
print(f"Test Accuracy: {test_accuracy}")


Test Accuracy: 0.525


In [19]:
## Training for 5 epochs
## reinitialization
model = TextClassificationModel(vocab_size, embed_dim, num_class)
optimizer = optim.SGD(model.parameters(), lr=4.0)
import time
for epoch in range(5):
    epoch_start_time = time.time()
    train(train_dataloader)
    test_accuracy = evaluate(test_dataloader)
    train_accuracy = evaluate(train_dataloader)
    epoch_end_time = time.time()
    print(f"Epoch {epoch+1}: Train Accuracy: {train_accuracy}, Test Accuracy: {test_accuracy}, Time: {epoch_end_time - epoch_start_time}s")

Epoch 1: Train Accuracy: 0.8015916666666667, Test Accuracy: 0.7988157894736843, Time: 190.35771560668945s
Epoch 2: Train Accuracy: 0.8193083333333333, Test Accuracy: 0.8113157894736842, Time: 185.92827534675598s
Epoch 3: Train Accuracy: 0.8449, Test Accuracy: 0.8373684210526315, Time: 194.4447500705719s
Epoch 4: Train Accuracy: 0.832225, Test Accuracy: 0.8193421052631579, Time: 188.4210057258606s
Epoch 5: Train Accuracy: 0.7413666666666666, Test Accuracy: 0.7393421052631579, Time: 204.13484168052673s
