# Imports

In [24]:
!pip install datasets transformers --quiet



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import BertTokenizerFast

# Check device


In [26]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device:", device)


Device: cuda


# Load AG News Dataset

In [27]:
# Load the AG News dataset through the `datasets` library; the result is
# indexed by split name ('train' / 'test') in the next cell.
dataset = load_dataset("ag_news")


In [28]:
# Pull the training and test splits out of the loaded dataset.
train_set, test_set = dataset['train'], dataset['test']

In [29]:
# Report how many examples each split holds, then show the first raw
# training record (a dict with 'text' and 'label' entries).
print(f"Train size: {len(train_set)}, Test size: {len(test_set)}")
print("Example:", train_set[0])

Train size: 120000, Test size: 7600
Example: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [30]:
# Display the training split's dimensions as (rows, columns).
train_set.shape

(120000, 2)

In [31]:
# Display the names of the columns available on the training split.
train_set.column_names

['text', 'label']

# Tokenizer

In [32]:
# Instantiate the fast BERT tokenizer from the 'bert-base-uncased' pretrained
# checkpoint. It is kept as a module-level global because `tokenize` reads it.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenization + Padding + Truncation


In [33]:
def tokenize(batch, max_length=20):
    """Tokenize a batch of examples into fixed-length sequences.

    Each text is truncated and padded so that every encoded sequence has
    exactly ``max_length`` entries, which lets the default DataLoader collate
    stack them without a custom collate function.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings, as
            supplied by ``Dataset.map(..., batched=True)``.
        max_length: Target sequence length in tokens. Defaults to 20, the
            value this notebook previously hard-coded. To use another length
            with ``Dataset.map``, pass ``fn_kwargs={'max_length': ...}``.

    Returns:
        The result of calling the module-level ``tokenizer`` on the texts;
        later cells read its ``'input_ids'`` and ``'attention_mask'`` fields.
    """
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

## Map-style dataset

In [34]:
# Apply `tokenize` to each split in batched mode; this adds the tokenizer's
# output fields alongside the original columns.
train_dataset = train_set.map(function=tokenize, batched=True)
test_dataset = test_set.map(function=tokenize, batched=True)


# Set Format for PyTorch

In [35]:
# Expose only the model inputs and the target, as torch tensors, on both splits.
for _split in (train_dataset, test_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# DataLoader


In [36]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 32

# Training data is shuffled; the test data keeps its original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Check a batch


In [37]:
# Pull a single batch from the training loader and display it as a sanity
# check of the collated fields ('label', 'input_ids', ...).
batch = next(iter(train_loader))
batch

{'label': tensor([1, 0, 0, 0, 3, 2, 0, 2, 0, 2, 1, 1, 1, 2, 2, 0, 1, 1, 3, 1, 0, 1, 0, 2,
         2, 1, 0, 0, 0, 2, 1, 0]),
 'input_ids': tensor([[  101, 12281,  2534,  6160,  2015,  3123,  1999, 10637,  2069,  3198,
           2151,  3598,  5470,  2000,  2862,  1996,  4602,  2867,  1999,   102],
         [  101,  2859,  1005,  1055,  2280,  2343, 20613,  4332,  2058,  2197,
           2695,  2000, 15876,  1006,  3010,  2811,  1007,  3010,  2811,   102],
         [  101,  4922,  5222,  2491,  2058,  4001,  2000, 21245,  6218,  2006,
           2373,  2660,  1001,  4464,  1025,  1055,  3539,  2704,  1010,   102],
         [  101, 22129,  3003,  2018,  3449, 13936,  3956,  2005,  2086,  1006,
           9706,  1007,  9706,  1011,  4748,  7229,  2632,  1011,  1043,   102],
         [  101, 18106,  8039,  2188,  2678,  9260, 25975,  5815,  2000,  2049,
          14927,  2005,  3617, 12126,  1998,  2678, 17792,  5130,  1010,   102],
         [  101,  1057,  5910, 23311,  2849,  1997, 10396

In [38]:
# Confirm the batch dimensions: token ids are [batch, seq_len] and labels
# are one value per example, [batch].
print("Input IDs shape:", batch['input_ids'].shape)
print("Labels shape:", batch['label'].shape)

Input IDs shape: torch.Size([32, 20])
Labels shape: torch.Size([32])


# Model

In [39]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Recurrent text classifier: embedding -> RNN -> mean pooling -> linear head.

    Args:
        RNN_type: Recurrent layer class to instantiate. It is called with the
            ``input_size``, ``hidden_size``, ``num_layers``, ``bidirectional``
            and ``batch_first`` keywords, and its forward pass must return an
            ``(outputs, state)`` pair (e.g. ``nn.RNN``, ``nn.GRU``, ``nn.LSTM``).
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (the RNN input size).
        hidden_size: Hidden size of the recurrent layer, per direction.
        num_layers: Number of stacked recurrent layers.
        bidirectional: Whether the recurrent layer runs in both directions.
        num_classes: Number of output logits.
    """

    def __init__(self, RNN_type, vocab_size, embedding_dim, hidden_size, num_layers, bidirectional, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = RNN_type(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)

        # A bidirectional layer concatenates both directions on the feature axis.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, num_classes)

    def forward(self, x, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer token ids of shape [batch, seq_len].
            attention_mask: Optional [batch, seq_len] tensor with 1 for real
                tokens and 0 for padding. When given, padded time steps are
                excluded from the mean pooling; when omitted, every time step
                is averaged (the original behavior).

        Returns:
            Logits of shape [batch, num_classes].
        """
        x = self.embedding(x)  # [batch, seq_len, embed_dim]
        outputs, _ = self.rnn(x)  # [batch, seq_len, hidden*directions]

        if attention_mask is None:
            pooled = outputs.mean(dim=1)
        else:
            # Masked mean: zero out padded steps and divide by the number of
            # real tokens so padding does not dilute the sequence summary.
            # NOTE: in a bidirectional layer the backward direction has still
            # read the pad embeddings before reaching the real tokens.
            mask = attention_mask.unsqueeze(-1).to(outputs.dtype)  # [batch, seq_len, 1]
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
            pooled = (outputs * mask).sum(dim=1) / token_counts

        y = self.fc(pooled)  # [batch, num_classes]
        return y
