# Imports

In [2]:
!pip install datasets transformers --quiet



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import BertTokenizerFast

# Check device


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)


Device: cuda


# Load AG News Dataset

In [5]:
dataset = load_dataset("ag_news")


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/ag_news/resolve/main/README.md (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: bb669261-1793-4d75-b868-52ed73f134a5)')' thrown while requesting HEAD https://huggingface.co/datasets/ag_news/resolve/main/README.md
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/ag_news/resolve/main/README.md (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 8fc09fe7-eb58-497a-800a-67005c6fcef5)')' thrown while requesting HEAD https://huggingface.co/datasets/ag_news/resolve/main/README.md
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool

In [9]:
train_set = dataset['train']
test_set = dataset['test']

In [10]:
print(f"Train size: {len(train_set)}, Test size: {len(test_set)}")
print("Example:", train_set[0])

Train size: 120000, Test size: 7600
Example: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [11]:
train_set.shape

(120000, 2)

In [13]:
train_set.column_names

['text', 'label']

# Tokenizer

In [14]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

'(ProtocolError('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)), '(Request ID: 5eb795d9-0a67-486a-94a7-c15d988f885e)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


# Tokenization + Padding + Truncation


In [15]:
def tokenize(batch):
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=20
    )

## Map_style

In [16]:
train_dataset = train_set.map(tokenize, batched=True)
test_dataset = test_set.map(tokenize, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]


# Set Format for PyTorch

In [17]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# DataLoader


In [18]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Check a batch


In [22]:
batch = next(iter(train_loader))
batch

{'label': tensor([0, 1, 3, 0, 3, 3, 1, 1, 3, 1, 2, 0, 0, 1, 0, 3, 0, 3, 3, 1, 3, 1, 0, 2,
         3, 3, 1, 0, 0, 3, 3, 2]),
 'input_ids': tensor([[  101,  3768,  1997,  3036,  7906,  2833,  2013, 21404,  2015,  2096,
           9652,  7501,  7186,  5694, 17677,  1996,  4534,  1997,  2175,   102],
         [  101,  5229, 29000,  2952,  5451,  2041,  1997,  4440,  2000,  6090,
          25457, 26416, 15710,  3000,  1024, 10792,  1011,  6778,  4754,   102],
         [  101,  2047, 10047,  6305, 15173,  3274,  2046,  4257,  3898,  1996,
           2047, 10047,  6305,  1010,  3005,  3937,  2944,  2003, 21125,   102],
         [  101, 18243, 27942, 12936,  2709,  2000,  1005,  2444,  1999,  3571,
           1005,  1997,  8396,  1047, 10686, 24778,  1010, 10411,  1006,   102],
         [  101,  1060,  2213,  2557, 12106,  2189,  3784,  1996,  2899,  1010,
           5887,  1011,  2241,  5871,  2557,  9224,  2047,  2326,  1011,   102],
         [  101,  7435,  5987,  2440,  7233,  2005,  3021

In [20]:
print("Input IDs shape:", batch['input_ids'].shape)
print("Labels shape:", batch['label'].shape)

Input IDs shape: torch.Size([32, 20])
Labels shape: torch.Size([32])


# Model

In [26]:
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, RNN_type, vocab_size, embedding_dim, hidden_size, num_layers, bidirectional, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = RNN_type(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        
        self.fc = nn.Linear(hidden_size * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        x = self.embedding(x)  # [batch, seq_len, embed_dim]
        outputs, _ = self.rnn(x)  # [batch, seq_len, hidden*directions]
        outputs = outputs.mean(dim=1)  
        y = self.fc(outputs)  # [batch, num_classes]
        return y


# Select Parameters and Models

In [27]:
vocab_size = tokenizer.vocab_size
embedding_dim = 64
hidden_size = 128
num_layers = 1
bidirectional = True
num_classes = 4

model = RNNModel(nn.LSTM, vocab_size, embedding_dim, hidden_size,
                 num_layers, bidirectional, num_classes).to(device)
print(model)


RNNModel(
  (embedding): Embedding(30522, 64)
  (rnn): LSTM(64, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)


# Test One batch

In [28]:
batch = next(iter(train_loader))
inputs = batch['input_ids'].to(device)
outputs = model(inputs)
print("Output shape:", outputs.shape)  # باید [batch_size, num_classes] باشه


Output shape: torch.Size([32, 4])


# Define Loss & Optimizer

In [29]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Training Loop

In [30]:
from tqdm import tqdm

model.train()
for batch in tqdm(train_loader, desc="Training"):
    inputs = batch['input_ids'].to(device)
    labels = batch['label'].to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()


Training: 100%|███████████████████████████████████████████████████████████████████| 3750/3750 [00:18<00:00, 198.28it/s]


# Define Loss & Optimizer

In [31]:
import torch.nn as nn
import torch.optim as optim

# Rnn model or lstm model 
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        x = self.embedding(x)           # [batch, seq_len] -> [batch, seq_len, embed_dim]
        outputs, _ = self.rnn(x)       # [batch, seq_len, hidden*2]
        out = outputs.mean(dim=1)      # average pooling over sequence length
        out = self.fc(out)             # [batch, num_classes]
        return out

# hyperparameters


In [34]:
# hyperparameters
vocab_size = tokenizer.vocab_size
embed_dim = 128
hidden_dim = 128
num_layers = 1
num_classes = 4


# device
model = TextRNN(vocab_size, embed_dim, hidden_dim, num_layers, num_classes).to(device)

# Loss و Optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(model)

TextRNN(
  (embedding): Embedding(30522, 128)
  (rnn): LSTM(128, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)
