<a href="https://colab.research.google.com/github/pxs1990/DeepLearning/blob/main/RNN_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torchtext==0.16.0



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# 1. Sample Data
# Tiny hand-written sentiment corpus: four positive and four negative reviews.
texts = [
    "I love this movie",
    "This film is amazing",
    "Worst movie ever",
    "I hated the film",
    "Best experience watching it",
    "Terrible acting",
    "Fantastic direction",
    "I wouldn't recommend it"
]

# Fit the encoder and map the string sentiments to integer class ids in one step.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(
    ["positive", "positive", "negative", "negative", "positive", "negative", "positive", "negative"]
)

# Word-level tokenizer supplied by torchtext.
print("Tokenizing and building vocabulary...")
tokenizer = get_tokenizer("basic_english")

def tokenize(texts):
    """Lazily yield the token list for each string in ``texts``.

    Every item is passed through the module-level ``tokenizer`` callable.
    """
    yield from map(tokenizer, texts)

# Build the vocabulary from the corpus; tokens not in it map to "<unk>".
vocab = build_vocab_from_iterator(tokenize(texts), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
print(f"Vocab size: {len(vocab)}")

# Convert texts to integer sequences.
# max_len caps how many tokens are kept per text, so the constant is actually applied.
# The dtype is explicit so that an empty text still yields an integer tensor
# (torch.tensor([]) would otherwise default to float).
max_len = 10
tokenized_texts = [
    torch.tensor([vocab[token] for token in tokenizer(txt)[:max_len]], dtype=torch.long)
    for txt in texts
]

# Padding sequences: pad_sequence pads every row up to the longest sequence in the list.
padded_sequences = pad_sequence(tokenized_texts, batch_first=True, padding_value=vocab["<pad>"])
print(f"Padded sequences shape: {padded_sequences.shape}")

# Split data (seeded so the train/test partition is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Convert to PyTorch datasets
class SentimentDataset(Dataset):
    """Map-style dataset pairing each encoded text with its sentiment label."""

    def __init__(self, X, y):
        """Store the features as given and convert the labels to a float tensor."""
        self.y = torch.tensor(y, dtype=torch.float)
        self.X = X

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Wrap the train/test splits in datasets.
train_dataset = SentimentDataset(X_train, y_train)
test_dataset = SentimentDataset(X_test, y_test)

# A dedicated, seeded generator makes the training shuffle order repeatable across runs.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, generator=shuffle_generator)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

print("Data preparation complete!")


Tokenizing and building vocabulary...
Vocab size: 25
Padded sequences shape: torch.Size([8, 6])
Data preparation complete!


In [None]:
# 2. Define RNN Model
class SentimentRNN(nn.Module):
    """Binary sentiment classifier: embedding -> single-layer RNN -> linear -> sigmoid."""

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        """Create the embedding, recurrent, linear and sigmoid layers."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid probabilities for a batch of token-id sequences.

        For a ``(batch, seq_len)`` input the result has shape ``(batch, output_size)``.
        """
        token_vectors = self.embedding(x)
        # nn.RNN returns (all_outputs, final_hidden); only the final hidden state is used.
        final_hidden = self.rnn(token_vectors)[1]
        # Drop the leading num_layers dimension (size 1 for this single-layer RNN).
        scores = self.fc(final_hidden.squeeze(0))
        return self.sigmoid(scores)

# Parameters
# The vocabulary size comes from the torchtext vocab, which already counts the
# "<unk>" and "<pad>" specials, so no +1 is needed. The torchtext tokenizer is a
# plain callable and has no Keras-style `word_index` attribute.
vocab_size = len(vocab)
embed_size = 50
hidden_size = 32
output_size = 1

# Seed so the random weight initialisation is repeatable across runs.
torch.manual_seed(42)
model = SentimentRNN(vocab_size, embed_size, hidden_size, output_size)
criterion = nn.BCELoss()  # expects probabilities; the model already applies a sigmoid
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 3. Train the Model
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        # Drop only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() would also collapse a batch of size 1 to a 0-d tensor,
        # which no longer matches y_batch's shape and makes BCELoss raise.
        predictions = model(X_batch).squeeze(-1)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # The reported value is the sum of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

In [None]:
# 4. Evaluate the Model
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation
    for X_batch, y_batch in test_loader:
        # squeeze(-1) keeps the batch dimension even for a single-sample batch,
        # so predictions always line up element-wise with y_batch.
        predictions = model(X_batch).squeeze(-1)
        # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels.
        predictions = (predictions > 0.5).float()
        correct += (predictions == y_batch).sum().item()
        total += y_batch.size(0)

# Guard against an empty test loader instead of raising ZeroDivisionError.
accuracy = correct / total if total else float("nan")
print(f"Accuracy: {accuracy:.2f}")