In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm import tqdm
from transformers import BertTokenizer
from datasets import load_dataset
from sklearn.metrics import classification_report
import numpy as np
import gensim.downloader as api
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Check if CUDA is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print("Loading AG News Dataset...")
dataset = load_dataset('ag_news')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(text):
    return tokenizer(text, padding='max_length', truncation=True, return_tensors='pt', max_length=128)

print("Loading Word2Vec Embeddings...")
word2vec = api.load('word2vec-google-news-300')

def get_word2vec_embeddings(tokens):
    embeddings = []
    for token in tokens:
        if token in word2vec:
            embeddings.append(word2vec[token])
        else:
            embeddings.append(np.zeros(word2vec.vector_size))
    return torch.tensor(embeddings, dtype=torch.float32)

class AGNewsDataset(data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        tokens = tokenizer.tokenize(self.texts[idx])
        embeddings = get_word2vec_embeddings(tokens)
        label = self.labels[idx]
        return embeddings, torch.tensor(label, dtype=torch.long)

print("Preparing datasets and dataloaders...")
train_texts = [item['text'] for item in dataset['train']]
train_labels = [item['label'] for item in dataset['train']]
test_texts = [item['text'] for item in dataset['test']]
test_labels = [item['label'] for item in dataset['test']]

train_dataset = AGNewsDataset(train_texts, train_labels)
test_dataset = AGNewsDataset(test_texts, test_labels)

def collate_fn(batch):
    embeddings = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    lengths = [len(seq) for seq in embeddings]
    padded_embeddings = pad_sequence(embeddings, batch_first=True)
    return padded_embeddings, torch.tensor(labels), torch.tensor(lengths)

train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

class SimpleRNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(SimpleRNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        packed_input = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_input, h0)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        out = self.fc(output[:, -1, :])
        return out

input_size = word2vec.vector_size
hidden_size = 128
num_layers = 2
num_classes = 4

model = SimpleRNNModel(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training and Evaluation
num_epochs = 10
best_accuracy = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for inputs, labels, lengths in tqdm(train_loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    train_accuracy = 100 * correct / total
    
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels, lengths in tqdm(test_loader, desc="Testing"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, lengths)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_accuracy = 100 * correct / total
    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%')
    
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'best_model.pt')

# Load the best model
print("Loading best model...")
model.load_state_dict(torch.load('best_model.pt'))
model.to(device)

# Generate Classification Report
print("Generating classification report...")
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for inputs, labels, lengths in tqdm(test_loader, desc="Evaluating"):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs, lengths)
        _, predicted = torch.max(outputs.data, 1)
        y_true.extend(labels.tolist())
        y_pred.extend(predicted.tolist())

print(classification_report(y_true, y_pred, target_names=dataset['test'].features['label'].names))


Using device: cuda
Loading AG News Dataset...
Loading Word2Vec Embeddings...
Preparing datasets and dataloaders...
Epoch 1/10


  return torch.tensor(embeddings, dtype=torch.float32)
Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1875/1875 [06:18<00:00,  4.95it/s]
Testing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:21<00:00,  5.58it/s]


Epoch [1/10], Train Loss: 2591.5046, Train Accuracy: 25.54%, Test Accuracy: 25.36%
Epoch 2/10


Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1875/1875 [06:18<00:00,  4.95it/s]
Testing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:21<00:00,  5.54it/s]


Epoch [2/10], Train Loss: 2586.9244, Train Accuracy: 25.91%, Test Accuracy: 25.80%
Epoch 3/10


Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1875/1875 [06:17<00:00,  4.96it/s]
Testing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:21<00:00,  5.56it/s]


Epoch [3/10], Train Loss: 2585.4294, Train Accuracy: 25.81%, Test Accuracy: 25.75%
Epoch 4/10


Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1875/1875 [06:15<00:00,  4.99it/s]
Testing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:21<00:00,  5.58it/s]


Epoch [4/10], Train Loss: 2584.2476, Train Accuracy: 25.94%, Test Accuracy: 25.80%
Epoch 5/10


Training:  40%|████████████████████████████████████████████████▌                                                                         | 746/1875 [02:30<03:47,  4.96it/s]


KeyboardInterrupt: 

In [None]:
!

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gensim.downloader as api
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

print("Loading Word2Vec embeddings...")
word2vec = api.load('word2vec-google-news-300')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load AG_NEWS dataset using Hugging Face datasets library
print("Loading AG_NEWS dataset...")
dataset = load_dataset('ag_news')

# Tokenizer function
def tokenize(text):
    return text.lower().split()

# Build vocabulary
def build_vocab(texts, tokenizer):
    vocab = {"<unk>": 0}
    for text in texts:
        for token in tokenizer(text):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

# Custom Dataset class
class AGNewsDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = [self.vocab[token] if token in self.vocab else self.vocab["<unk>"]
                          for token in self.tokenizer(text)]
        return torch.tensor(tokenized_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(_label)
        text_list.append(_text)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    label_list = torch.tensor(label_list, dtype=torch.long)
    return text_list.to(device), label_list.to(device)

# Extract texts and labels
train_texts = [item['text'] for item in dataset['train']]
train_labels = [item['label'] for item in dataset['train']]
test_texts = [item['text'] for item in dataset['test']]
test_labels = [item['label'] for item in dataset['test']]

# Build vocabulary from training texts
print("Building vocabulary...")
vocab = build_vocab(train_texts, tokenize)

# Create Dataset and DataLoader
train_dataset = AGNewsDataset(train_texts, train_labels, vocab, tokenize)
test_dataset = AGNewsDataset(test_texts, test_labels, vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Create embedding matrix
print("Creating embeddings matrix...")
embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, idx in vocab.items():
    if word in word2vec:
        embedding_matrix[idx] = word2vec[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32).to(device)

# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, drop_prob=0.5):
        super(LSTMClassifier, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (ht, ct) = self.lstm(x)
        out = self.dropout(ht[-1])
        out = self.fc(out)
        return out

hidden_dim = 256
output_dim = 4  # AG_NEWS has 4 classes
n_layers = 2
model = LSTMClassifier(embedding_matrix, hidden_dim, output_dim, n_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
print("Starting training...")
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for idx, (text, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if idx % 100 == 0:
            print(f'Epoch: {epoch+1}, Batch: {idx}, Loss: {loss.item()}')
    print(f'Epoch: {epoch+1}, Average Loss: {total_loss/len(train_loader)}')

    # Evaluate the model
    print("Evaluating the model...")
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (text, labels) in enumerate(test_loader):
            output = model(text)
            _, predicted = torch.max(output, 1)
            total_acc += (predicted == labels).sum().item()
            total_count += labels.size(0)
        print(f'Test Accuracy: {total_acc/total_count:.4f}')


Loading Word2Vec embeddings...
Using device: cuda
Loading AG_NEWS dataset...
Building vocabulary...
Creating embeddings matrix...
Starting training...
Epoch: 1, Batch: 0, Loss: 1.3962640762329102
Epoch: 1, Batch: 100, Loss: 1.3789664506912231
Epoch: 1, Batch: 200, Loss: 1.3892250061035156
Epoch: 1, Batch: 300, Loss: 1.3963209390640259
Epoch: 1, Batch: 400, Loss: 1.3627750873565674
Epoch: 1, Batch: 500, Loss: 1.2390058040618896
Epoch: 1, Batch: 600, Loss: 1.2338005304336548
Epoch: 1, Batch: 700, Loss: 1.2921500205993652
Epoch: 1, Batch: 800, Loss: 1.2750242948532104
Epoch: 1, Batch: 900, Loss: 1.2657489776611328
Epoch: 1, Batch: 1000, Loss: 1.27080500125885
Epoch: 1, Batch: 1100, Loss: 1.2660959959030151
Epoch: 1, Batch: 1200, Loss: 1.3710660934448242
Epoch: 1, Batch: 1300, Loss: 1.3817782402038574
Epoch: 1, Batch: 1400, Loss: 1.3726297616958618
Epoch: 1, Batch: 1500, Loss: 1.3053982257843018
Epoch: 1, Batch: 1600, Loss: 1.4326967000961304
Epoch: 1, Batch: 1700, Loss: 1.3224880695343018

In [11]:
text[0]

tensor([ 17872, 125152,  18559,  15120, 122869, 122868,  14556,   9574,     15,
           208,  18073,   9174,     39,      7,    465,   2379,     93,   2157,
             7,  17876,  17872,     93,     88, 158508,   2268,     62,      7,
          7767, 107695,    380,  11381,    736,   1774,  68858,   7254,     93,
            97,   6492,   1878,    495,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0], device='cuda:0')

In [19]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers, drop_prob=0.5):
        super(LSTMClassifier, self).__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        print(x.shape)
        lstm_out, (ht, ct) = self.lstm(x)
        out = self.dropout(ht[-1])
        out = self.fc(out)
        return out

In [20]:
m1 = LSTMClassifier(embedding_matrix, hidden_dim, output_dim, n_layers).to(device)

In [21]:
m1(text)

torch.Size([16, 66, 300])


tensor([[ 0.0170, -0.0626,  0.0580,  0.0675],
        [ 0.0302, -0.0568,  0.0816,  0.0866],
        [ 0.0496,  0.0877,  0.0744,  0.1482],
        [ 0.0785,  0.0370, -0.0112,  0.0976],
        [-0.0072, -0.0847,  0.0542,  0.0955],
        [ 0.0820, -0.0085,  0.0478,  0.1129],
        [ 0.0838,  0.0043, -0.0123,  0.0477],
        [ 0.0580, -0.0325, -0.0113,  0.0906],
        [ 0.0537, -0.0645,  0.0122,  0.1611],
        [ 0.0893, -0.0263,  0.0266,  0.1330],
        [ 0.0778, -0.0523, -0.0048,  0.0621],
        [ 0.1100, -0.0733, -0.0012,  0.1339],
        [ 0.0419, -0.0561,  0.0796,  0.0509],
        [ 0.0657, -0.0634,  0.0956,  0.1878],
        [ 0.0790, -0.0569,  0.0845,  0.1133],
        [ 0.0727, -0.0475,  0.0944,  0.1565]], device='cuda:0',
       grad_fn=<AddmmBackward0>)