In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors
import copy
import gensim.downloader as api

# Load the Word2Vec model
word2vec_model = api.load('word2vec-google-news-300')

# Download NLTK data
nltk.download('punkt')

# Load the AG News dataset
dataset = load_dataset("ag_news")

# NLTK Tokenizer Function
def nltk_tokenizer(text):
    return word_tokenize(text.lower())

# Convert to PyTorch tensors
class AGNewsDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = [self.vocab.get(token, self.vocab['<UNK>']) for token in text]
        if len(tokenized_text) < self.max_length:
            tokenized_text += [self.vocab['<PAD>']] * (self.max_length - len(tokenized_text))
        else:
            tokenized_text = tokenized_text[:self.max_length]
        return torch.tensor(tokenized_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Build vocabulary
def build_vocab(dataset, tokenizer):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for example in dataset:
        tokens = tokenizer(example['text'])
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

# Tokenize and build vocab
train_texts = [nltk_tokenizer(example['text']) for example in dataset['train']]
train_labels = [example['label'] for example in dataset['train']]
vocab = build_vocab(dataset['train'], nltk_tokenizer)

# Set max length for padding
max_length = 128

# Create dataset
full_dataset = AGNewsDataset(train_texts, train_labels, vocab, max_length)

# Split training set into training and validation sets
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Prepare test dataset
test_texts = [nltk_tokenizer(example['text']) for example in dataset['test']]
test_labels = [example['label'] for example in dataset['test']]
test_dataset = AGNewsDataset(test_texts, test_labels, vocab, max_length)

# Initialize embedding matrix
def build_embedding_matrix(vocab, word2vec_model, embedding_dim):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        if word in word2vec_model:
            embedding_matrix[idx] = word2vec_model[word]
        else:
            embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Build the embedding matrix
embedding_dim = 300  # Word2Vec uses 300-dimensional vectors
embedding_matrix = build_embedding_matrix(vocab, word2vec_model, embedding_dim)

class DeepLSTMModel(nn.Module):
    def __init__(self, vocab_size, output_dim, embedding_matrix):
        super(DeepLSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.size(1), 256, num_layers=3, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(256, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        return self.fc(hidden[-1])

# Training Parameters
BATCH_SIZE = 64
EPOCHS = 3
OUTPUT_DIM = 4
LR = 0.001
num_iterations = 100



# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model, loss function, and optimizer
model = DeepLSTMModel(len(vocab), OUTPUT_DIM, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_net = optim.Adam(model.parameters(), lr=LR)

# Synthetic data initialization
num_classes = 4
num_synthetic_per_class = 10


[nltk_data] Downloading package punkt to /home/IAIS/rrao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
synthetic_text_data = torch.randint(0, len(vocab), (num_classes * num_synthetic_per_class, max_length), dtype=torch.float, device=device, requires_grad=True)
synthetic_labels = torch.tensor([i for i in range(num_classes) for _ in range(num_synthetic_per_class)], dtype=torch.long, device=device)

# Optimizer for synthetic data
optimizer_syn = optim.SGD([synthetic_text_data], lr=0.01, momentum=0.9)

def compute_gradients(model, inputs, labels):
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    gradients = torch.autograd.grad(loss, model.parameters(), create_graph=True)
    return gradients

def layerwise_matching_loss(gw_syn, gw_real):
    loss = 0
    for g_syn, g_real in zip(gw_syn, gw_real):
        loss += ((g_syn - g_real) ** 2).sum()
    return loss

# Training loop with synthetic data gradient matching
for iteration in range(num_iterations):
    model.train()
    loss_avg = 0

    # Update synthetic data
    for real_inputs, real_labels in train_loader:
        real_inputs, real_labels = real_inputs.to(device), real_labels.to(device)

        # Compute gradients for real data
        gradients_real = compute_gradients(model, real_inputs, real_labels)
        
        # Compute gradients for synthetic data
        synthetic_data_batch = synthetic_text_data[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class].long()
        synthetic_labels_batch = synthetic_labels[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class]
        gradients_synthetic = compute_gradients(model, synthetic_data_batch, synthetic_labels_batch)
        
        # Compute and minimize matching loss
        loss_match = layerwise_matching_loss(gradients_synthetic, gradients_real)
        optimizer_syn.zero_grad()
        loss_match.backward()
        optimizer_syn.step()
        loss_avg += loss_match.item()

    # Print loss every 10 iterations
    if iteration % 10 == 0:
        print(f"Iteration {iteration}, Average Matching Loss: {loss_avg / len(train_loader):.4f}")

# Final evaluation on test set
model.eval()
test_labels = []
test_preds = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader, desc='Evaluating on test set', unit='batch'):
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

overall_accuracy = accuracy_score(test_labels, test_preds)
class_report = classification_report(test_labels, test_preds, target_names=['World', 'Sports', 'Business', 'Sci/Tech'])

print(f'Test Accuracy: {overall_accuracy:.4f}')
print('Classification Report:')
print(class_report)

NotImplementedError: the derivative for '_cudnn_rnn_backward' is not implemented. Double backwards is not supported for CuDNN RNNs due to limitations in the CuDNN API. To run double backwards, please disable the CuDNN backend temporarily while running the forward pass of your RNN. For example: 
with torch.backends.cudnn.flags(enabled=False):
    output = model(inputs)

In [None]:
synthetic_text_data = torch.randint(0, len(vocab), (num_classes * num_synthetic_per_class, max_length), dtype=torch.long, device=device, requires_grad=True)
synthetic_labels = torch.tensor([i for i in range(num_classes) for _ in range(num_synthetic_per_class)], dtype=torch.long, device=device)

# Optimizer for synthetic data
optimizer_syn = optim.SGD([synthetic_text_data], lr=0.01, momentum=0.9)

def compute_gradients(model, inputs, labels):
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    gradients = torch.autograd.grad(loss, model.parameters(), create_graph=True)
    return gradients

def layerwise_matching_loss(gw_syn, gw_real):
    loss = 0
    for g_syn, g_real in zip(gw_syn, gw_real):
        loss += ((g_syn - g_real) ** 2).sum()
    return loss

# Training loop with synthetic data gradient matching
for iteration in range(num_iterations):
    model.train()
    loss_avg = 0

    # Update synthetic data
    for real_inputs, real_labels in train_loader:
        real_inputs, real_labels = real_inputs.to(device), real_labels.to(device)

        # Compute gradients for real data
        gradients_real = compute_gradients(model, real_inputs, real_labels)
        
        # Compute gradients for synthetic data
        synthetic_data_batch = synthetic_text_data[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class]
        synthetic_labels_batch = synthetic_labels[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class]
        gradients_synthetic = compute_gradients(model, synthetic_data_batch, synthetic_labels_batch)
        
        # Compute and minimize matching loss
        loss_match = layerwise_matching_loss(gradients_synthetic, gradients_real)
        optimizer_syn.zero_grad()
        loss_match.backward()
        optimizer_syn.step()
        loss_avg += loss_match.item()

    # Print loss every 10 iterations
    if iteration % 10 == 0:
        print(f"Iteration {iteration}, Average Matching Loss: {loss_avg / len(train_loader):.4f}")

# Final evaluation on test set
model.eval()
test_labels = []
test_preds = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader, desc='Evaluating on test set', unit='batch'):
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

overall_accuracy = accuracy_score(test_labels, test_preds)
class_report = classification_report(test_labels, test_preds, target_names=['World', 'Sports', 'Business', 'Sci/Tech'])

print(f'Test Accuracy: {overall_accuracy:.4f}')
print('Classification Report:')
print(class_report)


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors

# Load the Word2Vec model
import gensim.downloader as api

# Download the Word2Vec model
word2vec_model = api.load('word2vec-google-news-300')

# Download NLTK data
nltk.download('punkt')

# Load the AG News dataset
dataset = load_dataset("ag_news")



[nltk_data] Downloading package punkt to /home/IAIS/rrao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# NLTK Tokenizer Function
def nltk_tokenizer(text):
    return word_tokenize(text.lower())

# Convert to PyTorch tensors
class AGNewsDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = [self.vocab.get(token, self.vocab['<UNK>']) for token in text]
        if len(tokenized_text) < self.max_length:
            tokenized_text += [self.vocab['<PAD>']] * (self.max_length - len(tokenized_text))
        else:
            tokenized_text = tokenized_text[:self.max_length]
        return torch.tensor(tokenized_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Build vocabulary
def build_vocab(dataset, tokenizer):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for example in dataset:
        tokens = tokenizer(example['text'])
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

# Tokenize and build vocab
train_texts = [nltk_tokenizer(example['text']) for example in dataset['train']]
train_labels = [example['label'] for example in dataset['train']]
vocab = build_vocab(dataset['train'], nltk_tokenizer)

# Set max length for padding
max_length = 128

# Create dataset
full_dataset = AGNewsDataset(train_texts, train_labels, vocab, max_length)

# Split training set into training and validation sets
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Prepare test dataset
test_texts = [nltk_tokenizer(example['text']) for example in dataset['test']]
test_labels = [example['label'] for example in dataset['test']]
test_dataset = AGNewsDataset(test_texts, test_labels, vocab, max_length)


  synthetic_labels = torch.tensor([np.ones(num_synthetic_per_class) * i for i in range(num_classes)], dtype=torch.long, device=device).view(-1)


In [18]:

from tqdm import tqdm

# Training loop with synthetic data gradient matching
for iteration in range(num_iterations):
    model.train()
    for real_inputs, real_labels in tqdm(train_loader):
        real_inputs, real_labels = real_inputs.to(device), real_labels.to(device)

        # Compute gradients for real data
        gradients_real = compute_gradients(model, real_inputs, real_labels)
        
        # Compute gradients for synthetic data
        synthetic_data_batch = synthetic_text_data[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class]
        synthetic_labels_batch = synthetic_labels[iteration % num_classes * num_synthetic_per_class: (iteration % num_classes + 1) * num_synthetic_per_class]
        gradients_synthetic = compute_gradients(model, synthetic_data_batch, synthetic_labels_batch)
        
        # Compute and minimize matching loss
        loss_match = layerwise_matching_loss(gradients_synthetic, gradients_real)
        optimizer_syn.zero_grad()
        loss_match.backward()
        optimizer_syn.step()

    print(f"Iteration {iteration}, Matching Loss: {loss_match.item()}")

# Final evaluation on test set
model.eval()
test_labels = []
test_preds = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader, desc='Evaluating on test set', unit='batch'):
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

overall_accuracy = accuracy_score(test_labels, test_preds)
class_report = classification_report(test_labels, test_preds, target_names=['World', 'Sports', 'Business', 'Sci/Tech'])

print(f'Test Accuracy: {overall_accuracy:.4f}')
print('Classification Report:')
print(class_report)


  0%|                                                                                                                                              | 0/1500 [00:02<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)