In [13]:
pip install transformers torch datasets




In [None]:
# Required Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
import torch.nn.functional as F

# Fetch SQuAD and keep a handle on its training split
dataset = load_dataset("squad")
train_split = dataset['train']

# Pull the passage and the question out of every training example
contexts = [example['context'] for example in train_split]
questions = [example['question'] for example in train_split]

# Sanity check: print the first three context/question pairs
for i in range(3):
    print(f"Context {i}: {contexts[i]}")
    print(f"Question {i}: {questions[i]}\n")

# Load the pretrained 'distilbert-base-uncased' tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Dataset that serves (context, question) pairs as fixed-length token-id tensors
class QuestionGenerationDataset(Dataset):
    """Map-style dataset pairing each context with its question.

    Every item is a dict with three 1-D tensors:
      * ``input_ids``      - token ids of the context
      * ``attention_mask`` - attention mask returned by the tokenizer for the context
      * ``labels``         - token ids of the question

    Context and question are both padded/truncated to ``max_length``.
    """

    def __init__(self, contexts, questions, tokenizer, max_length=512):
        self.contexts = contexts
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One sample per context
        return len(self.contexts)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_length`` ids, returned as PyTorch tensors."""
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        source_text = self.contexts[idx]
        target_text = self.questions[idx]

        encoded_source = self._encode(source_text)
        encoded_target = self._encode(target_text)

        # The tokenizer output carries a leading batch axis; drop it so the
        # DataLoader can stack samples itself.
        sample = {}
        sample['input_ids'] = encoded_source['input_ids'].squeeze(0)
        sample['attention_mask'] = encoded_source['attention_mask'].squeeze(0)
        sample['labels'] = encoded_target['input_ids'].squeeze(0)  # question ids are the training target
        return sample

# Wrap the first 100 context/question pairs in the dataset
qg_dataset = QuestionGenerationDataset(
    contexts=contexts[:100],
    questions=questions[:100],
    tokenizer=tokenizer,
)

# Serve shuffled mini-batches of 16 samples
train_loader = DataLoader(dataset=qg_dataset, batch_size=16, shuffle=True)

# Define the LSTM-based Question Generation model
class LSTMQuestionGenerator(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection onto the vocabulary.

    For every input position the model emits ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        output_dim: Number of logits produced per position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, hidden=None):
        """Run the network over a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            hidden: Optional ``(h_0, c_0)`` initial state, each of shape
                ``(1, batch, hidden_dim)``. ``None`` lets the LSTM start from
                zeros. Callers that carry the returned state into the next
                call should feed only the tokens not yet consumed.

        Returns:
            ``(logits, (h_n, c_n))`` where ``logits`` has shape
            ``(batch, seq_len, output_dim)``.
        """
        embedded = self.embedding(input_ids)
        # Pass the caller's state through; previously it was accepted but
        # silently discarded, so the LSTM always started from zeros.
        lstm_out, (hidden, cell) = self.lstm(embedded, hidden)
        output = self.fc(lstm_out)
        return output, (hidden, cell)

# Model hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
output_dim = vocab_size  # one logit per vocabulary entry

# Instantiate the network, the loss and the optimizer
model = LSTMQuestionGenerator(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # padded label positions are ignored
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optimise for a fixed number of passes over the data
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, labels = batch['input_ids'], batch['labels']

        # Zero-filled (h_0, c_0) pair sized to the current batch
        hidden = tuple(torch.zeros(1, input_ids.size(0), hidden_dim) for _ in range(2))

        optimizer.zero_grad()

        # Forward pass; logits and labels are flattened so the loss is
        # computed per token position.
        outputs, hidden = model(input_ids, hidden)
        loss = criterion(outputs.view(-1, output_dim), labels.view(-1))

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

# Function to generate questions
def generate_question(model, context, tokenizer, max_length=30, temperature=1.0):
    """Sample a question for ``context`` one token at a time.

    At every step the context plus all tokens sampled so far is fed to
    ``model`` and the next token is drawn from the temperature-scaled softmax
    over the logits at the last position.

    Args:
        model: Callable as ``model(input_ids, hidden)`` returning
            ``(logits, state)`` with ``logits`` of shape ``(1, seq_len, vocab)``.
        context: A single passage (one string).
        tokenizer: Tokenizer used to encode ``context`` and decode the result.
        max_length: Maximum number of tokens to sample.
        temperature: Softmax temperature; must be positive.

    Returns:
        The decoded question text (special tokens skipped).

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # BERT-style tokenizers define no EOS token (eos_token_id is None) and end
    # sequences with [SEP]; fall back to it so sampling can actually stop.
    stop_token_id = getattr(tokenizer, 'eos_token_id', None)
    if stop_token_id is None:
        stop_token_id = getattr(tokenizer, 'sep_token_id', None)

    model.eval()
    with torch.no_grad():
        # Tokenize context
        inputs = tokenizer(context, return_tensors='pt', padding='longest', truncation=True, max_length=512)
        input_ids = inputs['input_ids']  # Shape: (1, seq_length)

        generated = []
        for _ in range(max_length):
            # The whole sequence is re-encoded on every step, so the recurrent
            # state must start fresh each time; carrying the previous state
            # over would make the model see the history twice.
            output, _state = model(input_ids, None)

            # Sample from the distribution with temperature
            output_dist = output[:, -1, :] / temperature
            next_token = torch.multinomial(F.softmax(output_dist, dim=-1), num_samples=1)  # Shape: (1, 1)
            token_id = next_token.item()

            # Stop as soon as the end-of-sequence marker is sampled
            if stop_token_id is not None and token_id == stop_token_id:
                break

            generated.append(token_id)
            input_ids = torch.cat((input_ids, next_token), dim=1)  # Shape: (1, seq_length + 1)

    # Convert token IDs to question
    return tokenizer.decode(generated, skip_special_tokens=True)

# Function to evaluate generated questions
def evaluate_question_generation(predicted, actual):
    """Score a generated question against the reference with sentence-level BLEU.

    Both strings are split on whitespace. Smoothing is applied because
    unsmoothed BLEU drops to (near) zero whenever any n-gram order up to 4 has
    no overlap, which is the common case for short questions.

    Args:
        predicted: The generated question text.
        actual: The reference question text.

    Returns:
        The BLEU score, or ``0.0`` when either string contains no tokens.
    """
    predicted_tokens = predicted.split()
    actual_tokens = actual.split()

    # With nothing to compare there can be no overlap
    if not predicted_tokens or not actual_tokens:
        return 0.0

    # Imported here so the block does not rely on a new file-level import
    from nltk.translate.bleu_score import SmoothingFunction

    return sentence_bleu(
        [actual_tokens],
        predicted_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

# Demo: generate a question for the first SQuAD context and score it against
# the reference question
context_example, actual_question = contexts[0], questions[0]
predicted_question = generate_question(model, context_example, tokenizer)
bleu_score = evaluate_question_generation(predicted_question, actual_question)

for label, value in (
    ("Predicted Question:", predicted_question),
    ("Actual Question:", actual_question),
    ("BLEU Score:", bleu_score),
):
    print(label, value)
