In [13]:
pip install transformers torch datasets




In [17]:
# Required Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer
from datasets import load_dataset  # Correct import for load_dataset

# Load the SQuAD dataset
dataset = load_dataset("squad")

# Extract contexts and questions from the training split.
# Column access reads just the two text columns; iterating the split row by
# row would build every full example once per list.
train_split = dataset['train']
contexts = list(train_split['context'])
questions = list(train_split['question'])

# Display the first few contexts and questions to verify
for i in range(3):
    print(f"Context {i}: {contexts[i]}")
    print(f"Question {i}: {questions[i]}\n")

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create a custom Dataset class for Question Generation
class QuestionGenerationDataset(Dataset):
    """Map-style dataset that pairs each context with its target question.

    Item ``idx`` is a dict with three entries:

    * ``'input_ids'`` / ``'attention_mask'`` -- the tokenized context
    * ``'labels'`` -- the token ids of the tokenized question

    Both texts are padded/truncated to ``max_length`` by the tokenizer, so the
    context ids and the labels come out with the same length.

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of question strings, aligned with ``contexts``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors that carry a
            leading batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``contexts`` and ``questions`` differ in length.
    """

    def __init__(self, contexts, questions, tokenizer, max_length=512):
        # Fail fast: a mismatch would otherwise either silently ignore the
        # surplus questions or raise IndexError in the middle of an epoch.
        if len(contexts) != len(questions):
            raise ValueError(
                f"contexts and questions must have the same length, "
                f"got {len(contexts)} and {len(questions)}"
            )
        self.contexts = contexts
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (context, question) pairs."""
        return len(self.contexts)

    def _encode(self, text):
        """Tokenize ``text`` with the padding/truncation settings shared by both fields."""
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Tokenize and return the pair at position ``idx``."""
        inputs = self._encode(self.contexts[idx])
        question_inputs = self._encode(self.questions[idx])

        return {
            # squeeze(0) drops the batch dimension added by return_tensors='pt'
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # The question's token ids are the training targets
            'labels': question_inputs['input_ids'].squeeze(0)
        }

# Training uses only the first NUM_TRAIN_EXAMPLES (context, question) pairs
NUM_TRAIN_EXAMPLES = 100
BATCH_SIZE = 16

train_contexts = contexts[:NUM_TRAIN_EXAMPLES]
train_questions = questions[:NUM_TRAIN_EXAMPLES]
qg_dataset = QuestionGenerationDataset(train_contexts, train_questions, tokenizer)

# Shuffled mini-batches for training
train_loader = DataLoader(dataset=qg_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Define the LSTM-based Question Generation model
class LSTMQuestionGenerator(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell state.
        output_dim: Number of logits produced per position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, hidden=None):
        """Run the network over a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq_len)``.
            hidden: Optional ``(h_0, c_0)`` initial state, each of shape
                ``(1, batch, hidden_dim)``. ``None`` starts from zeros.

        Returns:
            ``(output, (h_n, c_n))`` where ``output`` has shape
            ``(batch, seq_len, output_dim)`` and ``(h_n, c_n)`` is the LSTM
            state after the last position.
        """
        embedded = self.embedding(input_ids)
        # Pass the caller's state through; previously it was accepted but
        # ignored, so every call silently restarted from zeros.
        lstm_out, (hidden, cell) = self.lstm(embedded, hidden)
        output = self.fc(lstm_out)
        return output, (hidden, cell)

# Model hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim, hidden_dim = 128, 256
output_dim = vocab_size  # one logit per vocabulary entry

# Instantiate the network
model = LSTMQuestionGenerator(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

# Loss skips positions whose label is the pad token; Adam updates all weights
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, labels = batch['input_ids'], batch['labels']

        # Zero (h_0, c_0) sized for this batch (the last batch may be smaller)
        state_shape = (1, input_ids.size(0), hidden_dim)
        hidden = (torch.zeros(state_shape), torch.zeros(state_shape))

        optimizer.zero_grad()

        # Forward pass: flatten so each position's logits are scored against
        # the label at the same position
        outputs, hidden = model(input_ids, hidden)
        flat_logits = outputs.view(-1, output_dim)
        flat_labels = labels.view(-1)
        loss = criterion(flat_logits, flat_labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")


Context 0: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question 0: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Context 1: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper 

In [18]:
def generate_question(model, context, tokenizer, max_length=30):
    """Greedily decode up to ``max_length`` tokens conditioned on ``context``.

    The context is tokenized without padding, so the logits at the last
    position belong to the last real token. At each step the most likely
    token is appended to the sequence and the whole sequence is run through
    the model again from a fresh initial state. Decoding stops early when the
    tokenizer's end marker is produced.

    Args:
        model: Callable ``model(input_ids, hidden) -> (logits, state)`` with
            ``logits`` of shape ``(1, seq_len, vocab)``; must provide ``eval()``.
        context: Source text to condition on.
        tokenizer: Tokenizer providing ``__call__`` and ``decode``. Its
            ``eos_token_id`` is used as the stop token, falling back to
            ``sep_token_id`` when no EOS token is defined.
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded text of the generated tokens, special tokens skipped.
    """
    # BERT-style tokenizers define no EOS token (eos_token_id is None), which
    # made the original stop check unreachable; [SEP] is their end marker.
    stop_id = getattr(tokenizer, 'eos_token_id', None)
    if stop_id is None:
        stop_id = getattr(tokenizer, 'sep_token_id', None)

    model.eval()
    generated = []
    with torch.no_grad():
        # No padding: for a single sequence it would put [PAD] at position -1
        # and new tokens would be appended after the padding.
        inputs = tokenizer(context, return_tensors='pt', truncation=True, max_length=512)
        input_ids = inputs['input_ids']  # Shape: (1, seq_length)

        for _ in range(max_length):
            # The full sequence is re-fed every step, so always start from the
            # default (zero) state instead of carrying the previous one over,
            # which would count the prefix twice.
            output, _state = model(input_ids, None)
            next_token = torch.argmax(output[:, -1, :], dim=1)  # Most likely next token
            token_id = next_token.item()

            if stop_id is not None and token_id == stop_id:
                break

            generated.append(token_id)
            input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)

    # NOTE(review): the training loop in this notebook scores logits at each
    # context position against the question token at the same position, so
    # last-position logits may not be a trained next-token predictor --
    # confirm that this decoding scheme matches the training objective.
    return tokenizer.decode(generated, skip_special_tokens=True)


Predicted Question: the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
