# Without attention


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import json
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import random
import time
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score


In [None]:
# Colab-only step: mount Google Drive inside the runtime at /content/drive.
# NOTE(review): the dataset cell below opens a path under /content/, not
# /content/drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json

# Location of the QA dataset (JSON nested as data -> paragraphs -> qas) in the runtime.
dataset_path = "/content/space_qa_dataset_large.json"

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(dataset_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [None]:
import nltk
import os
# Run this early: make sure both Punkt tokenizer resources are installed,
# downloading a resource only when the local lookup fails.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/' + _resource)
    except LookupError:
        nltk.download(_resource, quiet=True, raise_on_error=True)

In [None]:
# Hyperparameters
MAX_LEN = 100        # truncation length applied to encoded sequences in QADataset
EMBEDDING_DIM = 100  # embed_dim passed to the Encoder and the Decoder
HIDDEN_DIM = 128     # hidden_dim passed to the Encoder and the Decoder
BATCH_SIZE = 2       # batch_size given to the training DataLoader
EPOCHS = 10          # number of iterations of the outer training loop

In [None]:
# Tokenization and vocabulary building
class Vocab:
    """Word-level vocabulary mapping lowercase tokens to integer ids and back.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``.
    """

    def __init__(self):
        self.word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.idx2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.counter = Counter()

    def build_vocab(self, sentences, max_size=10000):
        """Count tokens in ``sentences`` and register the most frequent ones.

        Args:
            sentences: iterable of raw strings; each is lowercased and tokenized.
            max_size: number of most-frequent tokens considered for registration.

        The token counter accumulates across calls. Words that already have an
        id keep it, so calling this method more than once never re-numbers an
        existing word and ``word2idx``/``idx2word`` stay in sync.
        """
        for sent in sentences:
            self.counter.update(word_tokenize(sent.lower()))
        for word, _ in self.counter.most_common(max_size):
            if word in self.word2idx:
                continue  # already registered (e.g. by an earlier call)
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def encode(self, sentence):
        """Return the ids of ``sentence``'s tokens followed by ``<EOS>`` (id 2).

        Tokens missing from the vocabulary map to ``<UNK>`` (id 3).
        """
        return [self.word2idx.get(w, 3) for w in word_tokenize(sentence.lower())] + [2]

    def decode(self, indices):
        """Join the words for ``indices`` with spaces, skipping ``<PAD>`` (0) and ``<EOS>`` (2)."""
        return ' '.join([self.idx2word[i] for i in indices if i != 0 and i != 2])

In [None]:
# Flatten the nested dataset into three parallel lists (one entry per QA pair,
# using the first listed answer), then build the vocabulary over all of them.
vocab = Vocab()
contexts, questions, answers = [], [], []
for article in data['data']:
    for paragraph in article['paragraphs']:
        for pair in paragraph['qas']:
            contexts.append(paragraph['context'])
            questions.append(pair['question'])
            answers.append(pair['answers'][0]['text'])

vocab.build_vocab(contexts + questions + answers)

In [None]:
# Dataset
class QADataset(Dataset):
    """Dataset of (encoded context + question, encoded answer) tensor pairs.

    Args:
        contexts, questions, answers: parallel lists of raw strings.
        vocab: object whose ``encode(text)`` returns a list of token ids ending
            with the end-of-sequence id.
        max_len: optional maximum number of ids kept per encoded text; when
            ``None`` the module-level ``MAX_LEN`` is used.
    """

    def __init__(self, contexts, questions, answers, vocab, max_len=None):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.contexts)

    def _encode(self, text):
        """Encode ``text`` and truncate it while keeping the final (<EOS>) id."""
        limit = MAX_LEN if self.max_len is None else self.max_len
        ids = self.vocab.encode(text)
        if len(ids) > limit:
            # A plain ids[:limit] would cut off the trailing <EOS>; keep it so
            # truncated sequences still carry a stop marker.
            ids = ids[:max(limit - 1, 0)] + ids[-1:]
        return ids

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` as tensors; the target starts with <SOS> (id 1)."""
        input_seq = self._encode(self.contexts[idx] + ' ' + self.questions[idx])
        target_seq = [1] + self._encode(self.answers[idx])  # Start with <SOS>
        return torch.tensor(input_seq), torch.tensor(target_seq)

# The identity collate function hands each batch over as a plain list of
# (input, target) pairs; padding is applied separately via pad_batch.
train_data = QADataset(contexts=contexts, questions=questions, answers=answers, vocab=vocab)
train_loader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda samples: samples,
)

In [None]:
# Pad function
def pad_batch(batch):
    """Pad a list of (input, target) tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with ``pad_sequence``'s default value.
    Returns ``(padded_inputs, padded_targets)``.
    """
    sources, targets = [], []
    for item in batch:
        sources.append(item[0])
        targets.append(item[1])
    pad = nn.utils.rnn.pad_sequence
    return pad(sources, batch_first=True), pad(targets, batch_first=True)

In [None]:
# Encoder
class Encoder(nn.Module):
    """Embeds a batch of token ids and runs it through a single batch-first LSTM."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for token ids ``x``.

        The per-step LSTM outputs are discarded.
        """
        _, final_state = self.lstm(self.embedding(x))
        hidden, cell = final_state
        return hidden, cell

# Decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary logits."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one step.

        Args:
            x: 1-D tensor with one token id per batch element.
            hidden, cell: LSTM state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one row of
            vocabulary logits per batch element.
        """
        step_tokens = x.unsqueeze(1)  # (batch,) -> (batch, 1)
        step_output, state = self.lstm(self.embedding(step_tokens), (hidden, cell))
        new_hidden, new_cell = state
        logits = self.fc(step_output.squeeze(1))
        return logits, new_hidden, new_cell

In [None]:
# Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce per-step vocabulary logits for ``trg``.

        Args:
            src: batch of source token ids, batch first.
            trg: batch of target token ids, batch first; column 0 seeds the decoder.
            teacher_forcing_ratio: probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (batch, trg_len, vocab); the slice for step 0 stays zero.
        """
        n_steps = trg.size(1)
        vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(src.size(0), n_steps, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        decoder_input = trg[:, 0]

        for step in range(1, n_steps):
            step_logits, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, step] = step_logits
            # Draw the coin before choosing the next input so RNG use is one draw per step.
            use_ground_truth = random.random() < teacher_forcing_ratio
            greedy = step_logits.argmax(1)
            if use_ground_truth:
                decoder_input = trg[:, step]
            else:
                decoder_input = greedy

        return outputs


In [None]:
# Initialize
# Encoder and decoder are both sized from the vocabulary and the shared
# EMBEDDING_DIM / HIDDEN_DIM hyperparameters, then moved to the selected device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(len(vocab.word2idx), EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
decoder = Decoder(len(vocab.word2idx), EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

# Loss and Optimizer
# ignore_index=0 excludes target positions equal to 0 (the <PAD> id) from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters())

# Training loop with timing
for epoch in range(EPOCHS):
    model.train()
    start_time = time.time()
    epoch_loss = 0
    for batch in train_loader:
        # The loader yields raw lists of pairs; pad them into tensors here.
        src, trg = pad_batch(batch)
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        # Drop step 0 (never predicted by the model) and flatten batch and time
        # so logits and targets line up row by row for the loss.
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    duration = time.time() - start_time
    # The reported loss is the sum of the per-batch losses, not their mean.
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Training Time: {duration:.2f}s")

# Save the model
# Only the parameters (state_dict) are written, to the current working directory.
torch.save(model.state_dict(), "qa_model.pt")

# Inference and evaluation

def predict(model, context, question, vocab, max_len=20):
    """Greedily decode an answer for ``question`` given ``context``.

    Args:
        model: object exposing ``eval()``, ``encoder`` and ``decoder``.
        context: context passage (string).
        question: question text (string).
        vocab: vocabulary providing ``encode`` and ``decode``.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded answer string. Generation stops early at ``<EOS>`` (id 2).
    """
    model.eval()
    input_seq = vocab.encode(context + ' ' + question)
    input_tensor = torch.tensor(input_seq).unsqueeze(0).to(DEVICE)
    result = []
    # Inference needs no gradients; disabling autograd avoids building a graph
    # across every decoding step.
    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)
        input_token = torch.tensor([1]).to(DEVICE)  # <SOS>
        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)
            top1 = output.argmax(1)
            token_id = top1.item()
            if token_id == 2:  # <EOS>
                break
            result.append(token_id)
            input_token = top1
    return vocab.decode(result)

Epoch 1, Loss: 486.8393, Training Time: 3.14s
Epoch 2, Loss: 320.5604, Training Time: 4.09s
Epoch 3, Loss: 174.9938, Training Time: 2.96s
Epoch 4, Loss: 72.7398, Training Time: 2.96s
Epoch 5, Loss: 29.0133, Training Time: 2.72s
Epoch 6, Loss: 15.5804, Training Time: 3.26s
Epoch 7, Loss: 8.7898, Training Time: 3.55s
Epoch 8, Loss: 5.9210, Training Time: 3.56s
Epoch 9, Loss: 4.2779, Training Time: 4.04s
Epoch 10, Loss: 3.3302, Training Time: 3.43s


In [None]:
# Interactive Q&A loop: keep prompting for a context and a question until the
# user types 'exit' (case-insensitive) at the context prompt.
print("\n=== Interactive QA ===")
keep_asking = True
while keep_asking:
    context = input("Enter context (or type 'exit' to quit):\n")
    keep_asking = context.lower() != 'exit'
    if keep_asking:
        question = input("Enter your question:\n")
        answer = predict(model, context, question, vocab)
        print(f"Answer: {answer}\n")


=== Interactive QA ===
Enter context (or type 'exit' to quit):
planet
Enter your question:
Which planet has a thick toxic atmosphere?
Answer: venus

Enter context (or type 'exit' to quit):
exit


In [None]:
!pip install rouge
!pip install torchinfo

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
!pip install rouge-score
import json
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import torch
import time

nltk.download('punkt')

# Load your knowledge base JSON
# NOTE(review): this is the same file name the training cell loaded (there via
# an absolute /content/ path), and training used every QA pair with no held-out
# split — the scores below therefore appear to measure fit to training data.
with open("space_qa_dataset_large.json") as f:
    data = json.load(f)

# Reconstruct evaluation set
# One (context, question, first answer text) triple per QA pair.
eval_set = []
for d in data["data"]:
    for para in d["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            question = qa["question"]
            answer = qa["answers"][0]["text"]
            eval_set.append((context, question, answer))

# Evaluation metrics setup
bleu_scores, meteor_scores, rouge_l_scores = [], [], []
total_time = 0

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smoothie = SmoothingFunction().method4

for context, question, reference in eval_set[:50]:  # Limit to 50 for speed
    # Time only the model call, not the metric computation.
    start = time.time()
    prediction = predict(model, context, question, vocab)
    total_time += time.time() - start

    # Tokenize
    ref_tokens = nltk.word_tokenize(reference.lower())
    pred_tokens = nltk.word_tokenize(prediction.lower())

    # BLEU and METEOR are computed on lowercased tokens; ROUGE-L receives the
    # raw reference and prediction strings.
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)
    meteor = meteor_score([word_tokenize(reference.lower())], word_tokenize(prediction.lower()))
    rougeL = scorer.score(reference, prediction)['rougeL'].fmeasure

    bleu_scores.append(bleu)
    meteor_scores.append(meteor)
    rouge_l_scores.append(rougeL)

# Display scores
# Plain arithmetic means over the evaluated pairs.
print(f"\nEvaluation on {len(bleu_scores)} QA pairs:")
print(f"Average BLEU:   {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"Average METEOR: {sum(meteor_scores)/len(meteor_scores):.4f}")
print(f"Average ROUGE-L:{sum(rouge_l_scores)/len(rouge_l_scores):.4f}")
print(f"Avg Inference Time: {total_time/len(bleu_scores):.4f} sec per sample")



Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bbcddaef107882c8c9a1091092b68f39e6e52a40b87e64cf11320cbed26180c9
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Evaluation on 50 QA pairs:
Average BLEU:   0.8981
Average METEOR: 0.8106
Average ROUGE-L:1.0000
Avg Inference Time: 0.0370 sec per sample


# With attention (uploaded separately on GitHub with executed cells and output)

In [None]:

import json
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def load_data(filename="space_qa_dataset_large.json"):
    """Load the dataset from a JSON file and return its top-level ``'data'`` entry.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The value stored under the ``'data'`` key.

    Raises:
        KeyError: if the JSON document has no ``'data'`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)['data']

def clean_text(text):
    """Delete every character that is neither a word character nor whitespace, then lowercase."""
    return re.sub(r'[^\w\s]', '', text).lower()


def preprocess_data(data):
    """
    Extract cleaned contexts, questions and answer-marked targets from the dataset.

    Each target is the cleaned context with literal ``<start>`` and ``<end>``
    tokens placed around the answer. The answer is located in the raw context
    and the three pieces (before, answer, after) are cleaned separately, so the
    offsets stay valid and the marker tokens keep their angle brackets.

    QA pairs whose first answer text does not occur verbatim in the context are
    skipped, and a single warning reports how many were dropped.

    Args:
        data: iterable of topics, each with ``'paragraphs'`` holding a
            ``'context'`` and a list of ``'qas'``.

    Returns:
        ``(contexts, questions, targets)`` — three parallel lists of strings.
    """
    import warnings

    contexts = []
    questions = []
    targets = []
    skipped = 0

    for topic in data:
        for paragraph in topic['paragraphs']:
            raw_context = paragraph['context']
            for qa in paragraph['qas']:
                answer_text = qa['answers'][0]['text']  # Assuming one answer per question
                start_idx = raw_context.find(answer_text) if answer_text else -1
                if start_idx == -1:
                    # Without a span there is nowhere to place the markers.
                    skipped += 1
                    continue
                end_idx = start_idx + len(answer_text)

                pieces = (
                    clean_text(raw_context[:start_idx]).strip(),
                    "<start>",
                    clean_text(answer_text).strip(),
                    "<end>",
                    clean_text(raw_context[end_idx:]).strip(),
                )

                contexts.append(clean_text(raw_context))
                questions.append(clean_text(qa['question']))
                targets.append(" ".join(piece for piece in pieces if piece))

    if skipped:
        warnings.warn(f"preprocess_data: skipped {skipped} QA pair(s) whose answer was not found in the context")

    return contexts, questions, targets

def tokenize_and_pad(contexts, questions, targets, max_len=100):
    """
    Fit one tokenizer per text group and convert each group to padded id sequences.

    Every tokenizer uses "<OOV>" as its out-of-vocabulary token. Sequences are
    padded and truncated at the end to ``max_len``. The target tokenizer is
    created with ``filters=''`` so the <start> and <end> markers are not filtered out.

    Returns:
        ``(padded_contexts, padded_questions, padded_targets,
        tokenizer_context, tokenizer_question, tokenizer_target)``.
    """

    def _fit_and_pad(texts, **tokenizer_options):
        # Fit a fresh tokenizer on `texts`, then encode and pad those same texts.
        tokenizer = Tokenizer(oov_token="<OOV>", **tokenizer_options)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
        return padded, tokenizer

    padded_contexts, tokenizer_context = _fit_and_pad(contexts)
    padded_questions, tokenizer_question = _fit_and_pad(questions)
    padded_targets, tokenizer_target = _fit_and_pad(targets, filters='')

    return (
        padded_contexts,
        padded_questions,
        padded_targets,
        tokenizer_context,
        tokenizer_question,
        tokenizer_target,
    )

# --- Main execution ---
# NOTE: this rebinds `data`, `contexts` and `questions`, names that the PyTorch
# cells earlier in the notebook also assign.
data = load_data()
contexts, questions, targets = preprocess_data(data)

# You can adjust max_len as needed
# max_len=200 overrides the function's default of 100.
padded_contexts, padded_questions, padded_targets, tokenizer_context, tokenizer_question, tokenizer_target = tokenize_and_pad(
    contexts, questions, targets, max_len=200
)

# Vocabulary sizes (for embedding layers)
# One more than the number of entries in each tokenizer's word_index.
context_vocab_size = len(tokenizer_context.word_index) + 1
question_vocab_size = len(tokenizer_question.word_index) + 1
target_vocab_size = len(tokenizer_target.word_index) + 1

# Split data (adjust test_size as needed)
# The three arrays are split together so rows stay aligned; random_state fixes the shuffle.
context_train, context_test, question_train, question_test, target_train, target_test = train_test_split(
    padded_contexts, padded_questions, padded_targets, test_size=0.2, random_state=42
)

print("Data Preprocessing Done.")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model

def create_model(context_vocab_size, question_vocab_size, target_vocab_size, max_len, embedding_dim=64, lstm_units=256):
    """
    Creates an LSTM encoder-decoder model with an attention layer over the context.

    Two separate LSTM encoders read the context and the question. Their final
    states are concatenated to initialise a decoder LSTM of twice the width.
    The decoder outputs attend over a Dense projection of the context encoder
    outputs, and the attended result is concatenated with the decoder outputs
    before the softmax output layer.

    NOTE(review): the layer used is Keras `Attention`, which the Keras docs
    describe as dot-product (Luong-style) attention; the additive
    (Bahdanau-style) layer is `AdditiveAttention`. Confirm which was intended.

    Args:
        context_vocab_size: Vocabulary size of the context.
        question_vocab_size: Vocabulary size of the question.
        target_vocab_size: Vocabulary size of the target.
        max_len: Maximum sequence length (shared by all three inputs).
        embedding_dim: Dimensionality of the embedding layers.
        lstm_units: Number of units in each encoder LSTM (the decoder uses twice this).

    Returns:
        The compiled Keras model.
    """

    # ------------------- Encoder -------------------
    # Context input
    # return_sequences=True keeps the per-step outputs, which the attention layer uses below.
    context_input = Input(shape=(max_len,), name='context_input')
    context_embedding = Embedding(context_vocab_size, embedding_dim, name='context_embedding')(context_input)
    context_encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True, name='context_encoder_lstm')(context_embedding)
    context_encoder_outputs, context_encoder_h, context_encoder_c = context_encoder_lstm

    # Question input
    # Only the final states of the question encoder are used further down.
    question_input = Input(shape=(max_len,), name='question_input')
    question_embedding = Embedding(question_vocab_size, embedding_dim, name='question_embedding')(question_input)
    question_encoder_lstm = LSTM(lstm_units, return_state=True, name='question_encoder_lstm')(question_embedding)
    question_encoder_outputs_q, question_encoder_h, question_encoder_c = question_encoder_lstm # Separate output for question

    # Concatenate the final states of both encoders to initialize the decoder
    encoder_final_state_h = Concatenate(axis=-1, name='encoder_final_state_h')([context_encoder_h, question_encoder_h])
    encoder_final_state_c = Concatenate(axis=-1, name='encoder_final_state_c')([context_encoder_c, question_encoder_c])

    # Project encoder outputs to match decoder output dimension
    encoder_outputs_processed = Dense(lstm_units * 2, activation='relu', name='encoder_output_projection')(context_encoder_outputs)

    # ------------------- Decoder -------------------
    # The decoder is lstm_units * 2 wide to match the concatenated encoder states.
    target_input = Input(shape=(max_len,), name='target_input')
    target_embedding = Embedding(target_vocab_size, embedding_dim, name='target_embedding')(target_input)
    decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, return_state=True, name='decoder_lstm')(
        target_embedding, initial_state=[encoder_final_state_h, encoder_final_state_c]
    )
    decoder_outputs, _, _ = decoder_lstm

    # ------------------- Attention Mechanism (Keras `Attention` layer) -------------------
    attention = Attention(name='attention')([decoder_outputs, encoder_outputs_processed])  # decoder_outputs (query), processed encoder_outputs (value)
    context_vector = Concatenate(axis=-1, name='context_vector')([decoder_outputs, attention])

    # Output layer
    output = Dense(target_vocab_size, activation='softmax', name='output')(context_vector)

    # Define the model
    model = Model(inputs=[context_input, question_input, target_input], outputs=output) # Only output prediction here

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# --- Re-create and Print Model Summary ---
# The sequence length is taken from the padded context array.
model = create_model(
    context_vocab_size=context_vocab_size,
    question_vocab_size=question_vocab_size,
    target_vocab_size=target_vocab_size,
    max_len=padded_contexts.shape[1],
    embedding_dim=128,
    lstm_units=256,  # Using the same units
)
model.summary()

print("Model Architecture Defined.")

In [None]:
epochs = 50
batch_size = 64


def _next_token_labels(padded_targets):
    """Return labels where position t holds the token at t + 1.

    The decoder receives the target sequence as input, so using the same
    sequence as the label would only teach it to copy its input. Shifting the
    labels left by one step (and filling the last position with 0) trains
    next-token prediction, which is what the step-by-step prediction loop
    relies on when it reads position i to fill position i + 1.
    """
    labels = np.zeros_like(padded_targets)
    labels[:, :-1] = padded_targets[:, 1:]
    return np.expand_dims(labels, axis=-1)


history = model.fit(
    [context_train, question_train, target_train],
    _next_token_labels(target_train),
    epochs=epochs,
    batch_size=batch_size,
    validation_data=([context_test, question_test, target_test], _next_token_labels(target_test))
)

import matplotlib.pyplot as plt

# --- Plotting training history ---
def plot_training_history(history):
    """Plot loss and accuracy curves from a Keras ``History`` object.

    Each panel shows the training curve and, when the corresponding ``val_*``
    series was recorded, the validation curve as well.

    Args:
        history: object whose ``history`` attribute maps metric names
            ('loss', 'accuracy', optionally 'val_loss', 'val_accuracy') to
            per-epoch value lists.
    """
    metrics = history.history
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    for ax, key, label in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
        ax.plot(metrics[key], label=f'Train {label}')
        val_key = f'val_{key}'
        if val_key in metrics:  # only present when validation data was supplied
            ax.plot(metrics[val_key], label=f'Validation {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()

    plt.show()

plot_training_history(history)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import numpy as np  # Import numpy
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict(model, context, question, tokenizer_context, tokenizer_question, tokenizer_target, max_len):  # Remove default max_len
    """
    Generates an answer given a context and a question.

    The target sequence is seeded with the <start> token and extended greedily
    one token at a time until <end> is predicted or ``max_len`` is reached.

    Args:
        model: The trained Keras model.
        context: The input context (string).
        question: The input question (string).
        tokenizer_context: Tokenizer for the context.
        tokenizer_question: Tokenizer for the question.
        tokenizer_target: Tokenizer for the target.
        max_len: Maximum length of the input sequences (from training).

    Returns:
        The predicted answer (string), without the <start>/<end> markers.
    """
    import re  # local import keeps the function self-contained

    def _clean(text):
        # Same normalisation as training: strip punctuation, then lowercase.
        # Defined locally because the training-time cleaner is nested inside
        # preprocess_data and is not reachable from here.
        return re.sub(r'[^\w\s]', '', text).lower()

    # Preprocess the input
    context = _clean(context)
    question = _clean(question)

    context_seq = tokenizer_context.texts_to_sequences([context])
    padded_context = pad_sequences(context_seq, maxlen=max_len, padding='post', truncating='post')

    question_seq = tokenizer_question.texts_to_sequences([question])
    padded_question = pad_sequences(question_seq, maxlen=max_len, padding='post', truncating='post')

    # Prepare the target input (start with <start> token)
    target_seq = tokenizer_target.texts_to_sequences(["<start>"])
    padded_target = pad_sequences(target_seq, maxlen=max_len, padding='post', truncating='post')

    # Prediction loop: the output at position i is taken as the token for position i + 1.
    for i in range(max_len - 1):
        prediction = model.predict([padded_context, padded_question, padded_target], verbose=0)
        predicted_token_index = int(np.argmax(prediction[0, i]))
        predicted_token = tokenizer_target.index_word.get(predicted_token_index, "<OOV>")

        if predicted_token == "<end>":
            break

        padded_target[0, i + 1] = predicted_token_index

    # Decode the predicted sequence, dropping padding and the marker tokens.
    # Filtering tokens (rather than string replacement) also removes a lone
    # "<start>" when nothing was generated.
    predicted_answer_tokens = [tokenizer_target.index_word.get(idx, "<OOV>") for idx in padded_target[0] if idx != 0]
    predicted_answer = " ".join(tok for tok in predicted_answer_tokens if tok not in ("<start>", "<end>"))

    return predicted_answer.strip()

print("Prediction Function Defined.")

# --- Example Usage for Visualization ---
context_example = "The solar system has eight planets: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Mercury is the closest to the Sun."
question_example = "What is the closest planet to the Sun?"
target_example = "mercury"  # Cleaned version

# Decode with the same sequence length the padded training arrays use.
predicted_answer_example = predict(
    model,
    context_example,
    question_example,
    tokenizer_context,
    tokenizer_question,
    tokenizer_target,
    max_len=padded_contexts.shape[1],
)

print(
    "--- Prediction Debugging ---",
    f"Context: {context_example}",
    f"Question: {question_example}",
    f"Predicted Answer: {predicted_answer_example}",
    "--------------------------",
    sep="\n",
)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

def visualize_attention(context, question, predicted_answer, attention_scores, tokenizer_context, tokenizer_target):
    """
    Draws the first entry of ``attention_scores`` as a heat-map.

    Context words label the x axis and predicted-answer words label the y axis.

    Args:
        context: The input context (string).
        question: The input question (string); not used by the plot.
        predicted_answer: The predicted answer (string).
        attention_scores: Array whose first element is the matrix to display;
            assumed to be (batch, target_seq_len, context_seq_len).
        tokenizer_context: Tokenizer for the context.
        tokenizer_target: Tokenizer for the target.
    """

    def _words(tokenizer, text):
        # Round-trip the text through the tokenizer so labels match its vocabulary.
        ids = tokenizer.texts_to_sequences([text])[0]
        return [tokenizer.index_word.get(idx, "<OOV>") for idx in ids if idx != 0]

    context_words = _words(tokenizer_context, context)
    predicted_answer_words = _words(tokenizer_target, predicted_answer)

    fig, ax = plt.subplots(figsize=(10, 8))
    image = ax.imshow(attention_scores[0], cmap='viridis')
    ax.set_xticks(range(len(context_words)))
    ax.set_xticklabels(context_words, rotation='vertical')
    ax.set_yticks(range(len(predicted_answer_words)))
    ax.set_yticklabels(predicted_answer_words)
    ax.set_xlabel("Context Words")
    ax.set_ylabel("Predicted Answer Words")
    ax.set_title("Attention Visualization")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()
    plt.show()

# --- Example Usage for Visualization ---
context_example = "The solar system has eight planets: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Mercury is the closest to the Sun."
question_example = "What is the closest planet to the Sun?"
target_example = "mercury" # Cleaned version

predicted_answer_example = predict(model, context_example, question_example, tokenizer_context, tokenizer_question, tokenizer_target, max_len=padded_contexts.shape[1])

# Get attention scores
# Encode the example with the fitted tokenizers, padded to the training lengths.
# NOTE(review): the raw example text is tokenized here without the cleaning
# step that predict() applies — confirm this is intended.
context_seq = tokenizer_context.texts_to_sequences([context_example])
padded_context_example = pad_sequences(context_seq, maxlen=padded_contexts.shape[1], padding='post', truncating='post')

question_seq = tokenizer_question.texts_to_sequences([question_example])
padded_question_example = pad_sequences(question_seq, maxlen=padded_questions.shape[1], padding='post', truncating='post')

target_seq = tokenizer_target.texts_to_sequences(["<start>"])
padded_target_example = pad_sequences(target_seq, maxlen=padded_targets.shape[1], padding='post', truncating='post')

# We need to run the prediction step by step to get attention scores for each output token
predicted_answer_tokens_with_start = ["<start>"]
attention_scores_list = []

# Create a new model to get attention scores
# NOTE(review): `attention_layer.output` is the layer's output tensor. Per the
# Keras docs the attention weights are only returned when the layer is called
# with return_attention_scores=True — confirm this is the matrix intended for
# the heat-map.
attention_layer = model.get_layer('attention')
attention_model_for_prediction = Model(inputs=model.input, outputs=[model.output, attention_layer.output])

# Greedy decoding, capped at 50 steps.
# NOTE(review): each step feeds only the most recent token (padded) as the
# target input and reads position 0 of the output, unlike predict(), which
# feeds the whole growing sequence — confirm which behaviour is wanted.
for i in range(50): # max_len
    target_input_seq = tokenizer_target.texts_to_sequences([predicted_answer_tokens_with_start[-1]])
    padded_target_input = pad_sequences(target_input_seq, maxlen=padded_targets.shape[1], padding='post', truncating='post')

    output, attention_output = attention_model_for_prediction.predict([padded_context_example, padded_question_example, padded_target_input], verbose=0)
    predicted_token_index = np.argmax(output[0, 0])
    predicted_token = tokenizer_target.index_word.get(predicted_token_index, "<OOV>")

    attention_scores_list.append(attention_output[0])  # Store attention scores for this step

    predicted_answer_tokens_with_start.append(predicted_token)

    if predicted_token == "<end>":
        break

# The slice drops the first and last collected tokens; if the loop ended
# without producing <end>, the last real token is dropped as well.
predicted_answer_for_visualization = " ".join(predicted_answer_tokens_with_start[1:-1]) # Exclude <start> and <end>

if attention_scores_list:
    visualize_attention(context_example, question_example, predicted_answer_for_visualization, np.array(attention_scores_list), tokenizer_context, tokenizer_target)
else:
    print("Attention scores were not retrieved.")

print("Attention Visualization Done.")

In [None]:
from collections import Counter
import re
import time
import numpy as np
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
import nltk
import nltk.tokenize

# Fetch the NLTK resources used by the metric helpers below.
for _package in ('wordnet', 'punkt'):
    nltk.download(_package)

def normalize_answer(s):
    """Lowercase ``s``, strip punctuation, drop the articles a/an/the and collapse whitespace.

    Non-string input is reported with a printed message and converted with ``str``.
    """
    if not isinstance(s, str):
        print(f"normalize_answer: Input is not a string, converting: {type(s)}")
        s = str(s)

    lowered = s.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    # split()/join collapses runs of whitespace and trims both ends.
    return ' '.join(without_articles.split())

def compute_exact_match(prediction, ground_truth):
    """Return 1 when both answers are equal after normalization, otherwise 0.

    Also prints the types of both arguments.
    """
    print(f"compute_exact_match: prediction type = {type(prediction)}, ground_truth type = {type(ground_truth)}")
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth.

    Returns 0 when the two share no tokens.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    # Multiset intersection: each shared token counts as often as it appears in both.
    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)

def compute_rouge(prediction, ground_truth):
    """Return the ROUGE-L F-score of ``prediction`` against ``ground_truth``.

    A ``ValueError`` raised while scoring yields 0.0.
    """
    scorer = Rouge()
    try:
        first_result = scorer.get_scores(prediction, ground_truth)[0]
        return first_result['rouge-l']['f']
    except ValueError:
        return 0.0

def compute_bleu(prediction, ground_truth):
    """Sentence-level BLEU of the normalized prediction against the normalized ground truth.

    A ``ValueError`` raised by the scorer yields 0.0.
    """
    hypothesis = normalize_answer(prediction).split()
    references = [normalize_answer(ground_truth).split()]  # single reference
    try:
        score = sentence_bleu(references, hypothesis)
    except ValueError:
        score = 0.0
    return score

def compute_meteor(prediction, ground_truth):
    """METEOR score of the normalized prediction against the normalized ground truth.

    Both sides are tokenized before scoring: ``single_meteor_score`` expects
    pre-tokenized input, and passing the reference as a raw string makes it
    raise a ``TypeError``, which the handler below would turn into a silent 0.0.

    Returns:
        The METEOR score, or 0.0 (after printing the error) when a
        ``LookupError``, ``ValueError`` or ``TypeError`` is raised.
    """
    try:
        prediction_tokens = nltk.tokenize.word_tokenize(normalize_answer(prediction))
        reference_tokens = nltk.tokenize.word_tokenize(normalize_answer(ground_truth))
        return single_meteor_score(reference_tokens, prediction_tokens)
    except (LookupError, ValueError, TypeError) as e:
        print(f"compute_meteor error: {e}")
        return 0.0

def find_answer_span(context, target):
    """Return the slice of the normalized context that best matches the normalized target.

    An exact substring match is preferred. Otherwise every window of
    ``len(target tokens)`` context tokens is compared position by position with
    the target tokens; the first window with the most matches wins and
    ``len(target)`` characters starting at that window are returned. An empty
    string is returned when no window shares a token with the target.
    Non-string arguments are reported with a printed message and converted with ``str``.
    """
    if not isinstance(context, str):
        print(f"find_answer_span: context is not a string, converting: {type(context)}")
        context = str(context)
    if not isinstance(target, str):
        print(f"find_answer_span: target is not a string, converting: {type(target)}")
        target = str(target)

    norm_context = normalize_answer(context)
    norm_target = normalize_answer(target)
    span_len = len(norm_target)

    exact_at = norm_context.find(norm_target)
    if exact_at != -1:
        return norm_context[exact_at:exact_at + span_len]

    # Fuzzy fallback: slide a token window over the context.
    ctx_words = norm_context.split()
    tgt_words = norm_target.split()
    width = len(tgt_words)
    best_offset, best_hits = -1, 0
    for i in range(len(ctx_words) - width + 1):
        window = ctx_words[i:i + width]
        hits = sum(1 for seen, wanted in zip(window, tgt_words) if seen == wanted)
        if hits > best_hits:
            best_hits = hits
            # Character offset of token i in the space-joined context.
            best_offset = len(" ".join(ctx_words[:i])) + (1 if i > 0 else 0)

    if best_offset == -1:
        return ""
    return norm_context[best_offset:best_offset + span_len]

def evaluate(model, contexts, questions, answers, tokenizer_context, tokenizer_question, tokenizer_target, max_len, num_examples=5):
    """Score the model's predicted answers on the first ``num_examples`` items.

    For each example the answer is generated with ``predict``, the ground
    truth is mapped onto the context with ``find_answer_span``, and the pair
    is scored with exact match, F1, ROUGE, BLEU and METEOR. Wall-clock time of
    each ``predict`` call is recorded as well.

    Args:
        model: model handed to ``predict``.
        contexts, questions, answers: parallel sequences; items are converted
            with ``str()`` before use.
        tokenizer_context, tokenizer_question, tokenizer_target: forwarded to
            ``predict`` unchanged.
        max_len: forwarded to ``predict`` unchanged.
        num_examples: maximum number of examples to score (default 5, the
            value that used to be hard-coded). ``None`` scores everything.

    Returns:
        dict with the mean of each metric under the keys ``exact_match``,
        ``f1``, ``rouge``, ``bleu``, ``meteor`` and ``average_inference_time``
        (seconds). Every value is 0 when no example was scored.
    """
    # Bound by all three inputs so a shorter list cannot cause an IndexError.
    available = min(len(contexts), len(questions), len(answers))
    limit = available if num_examples is None else min(max(num_examples, 0), available)

    exact_match_scores = []
    f1_scores = []
    rouge_scores = []
    bleu_scores = []
    meteor_scores = []
    inference_times = []

    for i in range(limit):
        context = str(contexts[i])
        question = str(questions[i])
        ground_truth = str(answers[i])

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        predicted_answer = predict(model, context, question, tokenizer_context, tokenizer_question, tokenizer_target, max_len)
        inference_times.append(time.perf_counter() - start_time)

        answer_span = find_answer_span(context, ground_truth)

        # Show each prediction next to its reference for manual inspection.
        print(f"  --- Example {i} ---")
        print(f"  Predicted Answer: '{predicted_answer}'")
        print(f"  Ground Truth Span: '{answer_span}'")

        exact_match_scores.append(compute_exact_match(predicted_answer, answer_span))
        f1_scores.append(compute_f1(predicted_answer, answer_span))
        rouge_scores.append(compute_rouge(predicted_answer, answer_span))
        bleu_scores.append(compute_bleu(predicted_answer, answer_span))
        meteor_scores.append(compute_meteor(predicted_answer, answer_span))

    def _mean(values):
        # Empty input yields 0 rather than a ZeroDivisionError.
        return sum(values) / len(values) if values else 0

    return {
        'exact_match': _mean(exact_match_scores),
        'f1': _mean(f1_scores),
        'rouge': _mean(rouge_scores),
        'bleu': _mean(bleu_scores),
        'meteor': _mean(meteor_scores),
        'average_inference_time': _mean(inference_times)
    }

# --- Example Usage ---
# Runs evaluate() on the first `num_examples` items of the test split and
# prints the averaged metrics. Relies on names defined in earlier cells:
# context_test, question_test, targets, model, the three tokenizers and
# padded_contexts.
answers_test = []
num_examples = 5  # Reduce for consistency
for i in range(min(num_examples, len(context_test))):
    context = str(context_test[i])  # assigned but not used inside this loop
    # NOTE(review): `targets` is indexed with test-split positions. If it is
    # the full, un-split target list, these answers do not belong to
    # context_test[i] -- confirm against the cell that creates the split.
    target = str(targets[i])
    answers_test.append(target)

# Slice the test data to match the number of examples we're evaluating
context_test_subset = context_test[:num_examples]
question_test_subset = question_test[:num_examples]
answers_test_subset = answers_test[:num_examples]

# max_len is taken from the second dimension of padded_contexts.
evaluation_results = evaluate(model, context_test_subset, question_test_subset, answers_test_subset, tokenizer_context, tokenizer_question, tokenizer_target, max_len=padded_contexts.shape[1])

# Report every averaged metric with four decimals.
print("Evaluation Results:")
print(f"Exact Match: {evaluation_results['exact_match']:.4f}")
print(f"F1 Score: {evaluation_results['f1']:.4f}")
print(f"ROUGE-L: {evaluation_results['rouge']:.4f}")
print(f"BLEU: {evaluation_results['bleu']:.4f}")
print(f"METEOR: {evaluation_results['meteor']:.4f}")
print(f"Average Inference Time: {evaluation_results['average_inference_time']:.4f} seconds")

# Self Attention



In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0424c8ae1d3d1a51dc122c6a2daa4ffe100883e2f045f75ae777b3808563b0be
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# run_space_qa.py

import math
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from tqdm import tqdm

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install nltk
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
def tokenize_text(text):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

# ---------------------------
# Hyperparameters
# ---------------------------
# Module-level settings read by the model, data and training code below.
embed_size = 128       # embedding width, also the transformer model dimension
num_heads = 4          # attention heads per transformer layer
num_layers = 2         # number of stacked encoder (and decoder) layers
hidden_size = 256      # feed-forward width inside each transformer layer
dropout = 0.1          # dropout probability passed to the transformer layers
learning_rate = 0.001  # Adam learning rate
weight_decay = 1e-5    # Adam weight decay
max_len = 50           # every text is truncated/padded to this many tokens
batch_size = 16        # training DataLoader batch size
num_epochs = 10        # passes over the training split
grad_clip = 1.0        # max gradient norm, read as a global inside train()

# ---------------------------
# Text Preprocessing
# ---------------------------
def pad_sequence(seq, max_len):
    """Return ``seq`` cut to ``max_len`` items and right-padded with zeros.

    The input list is not modified; a new list is returned.
    """
    clipped = seq[:max_len]
    shortfall = max_len - len(clipped)
    return clipped + [0] * shortfall

def create_vocab(sentences):
    """Build a token -> id mapping from an iterable of sentences.

    Each sentence is tokenized with ``tokenize_text``. Distinct tokens receive
    ids 1..N in sorted order, and ``'<pad>'`` is mapped to 0.

    Ids are assigned from the *sorted* token list on purpose. Enumerating a
    ``set`` of strings directly yields an order that changes between
    interpreter sessions (string hashing is randomized), so a checkpoint saved
    in one session could not be decoded with a vocabulary rebuilt in another.

    Returns:
        dict mapping every token (and ``'<pad>'``) to its integer id.
    """
    tokens = set()
    for sentence in sentences:
        tokens.update(tokenize_text(sentence))
    vocab = {word: idx for idx, word in enumerate(sorted(tokens), start=1)}
    vocab['<pad>'] = 0
    return vocab

def text_to_tensor(text, vocab, max_len):
    """Encode ``text`` as a tensor of exactly ``max_len`` vocabulary ids.

    Tokens missing from ``vocab`` are encoded with the ``'<pad>'`` id, so they
    are indistinguishable from padding afterwards.
    """
    token_ids = []
    for token in tokenize_text(text):
        token_ids.append(vocab.get(token, vocab['<pad>']))
    fixed_length = pad_sequence(token_ids, max_len)
    return torch.tensor(fixed_length)

def tensor_to_text(tensor, vocab):
    """Decode a sequence of vocabulary ids into a space-joined string.

    Args:
        tensor: iterable of integer ids -- a list, a NumPy array or a 1-D
            ``torch.Tensor``.
        vocab: token -> id mapping; it is inverted here for the lookup.

    Returns:
        The decoded tokens joined by single spaces. Id 0 (padding) is skipped
        and ids absent from ``vocab`` are rendered as ``'<unk>'``.
    """
    reverse_vocab = {idx: word for word, idx in vocab.items()}
    words = []
    for raw_id in tensor:
        # Coerce to a plain int: 0-d torch tensors hash by identity, so using
        # them as dict keys would turn every token into '<unk>'.
        token_id = int(raw_id)
        if token_id != 0:
            words.append(reverse_vocab.get(token_id, '<unk>'))
    return ' '.join(words)

In [None]:
# ---------------------------
# Dataset
# ---------------------------
class QADataset(Dataset):
    """Dataset of (context, question) pairs with their answers.

    Every item is encoded lazily with ``text_to_tensor`` using the shared
    ``vocab`` and a fixed ``max_len``.
    """

    def __init__(self, context_questions, answers, vocab, max_len=50):
        self.context_questions = context_questions
        self.answers = answers
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of (context, question) pairs."""
        return len(self.context_questions)

    def _encode(self, text):
        # Single place where the vocabulary and length are applied.
        return text_to_tensor(text, self.vocab, self.max_len)

    def __getitem__(self, idx):
        """Return ``(context_tensor, question_tensor, answer_tensor)`` for ``idx``."""
        pair = self.context_questions[idx]
        return self._encode(pair[0]), self._encode(pair[1]), self._encode(self.answers[idx])

In [None]:
# ---------------------------
# Basic TransformerQA (Original)
# ---------------------------
class TransformerQA(nn.Module):
    """Baseline model: shared embedding, a transformer encoder and a linear head.

    NOTE(review): ``forward`` runs the encoder over ``src`` but never uses the
    result, so the returned logits depend only on the embeddings of ``tgt``.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_size, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_size,
            dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        """Return vocabulary logits with one row per position of ``tgt``."""
        source_vectors = self.embedding(src)
        target_vectors = self.embedding(tgt)
        # The encoder output is discarded; the call is kept so the module
        # performs exactly the same work (including dropout) as before.
        self.encoder(source_vectors)
        return self.decoder(target_vectors)

In [None]:
# ---------------------------
# Encoder-Decoder Transformer (New)
# ---------------------------
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        embed_size: width of the embeddings; may be even or odd.
        max_len: longest sequence length the precomputed table supports.

    The table is stored in the buffer ``pe`` with shape
    ``(1, max_len, embed_size)``.
    """

    def __init__(self, embed_size, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2) * -(math.log(10000.0) / embed_size))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embed_size there is one cosine column fewer than sine
        # columns, so trim the angles to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, embed) here: the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        return x + self.pe[:, :x.size(1)]

class TransformerQAModel(nn.Module):
    """Encoder-decoder transformer that maps (src, tgt) id batches to logits.

    Source and target share one embedding table. Each side gets its own
    ``PositionalEncoding`` and token id 0 is treated as padding in every
    attention mask.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_size, dropout=0.1, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = PositionalEncoding(embed_size, max_len)
        self.pos_decoder = PositionalEncoding(embed_size, max_len)

        layer_kwargs = dict(d_model=embed_size, nhead=num_heads, dim_feedforward=hidden_size, dropout=dropout)
        enc_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        dec_layer = nn.TransformerDecoderLayer(**layer_kwargs)

        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.decoder = nn.TransformerDecoder(dec_layer, num_layers=num_layers)

        self.output_layer = nn.Linear(embed_size, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0 on/below the diagonal, -inf above."""
        target_device = next(self.parameters()).device
        all_blocked = torch.full((sz, sz), float('-inf'))
        return torch.triu(all_blocked, diagonal=1).to(target_device)

    def _embed(self, token_ids, positional):
        # Scale by sqrt(d_model) before adding the position signal.
        scaled = self.embedding(token_ids) * math.sqrt(self.embedding.embedding_dim)
        return positional(scaled)

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, vocab_size).

        ``src`` and ``tgt`` are (batch, length) id tensors. They are
        transposed to sequence-first layout for the transformer stacks and
        transposed back before the output projection.
        """
        causal_mask = self.generate_square_subsequent_mask(tgt.size(1))
        src_padding = src == 0
        tgt_padding = tgt == 0

        src_vectors = self._embed(src, self.pos_encoder)
        tgt_vectors = self._embed(tgt, self.pos_decoder)

        memory = self.encoder(src_vectors.transpose(0, 1), src_key_padding_mask=src_padding)
        decoded = self.decoder(
            tgt_vectors.transpose(0, 1),
            memory,
            tgt_mask=causal_mask,
            tgt_key_padding_mask=tgt_padding,
            memory_key_padding_mask=src_padding,
        )
        return self.output_layer(decoded.transpose(0, 1))

In [None]:
# ---------------------------
# Train / Eval
# ---------------------------
def train(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and return the mean loss per batch.

    Each batch is a (context, question, answer) triple. The model is called
    as ``model(context, question)`` and its logits are compared position by
    position with ``answer``. Gradients are clipped to the module-level
    ``grad_clip`` norm before each optimizer step.
    """
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc="Training")
    for batch in progress:
        context, question, answer = (part.to(device) for part in batch)
        optimizer.zero_grad()
        logits = model(context, question)
        vocab_dim = logits.shape[-1]
        # Flatten to (batch * positions, vocab) vs. (batch * positions,).
        loss = criterion(logits.view(-1, vocab_dim), answer.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, device, vocab):
    """Print and return BLEU, METEOR and ROUGE-L F1 averaged over ``val_loader``.

    Note that this definition replaces any earlier function named
    ``evaluate`` in the notebook namespace.

    The model is run as ``model(context, question)``. The arg-max token of
    every position is decoded with ``tensor_to_text`` and compared with the
    decoded answer. Every row of every batch is scored, so the result does
    not depend on the loader's batch size.

    Args:
        model: module called as ``model(context, question)``.
        val_loader: iterable of (context, question, answer) tensor batches.
        device: device the batches are moved to.
        vocab: token -> id mapping used for decoding.

    Returns:
        dict with keys ``'bleu'``, ``'meteor'`` and ``'rouge'`` holding the
        mean scores as floats (0.0 for each when nothing was scored).
    """
    model.eval()
    bleu_scores, meteor_scores, rouge_scores = [], [], []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    with torch.no_grad():
        for context, question, answer in tqdm(val_loader, desc="Evaluating"):
            context, question, answer = context.to(device), question.to(device), answer.to(device)
            pred = model(context, question).argmax(dim=-1)

            # Score each example of the batch, not just the first row.
            for pred_row, answer_row in zip(pred.cpu().numpy(), answer.cpu().numpy()):
                pred_text = tensor_to_text(pred_row, vocab)
                answer_text = tensor_to_text(answer_row, vocab)
                pred_tokens = pred_text.split()
                answer_tokens = answer_text.split()

                # NOTE(review): unsmoothed BLEU is 0 whenever any n-gram order
                # has no overlap. NLTK warns about this on short answers.
                bleu_scores.append(sentence_bleu([answer_tokens], pred_tokens))
                # meteor_score needs pre-tokenized references and hypothesis.
                meteor_scores.append(meteor_score([answer_tokens], pred_tokens))
                rouge_scores.append(scorer.score(answer_text, pred_text)['rougeL'].fmeasure)

    def _mean(values):
        # np.mean([]) would return NaN with a RuntimeWarning.
        return float(np.mean(values)) if values else 0.0

    metrics = {
        'bleu': _mean(bleu_scores),
        'meteor': _mean(meteor_scores),
        'rouge': _mean(rouge_scores),
    }

    print(f"\nEvaluation Metrics:")
    print(f"  BLEU Score   : {metrics['bleu']:.4f}")
    print(f"  METEOR Score : {metrics['meteor']:.4f}")
    print(f"  ROUGE-L F1   : {metrics['rouge']:.4f}")
    return metrics

In [None]:
# ---------------------------
# Save / Load
# ---------------------------
def save_model(model, filename):
    """Write ``model``'s state_dict to ``filename`` and report it on stdout."""
    weights = model.state_dict()
    torch.save(weights, filename)
    print(f"Model saved to {filename}")

def load_model(model, filename, device):
    """Load a state_dict from ``filename`` into ``model`` and move it to ``device``.

    The model is updated in place; nothing is returned.
    """
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    state = torch.load(filename, map_location=device)
    model.load_state_dict(state)
    model.to(device)
    print(f"Model loaded from {filename}")

In [None]:

# ---------------------------
# Interactive QA
# ---------------------------
def interactive_qa(model, vocab, device, max_len=50):
    """Prompt for context/question pairs and print the model's answer.

    The loop ends when ``exit`` (any letter case) is entered at either prompt.

    Decoding is the same for every model and mirrors ``train``/``evaluate``:
    one forward pass ``model(context, question)`` followed by an arg-max per
    position. A token-by-token loop seeded with the pad id does not work
    here: the vocabulary has no start-of-sequence token, and
    ``TransformerQAModel`` masks pad positions as attention keys, so its
    first decoding step would have nothing to attend to.

    Args:
        model: trained model called as ``model(src, tgt)``.
        vocab: token -> id mapping used for encoding and decoding.
        device: device the input tensors are moved to.
        max_len: number of tokens each input is truncated/padded to.
    """
    model.eval()
    print("Interactive QA mode. Type 'exit' to quit.")

    while True:
        context = input("\nEnter context (or 'exit'): ")
        if context.lower() == 'exit':
            break
        question = input("Enter question: ")
        if question.lower() == 'exit':
            break

        src = text_to_tensor(context, vocab, max_len).unsqueeze(0).to(device)
        tgt = text_to_tensor(question, vocab, max_len).unsqueeze(0).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            out = model(src, tgt)
        pred = out.argmax(dim=-1)[0]
        answer_text = tensor_to_text(pred.cpu().numpy(), vocab)

        print(f"\nGenerated Answer: {answer_text}")

# ---------------------------
# Main
# ---------------------------
# Entry point: load the QA dataset, build the vocabulary and loaders, train
# the selected model for num_epochs (evaluating after every epoch) and save
# the weights.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Path is relative to the current working directory.
    with open("space_qa_dataset_large.json", "r") as f:
        full_data = json.load(f)

    # Flatten the nested data -> paragraphs -> qas structure into parallel
    # lists. Only the first listed answer of each question is kept.
    context_questions = []
    answers = []
    for item in full_data["data"]:
        for paragraph in item["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answer = qa["answers"][0]["text"]
                context_questions.append((context, question))
                answers.append(answer)

    # The vocabulary covers every context, question and answer, i.e. it is
    # built before the train/validation split below.
    vocab = create_vocab([ctx + " " + q + " " + ans for (ctx, q), ans in zip(context_questions, answers)])
    # Each text is truncated/padded to max_len tokens when an item is fetched.
    dataset = QADataset(context_questions, answers, vocab, max_len)
    # 80/20 train/validation split.
    # NOTE(review): no random seed is set in this block, so the split (and
    # the printed metrics) presumably differ between runs -- confirm whether
    # a seed is set elsewhere.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # === SELECT MODEL HERE ===
    model = TransformerQAModel(len(vocab), embed_size, num_heads, num_layers, hidden_size, dropout, max_len).to(device)
    # model = TransformerQA(len(vocab), embed_size, num_heads, num_layers, hidden_size, dropout).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # ignore_index=0 excludes padding positions of the answer from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        train_loss = train(model, train_loader, optimizer, criterion, device)
        print(f"Train Loss: {train_loss:.4f}")
        evaluate(model, val_loader, device, vocab)

    save_model(model, "transformer_qa.pth")

    # Optional: interactive_qa(model, vocab, device)




Epoch 1/10


Training: 100%|██████████| 13/13 [00:02<00:00,  4.77it/s]


Train Loss: 4.2724


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating: 100%|██████████| 50/50 [00:04<00:00, 10.44it/s]



Evaluation Metrics:
  BLEU Score   : 0.0045
  METEOR Score : 0.1048
  ROUGE-L F1   : 0.0617

Epoch 2/10


Training: 100%|██████████| 13/13 [00:02<00:00,  5.32it/s]


Train Loss: 2.3930


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 93.21it/s]



Evaluation Metrics:
  BLEU Score   : 0.0146
  METEOR Score : 0.1555
  ROUGE-L F1   : 0.0872

Epoch 3/10


Training: 100%|██████████| 13/13 [00:02<00:00,  5.38it/s]


Train Loss: 1.5384


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 96.85it/s]



Evaluation Metrics:
  BLEU Score   : 0.0192
  METEOR Score : 0.1698
  ROUGE-L F1   : 0.1442

Epoch 4/10


Training: 100%|██████████| 13/13 [00:02<00:00,  4.58it/s]


Train Loss: 1.1258


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 61.46it/s]



Evaluation Metrics:
  BLEU Score   : 0.0204
  METEOR Score : 0.1841
  ROUGE-L F1   : 0.0818

Epoch 5/10


Training: 100%|██████████| 13/13 [00:02<00:00,  4.87it/s]


Train Loss: 0.9719


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 96.61it/s]



Evaluation Metrics:
  BLEU Score   : 0.0176
  METEOR Score : 0.1770
  ROUGE-L F1   : 0.0775

Epoch 6/10


Training: 100%|██████████| 13/13 [00:02<00:00,  5.21it/s]


Train Loss: 0.9277


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 92.95it/s]



Evaluation Metrics:
  BLEU Score   : 0.0184
  METEOR Score : 0.1714
  ROUGE-L F1   : 0.1054

Epoch 7/10


Training: 100%|██████████| 13/13 [00:02<00:00,  5.17it/s]


Train Loss: 0.8958


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 94.31it/s]



Evaluation Metrics:
  BLEU Score   : 0.0189
  METEOR Score : 0.1727
  ROUGE-L F1   : 0.0761

Epoch 8/10


Training: 100%|██████████| 13/13 [00:02<00:00,  4.37it/s]


Train Loss: 0.8536


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 60.02it/s]



Evaluation Metrics:
  BLEU Score   : 0.0194
  METEOR Score : 0.1875
  ROUGE-L F1   : 0.0775

Epoch 9/10


Training: 100%|██████████| 13/13 [00:02<00:00,  4.94it/s]


Train Loss: 0.8569


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 95.15it/s]



Evaluation Metrics:
  BLEU Score   : 0.0191
  METEOR Score : 0.1782
  ROUGE-L F1   : 0.0784

Epoch 10/10


Training: 100%|██████████| 13/13 [00:03<00:00,  3.95it/s]


Train Loss: 0.8462


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 78.03it/s]


Evaluation Metrics:
  BLEU Score   : 0.0189
  METEOR Score : 0.1777
  ROUGE-L F1   : 0.0761
Model saved to transformer_qa.pth



