In [None]:
%pip install transformers torch PyPDF2 pdfplumber datasets scikit-learn -q

In [None]:
!pip install gensim

In [None]:
import io
import PyPDF2
import pdfplumber
import os

In [None]:
# extract text and load documents
def extract_text_from_pdf_pypdf2(file_bytes, filename="<uploaded_file>"):
    """Extract text from in-memory PDF bytes using PyPDF2.

    Pages for which PyPDF2 yields no text are skipped; the remaining page
    texts are joined with newlines. `filename` is only used in the status
    messages. Any exception is reported on stdout and "" is returned.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        per_page = (page.extract_text() for page in reader.pages)
        combined = "\n".join(chunk for chunk in per_page if chunk)
        print(f"Successfully extracted text from '{filename}' using PyPDF2.")
        return combined
    except Exception as e:
        print(f"Error extracting text from '{filename}' using PyPDF2: {e}")
        return ""

def extract_text_from_pdf_pdfplumber(file_bytes, filename="<uploaded_file>"):
    """Extract text from in-memory PDF bytes using pdfplumber.

    Pages without extractable text are skipped; the rest are joined with
    newlines. `filename` is only used in the status messages. Any exception
    is reported on stdout and "" is returned.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            per_page = [page.extract_text() for page in pdf.pages]
        combined = "\n".join(chunk for chunk in per_page if chunk)
        print(f"Successfully extracted text from '{filename}' using pdfplumber.")
        return combined
    except Exception as e:
        print(f"Error extracting text from '{filename}' using pdfplumber: {e}")
        return ""


def extract_text_from_pdf_path(pdf_path):
    """Read a PDF from disk and return its text.

    pdfplumber is tried first; PyPDF2 is used only when pdfplumber produced
    an empty result. Returns None when the path does not exist or the file
    cannot be read, otherwise the extracted text (possibly "").
    """
    filename = os.path.basename(pdf_path)

    if not os.path.exists(pdf_path):
        print(f"Error: File not found at path: '{pdf_path}'. Ensure it's uploaded to session storage.")
        return None

    try:
        with open(pdf_path, 'rb') as pdf_file:
            file_bytes = pdf_file.read()

        # `or` falls through to PyPDF2 only when pdfplumber returned "".
        return (
            extract_text_from_pdf_pdfplumber(file_bytes, filename)
            or extract_text_from_pdf_pypdf2(file_bytes, filename)
        )
    except Exception as e:
        print(f"Error accessing or reading PDF from path '{pdf_path}': {e}")
        return None


def load_documents_for_qa():
    """Interactively collect documents (PDF paths and/or pasted text) for QA.

    The user picks a mode on stdin: "1" (PDF paths), "2" (direct text) or
    "3" (both). Each list of lines is terminated by the first empty line
    (or by end of input).

    Returns:
        dict mapping an identifier to its text (the PDF path for PDFs,
        "direct_text_input_<n>" for pasted text), or None when nothing was
        loaded, including when the menu choice is not 1, 2 or 3.
    """
    preview_length = 200  # characters shown per document in the summary

    def read_lines_until_blank(strip_lines):
        """Read stdin lines up to the first empty line or end of input."""
        collected = []
        while True:
            try:
                line = input()
            except EOFError:
                # Non-interactive input ran out: treat it like the blank terminator.
                break
            if strip_lines:
                line = line.strip()
            if not line:
                break
            collected.append(line)
        return collected

    all_documents_text = {} # To store text from all sources {identifier: text}
    print("1. Load PDF files by path from Colab session storage")
    print("2. Provide text directly")
    print("3. Do both (load PDFs by path and provide text directly)")

    choice = input("Enter your choice (1, 2, or 3): ").strip()

    if choice not in ('1', '2', '3'):
        print(f"Invalid choice '{choice}'. Expected 1, 2, or 3.")

    if choice in ('1', '3'):
        print("Enter PDF file paths (one per line, leave empty and press Enter twice when done):")
        pdf_paths = read_lines_until_blank(strip_lines=True)

        if pdf_paths:
            print("\nProcessing PDF(s) from provided paths...")
            for pdf_path in pdf_paths:
                filename = os.path.basename(pdf_path)
                print(f"  Processing '{filename}' from path: '{pdf_path}'")
                extracted_text = extract_text_from_pdf_path(pdf_path)
                if extracted_text:
                    # Use the full path as the identifier to avoid conflicts if filenames are same
                    all_documents_text[pdf_path] = extracted_text
                    print(f"  Successfully processed '{filename}'.")
                else:
                    print(f"  Warning: Could not process '{filename}' from path. Skipping.")
        else:
            print("No PDF paths were provided.")

    if choice in ('2', '3'):
        print("Please paste your text here (press Enter twice to finish):")
        # Lines are kept unstripped here so indentation inside pasted text survives.
        full_direct_text = "\n".join(read_lines_until_blank(strip_lines=False)).strip()

        if full_direct_text:
            direct_input_id = f"direct_text_input_{len(all_documents_text) + 1}"
            all_documents_text[direct_input_id] = full_direct_text
            print(f"Direct text input received and stored as '{direct_input_id}'.")
        else:
            print("No direct text was provided.")

    if not all_documents_text:
        print("\nNo documents (PDFs from paths or direct text) were loaded. Exiting.")
        return None

    print("\n--- All Documents Loaded ---")
    for identifier, text_content in all_documents_text.items():
        print(f"\n--- Content from: {identifier} ({len(text_content)} characters) ---")
        # Truncate with the same length that triggers truncation, so every
        # document longer than the preview is shortened consistently.
        if len(text_content) > preview_length:
            print(text_content[:preview_length] + "...")
        else:
            print(text_content)
        print("------------------------------------")

    return all_documents_text


# Run the interactive loader once; both names are bound to the same result
# (a dict of identifier -> text, or None when nothing was loaded).
all_documents_text = loaded_documents = load_documents_for_qa()

if loaded_documents:
    print(f"\nSuccessfully loaded {len(loaded_documents)} document(s) for QA.")

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from gensim.models import KeyedVectors
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text_for_vectorization(text):
    """Normalize text for vectorization.

    Lowercases, strips every character that is neither a word character nor
    whitespace, tokenizes with NLTK, then drops English stop words and any
    token that is not purely alphabetic. Returns the surviving tokens joined
    by single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())  # punctuation removed
    candidate_tokens = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for token in candidate_tokens:
        if token in english_stop_words:
            continue
        if token.isalpha():  # discard numbers and mixed tokens
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Module-level cache: model name -> loaded embedding model (None if loading failed)
word_embedding_model_cache = {}

def load_word_embedding_model(model_name="glove-wiki-gigaword-50"):
    """Return the embedding model for `model_name`, loading it on first use.

    The model is fetched through `api.load` and memoized in
    `word_embedding_model_cache`. A failed load is reported on stdout and
    memoized as None, so later calls return None without retrying.
    """
    if model_name in word_embedding_model_cache:
        return word_embedding_model_cache[model_name]

    print(f"\n--- Loading pre-trained word embedding model: {model_name} ---")
    try:
        loaded_model = api.load(model_name)
    except Exception as e:
        print(f"Error loading word embedding model '{model_name}': {e}")
        loaded_model = None
    else:
        print(f"Model '{model_name}' loaded successfully.")

    word_embedding_model_cache[model_name] = loaded_model
    return loaded_model

def get_averaged_word_vector(text, model):
    """Average the embedding vectors of the in-vocabulary tokens of `text`.

    The text is preprocessed and tokenized first; tokens missing from
    `model.key_to_index` are ignored. Returns None when `model` is None, a
    zero vector of length `model.vector_size` when no token is known, and
    otherwise the element-wise mean as a 1D numpy array.
    """
    if model is None:
        return None

    known_vectors = [
        model[token]
        for token in word_tokenize(preprocess_text_for_vectorization(text))
        if token in model.key_to_index
    ]

    if not known_vectors:
        # Nothing recognized: fall back to a zero vector of the right size.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def get_answer_classical_nlp(question, documents_dict, method="tfidf", word_embedding_model_name="glove-wiki-gigaword-50"):
    """Pick the sentence, across all documents, most similar to the question.

    Each document is split into sentences; sentences and question are
    preprocessed, vectorized with the chosen method and compared by cosine
    similarity. Only strictly positive similarities can become the answer.

    Args:
        question: The question text.
        documents_dict: Mapping of document identifier -> full text.
        method: "tfidf", "bow" or "word_embedding". Any other value yields
            the default "no answer" result.
        word_embedding_model_name: Model name passed to
            `load_word_embedding_model` when method is "word_embedding".

    Returns:
        dict with keys 'answer', 'confidence' (float), 'source_document'
        and 'source_chunk'.
    """
    if not documents_dict:
        return {'answer': "No documents provided.", 'confidence': 0.0, 'source_document': None, 'source_chunk': None}
    best_answer = {'answer': "No direct answer found in documents.", 'confidence': 0.0, 'source_document': None, 'source_chunk': None}

    if method not in ("tfidf", "bow", "word_embedding"):
        return best_answer

    # Drop missing/blank documents up front so nothing expensive runs for them.
    candidate_docs = [
        (doc_id, text) for doc_id, text in documents_dict.items()
        if text and text.strip()
    ]
    if not candidate_docs:
        return best_answer

    # Everything derived from the question alone is computed once, not per document.
    question_processed = preprocess_text_for_vectorization(question)

    model = None
    question_embedding = None
    if method == "word_embedding":
        model = load_word_embedding_model(word_embedding_model_name)
        if model is None:
            print(f"Skipping {word_embedding_model_name} due to model loading error.")
            return best_answer
        question_embedding = get_averaged_word_vector(question_processed, model)
        if question_embedding is None or np.all(question_embedding == 0):
            print(f"  Warning: Question vector is all zeros for '{question}'. Cannot compute similarity.")
            return best_answer
        question_embedding = question_embedding.reshape(1, -1)

    for doc_id, full_text_content in candidate_docs:
        sentences = sent_tokenize(full_text_content) # Split document into sentences
        if not sentences:
            continue

        processed_sentences = [preprocess_text_for_vectorization(s) for s in sentences]

        if method == "word_embedding":
            embedded = [get_averaged_word_vector(s, model) for s in processed_sentences]
            # Sentences with no in-vocabulary token embed to all zeros and cannot be scored.
            usable = [
                i for i, vec in enumerate(embedded)
                if vec is not None and not np.all(vec == 0)
            ]
            if not usable:
                continue
            sentence_vectors = np.array([embedded[i] for i in usable])
            scored_sentences = [sentences[i] for i in usable]
            q_vector = question_embedding
        else:
            vectorizer = TfidfVectorizer() if method == "tfidf" else CountVectorizer()
            try:
                # The question is fitted together with the sentences so they share one vocabulary.
                all_vectors = vectorizer.fit_transform(processed_sentences + [question_processed])
            except ValueError as e:
                # The vectorizer rejected this input (e.g. nothing left after preprocessing).
                print(f"  Warning: Could not vectorize {doc_id} ({e}). Skipping.")
                continue
            sentence_vectors = all_vectors[:-1]
            q_vector = all_vectors[-1]
            if q_vector.nnz == 0:
                print(f"  Warning: Question vector is empty/zero for '{question}'. Cannot compute similarity for {doc_id}.")
                continue
            scored_sentences = sentences

        similarities = cosine_similarity(q_vector, sentence_vectors).flatten()
        top_index = int(np.argmax(similarities))  # first maximum wins on ties
        top_score = float(similarities[top_index])

        if top_score > best_answer['confidence']:
            best_answer['answer'] = scored_sentences[top_index]
            best_answer['confidence'] = top_score
            best_answer['source_document'] = doc_id
            best_answer['source_chunk'] = scored_sentences[top_index]

    return best_answer


if __name__ == "__main__":
    # Demo: run one question through every classical retrieval configuration.
    test_question_1 = "What is BERT?"
    demo_runs = [
        ("\n--- TF-IDF Based QA ---",
         {"method": "tfidf"}),
        ("\n--- Bag of Words (BoW) Based QA ---",
         {"method": "bow"}),
        ("\n--- Word Embedding (GloVe 50) Based QA ---",
         {"method": "word_embedding", "word_embedding_model_name": "glove-wiki-gigaword-50"}),
        ("\n--- Word Embedding (Word2Vec Google News 300) Based QA ---",
         {"method": "word_embedding", "word_embedding_model_name": "word2vec-google-news-300"}),
    ]
    last_run = len(demo_runs) - 1

    for run_number, (banner, qa_kwargs) in enumerate(demo_runs):
        print(banner)
        answer_data = get_answer_classical_nlp(test_question_1, all_documents_text, **qa_kwargs)
        print(f"  Answer: '{answer_data['answer']}'")
        print(f"  Confidence: {answer_data['confidence']:.4f}")
        if run_number != last_run:
            # A separator follows every run except the final one.
            print("-" * 50)


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import warnings
warnings.filterwarnings("ignore", message="Using a pipeline without specifying a model name and revision.*")


# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "distilbert-base-cased-distilled-squad"

# Build the module-level `qa_pipeline` used by the QA helpers further down.
# On any failure the error is printed and `qa_pipeline` is set to None, which
# those helpers check before running.
try:
    print(f"Loading tokenizer for '{model_name}'...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Loading model '{model_name}' for question answering...")
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Variant with an explicit device argument, kept for reference:
    # qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    print("QA Pipeline created successfully.")

except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # NOTE(review): `tokenizer` and `model` stay unbound if loading failed
    # before they were assigned; only `qa_pipeline` is guaranteed to exist.
    qa_pipeline = None


def get_answer_from_context(question, context):
    """Run the global `qa_pipeline` on one (question, context) pair.

    Returns an `(answer, score)` tuple. Validation problems and pipeline
    failures are not raised: they come back as an "Error..." message with a
    score of 0.0.
    """
    if not qa_pipeline:
        return "Error: QA Pipeline not initialized.", 0.0
    if not (context and context.strip()):
        return "Error: Context is empty.", 0.0
    if not (question and question.strip()):
        return "Error: Question is empty.", 0.0

    try:
        prediction = qa_pipeline(question=question, context=context)
        return prediction['answer'], prediction['score']
    except Exception as e:
        print(f"Error during question answering: {e}")
        # Report an over-long input explicitly; anything else is a generic failure.
        token_count = len(tokenizer.encode(question, context))
        token_limit = tokenizer.model_max_length
        if token_count > token_limit:
             return f"Error: Combined length of question and context ({token_count} tokens) exceeds model's maximum ({token_limit} tokens). Consider chunking the context.", 0.0
        return "Error processing question.", 0.0


# Chunking and Reranking
if not qa_pipeline:
    print("QA pipeline not initialized.")
else:
    def chunk_text(text, tokenizer, max_chunk_size=384, overlap_size=128):
        """Split `text` into overlapping chunks measured in tokenizer tokens.

        Args:
            text: The text to split.
            tokenizer: Object providing `encode(text, add_special_tokens=False)`
                and `decode(tokens)`.
            max_chunk_size: Maximum number of tokens per chunk (must be > 0).
            overlap_size: Tokens shared by consecutive chunks; must satisfy
                0 <= overlap_size < max_chunk_size.

        Returns:
            List of decoded chunk strings; empty when the text has no tokens.

        Raises:
            ValueError: If the size arguments cannot produce forward progress.
        """
        if max_chunk_size <= 0:
            raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}.")
        if not 0 <= overlap_size < max_chunk_size:
            raise ValueError(
                f"overlap_size must be in [0, max_chunk_size); got overlap_size={overlap_size}, "
                f"max_chunk_size={max_chunk_size}."
            )

        tokens = tokenizer.encode(text, add_special_tokens=False)
        stride = max_chunk_size - overlap_size
        chunks = []

        for start in range(0, len(tokens), stride):
            end = start + max_chunk_size
            chunks.append(tokenizer.decode(tokens[start:end]))
            if end >= len(tokens):
                # This window already reached the last token; a further window
                # would only repeat tokens contained in this one.
                break
        return chunks

    def get_best_answer(question, documents_dict, confidence_threshold=0.1):
        """Find the highest-scoring extractive answer across all documents.

        Documents whose combined (question, text) token count is below
        `tokenizer.model_max_length` are answered in one pass; longer ones are
        split with `chunk_text` and every chunk is queried.

        Args:
            question: The question text.
            documents_dict: Mapping of document identifier -> full text.
            confidence_threshold: Minimum score the top answer must reach.

        Returns:
            The best candidate dict ('answer', 'score', 'source_document',
            'source_chunk', 'context_preview') when its score reaches the
            threshold; None when no candidate exists or the best one is below
            the threshold; an empty list when the question is blank or no
            documents were supplied.
        """
        all_possible_answers = []

        if not question or not question.strip():
            print("Error: Question is empty.")
            return all_possible_answers

        if not documents_dict:
            print("No documents provided to search.")
            return all_possible_answers

        for filename, full_text_content in documents_dict.items():
            print(f"  Processing document: {filename}")
            if not full_text_content or not full_text_content.strip():
                print(f"    Skipping empty document: {filename}")
                continue

            if len(tokenizer.encode(question, full_text_content)) < tokenizer.model_max_length:
                contexts = [full_text_content]  # the whole document is the only chunk
            else:
                contexts = chunk_text(full_text_content, tokenizer, max_chunk_size=384, overlap_size=128)
                print(f"    Split into {len(contexts)} chunks.")

            for context in contexts:
                if not context.strip():
                    continue
                # A more advanced step could filter chunks by semantic similarity
                # to the question first, to reduce unnecessary QA calls.
                answer, score = get_answer_from_context(question, context)
                # get_answer_from_context reports every failure as an
                # "Error..." message with score 0.0; none of those are answers.
                if not answer or (score == 0.0 and answer.startswith("Error")):
                    continue
                all_possible_answers.append({
                    "answer": answer,
                    "score": score,
                    "source_document": filename,
                    "source_chunk": context,
                    "context_preview": context[:150] + "..." if len(context) > 150 else context
                })

        if not all_possible_answers:
            print("No candidate answers were produced from the provided documents.")
            return None

        # max() keeps the earliest candidate on ties, like a stable descending sort.
        best_answer = max(all_possible_answers, key=lambda candidate: candidate['score'])
        if best_answer['score'] >= confidence_threshold:
            return best_answer

        print(f"Top answer found, but its confidence ({best_answer['score']:.4f}) is below the threshold ({confidence_threshold}).")
        return None

    # Demo: ask one question against everything that was loaded.
    question = "What are embeddings?"
    best_answer = get_best_answer(question, all_documents_text, confidence_threshold=0.2)

    print(f"\n--- Final Best Answer for Question: '{question}' ---")
    if not best_answer:
        print("  No answer found with sufficient confidence.")
    else:
        report_lines = [
            f"  Answer: {best_answer['answer']}",
            f"  Confidence Score: {best_answer['score']:.4f}",
            f"  Source Document: {best_answer['source_document']}",
            f"  Context Preview: {best_answer['context_preview']}",
            "-" * 30,
        ]
        print("\n".join(report_lines))


In [None]:
# Fine-Tuning BERT
try:
    import torch
    from transformers import (
        AutoTokenizer,
        AutoModelForQuestionAnswering,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )
    from datasets import Dataset, Features, Value
    from sklearn.model_selection import train_test_split
    import json
    import os
    import numpy as np

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    json_file_path = "dummy_squad_data.json" # Name of your JSON file

    if not os.path.exists(json_file_path):
        raise FileNotFoundError(f"The file '{json_file_path}' was not found.")

    with open(json_file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
        # Assuming the JSON structure has a "data" key that contains the list of QA pairs
        dummy_squad_data = json_data.get("data", [])
        if not dummy_squad_data:
            raise ValueError("JSON file does not contain a 'data' key with the SQuAD formatted list.")

    print(f"Loaded {len(dummy_squad_data)} entries from '{json_file_path}'.")

    features = Features({
        'id': Value('string'),
        'title': Value('string'),
        'context': Value('string'),
        'question': Value('string'),
        'answers': Features({
            'answer_start': [Value('int32')],
            'text': [Value('string')]
        })
    })

    full_dataset = Dataset.from_list(dummy_squad_data, features=features)
    print("Full dummy dataset created successfully:")
    print(full_dataset)

    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    # Randomly sample indices for train and validation
    indices = list(range(len(full_dataset)))
    train_indices, val_indices = train_test_split(indices, test_size=val_size, random_state=42)

    train_dataset = full_dataset.select(train_indices)
    eval_dataset = full_dataset.select(val_indices)

    base_model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(base_model_name).to(device)
    print("Model and tokenizer loaded.")

    print("\n3. Tokenizing the custom dataset...")

    def preprocess_function(examples):
        """Tokenize a batch of SQuAD-style examples and label answer token spans.

        Reads the "question", "context" and "answers" columns of `examples`,
        tokenizes each (question, context) pair with the enclosing `tokenizer`
        (context truncated, padded to max length) and adds "start_positions" /
        "end_positions" token indices.

        An example is labelled with the CLS token index for both positions
        when it has no answer, when the encoding contains no context tokens,
        or when the answer span is not fully inside the (possibly truncated)
        context.
        """
        questions = [q.strip() for q in examples["question"]]
        contexts = examples["context"]
        answers = examples["answers"]

        tokenized_examples = tokenizer(
            questions,
            contexts,
            max_length=tokenizer.model_max_length,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length"
        )

        start_positions = []
        end_positions = []

        for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)
            answer = answers[i]

            # No answer (or nothing to search in): point both labels at CLS
            # instead of letting a dummy (0, 0) span select context tokens.
            if len(answer["answer_start"]) == 0 or 1 not in sequence_ids:
                start_positions.append(cls_index)
                end_positions.append(cls_index)
                continue

            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # First and last token whose sequence id is 1 (the second input, i.e. the context).
            context_start_token = sequence_ids.index(1)
            context_end_token = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            if not (offsets[context_start_token][0] <= start_char and offsets[context_end_token][1] >= end_char):
                # Answer lies (partly) outside the retained context.
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Walk forward past the last token starting at or before the answer start.
                token_start_index = context_start_token
                while token_start_index <= context_end_token and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                # Walk backward past the first token ending at or after the answer end.
                token_end_index = context_end_token
                while token_end_index >= context_start_token and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

        tokenized_examples["start_positions"] = start_positions
        tokenized_examples["end_positions"] = end_positions
        return tokenized_examples

    # Apply the preprocessing function to the training and validation datasets
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
    tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)

    print("Dataset tokenization complete for train and validation sets.")

    output_dir = "./fine_tuned_qa_model_with_eval"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=50,
        weight_decay=0.01,
        logging_dir='./logs_with_eval',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none"
    )

    # threshold: minimum change to qualify as an improvement
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3, # Wait for 3 epochs without improvement
        early_stopping_threshold=0.01 # A minimal improvement threshold
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
        callbacks=[early_stopping_callback],
    )
    print("Trainer set up with evaluation and early stopping.")
    trainer.train()
    print("Model training complete.")

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Fine-tuned model and tokenizer saved.")
except ImportError as e:
    print(f"Missing required libraries: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Using Fine Tuned Model

# Reload the fine-tuned checkpoint from disk and wrap it in a QA pipeline.
# NOTE(review): relies on `output_dir`, `device`, `torch`, `eval_dataset`,
# `full_dataset` and `val_indices` created by the fine-tuning cell; if that
# cell failed inside its try/except these names may be missing.
fine_tuned_model_path = output_dir
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path).to(device)
qa_pipeline_fine_tuned = pipeline("question-answering", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer, device=0 if torch.cuda.is_available() else -1)

test_question_2 = "What is AI?"
# `dummy_context_2` is not defined in any visible cell. Use it when an earlier
# cell provided it, otherwise fall back to a short built-in passage so this
# smoke test does not stop with a NameError.
test_context_2 = globals().get("dummy_context_2") or (
    "Artificial intelligence (AI) is the ability of computer systems to perform "
    "tasks that normally require human intelligence, such as learning and reasoning."
)
answer_ft_2 = qa_pipeline_fine_tuned(question=test_question_2, context=test_context_2)
print(f"\nQuestion: {test_question_2}")
print(f"Fine-tuned Answer: {answer_ft_2['answer']} (Score: {answer_ft_2['score']:.4f})")

# Pick an example that ended up in the validation set
if len(eval_dataset) > 0:
    val_example = full_dataset[val_indices[0]]
    val_question = val_example["question"]
    val_context = val_example["context"]
    val_answer = qa_pipeline_fine_tuned(question=val_question, context=val_context)
    print(f"\nValidation Question: {val_question}")
    print(f"Fine-tuned Answer: {val_answer['answer']} (Score: {val_answer['score']:.4f})")
else:
    print("No validation examples to test directly.")



In [None]:
# Confidence Score Calculation and Interpretation
# Prefer the fine-tuned pipeline, then the base one; a name that is missing
# or bound to None counts as unavailable.
current_qa_pipeline = next(
    (
        candidate
        for candidate in (globals().get('qa_pipeline_fine_tuned'), globals().get('qa_pipeline'))
        if candidate is not None
    ),
    None,  # leaves the name defined so the check below cannot fail
)
if current_qa_pipeline is None:
    print("Error: QA pipeline not initialized.")

if current_qa_pipeline:
    print("\n--- Integrating Confidence Threshold into Multi-Document QA ---")
    def get_top_reliable_answer_across_docs(question, documents_dict, qa_pipeline_func, overall_threshold=0.6):
        """Return the best answer across documents if it is confident enough.

        Every non-blank document is passed to `qa_pipeline_func` (called as
        `qa_pipeline_func(question=..., context=...)` and expected to return a
        dict with at least 'answer' and 'score'). The highest-scoring result
        is accepted only when its score reaches `overall_threshold`.

        Returns:
            `(answer, score, source_document)` for an accepted answer,
            `("No answer found.", 0.0, None)` when no document produced a
            result, and `("No reliable answer found across documents.", 0.0,
            None)` when the best score is below the threshold.
        """
        candidates = []
        for doc_id, text in (documents_dict or {}).items():
            if not text or not text.strip():
                continue
            try:
                # NOTE(review): assumes the pipeline copes with contexts longer
                # than the model window on its own — confirm for the pipeline used.
                result = qa_pipeline_func(question=question, context=text)
            except Exception as e:
                # One failing document should not abort the whole search.
                print(f"  Error answering from '{doc_id}': {e}")
                continue

            # Show the text around the answer when the pipeline reports its offset.
            answer_start = result.get('start') or 0
            preview_from = max(0, answer_start - 50)
            candidates.append({
                'answer': result['answer'],
                'score': result['score'],
                'source_document': doc_id,
                'context_preview': text[preview_from:preview_from + 150],
            })

        if not candidates:
            return "No answer found.", 0.0, None

        top_answer = max(candidates, key=lambda candidate: candidate['score'])
        if top_answer['score'] >= overall_threshold:
            print(f"  Top Reliable Answer (from {top_answer['source_document']}):")
            print(f"    Answer: '{top_answer['answer']}'")
            print(f"    Confidence: {top_answer['score']:.4f}")
            print(f"    Context Preview: '{top_answer['context_preview']}'")
            return top_answer['answer'], top_answer['score'], top_answer['source_document']

        return "No reliable answer found across documents.", 0.0, None

    get_top_reliable_answer_across_docs("What does BERT stand for?", all_documents_text, current_qa_pipeline, overall_threshold=0.8)
    get_top_reliable_answer_across_docs("What is the capital of France?", all_documents_text, current_qa_pipeline, overall_threshold=0.7)

else:
    print("\nSkipping Confidence Score Calculation as QA pipeline is not available.")

In [None]:
import re
from collections import Counter

def normalize_answer(s):
    """Lowercase, strip punctuation and articles, and collapse whitespace.

    None is treated as an empty answer and normalizes to "".
    """
    if s is None:
        return ""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return re.sub(r'[^\w\s]', '', text)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer."""
    if not s: return []
    return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    When either side has no tokens after normalization (the "no answer"
    case), the score is 1.0 if both are empty and 0.0 otherwise, which keeps
    F1 in agreement with `compute_exact_match` for unanswerable questions.
    """
    gold_tokens = get_tokens(a_gold)
    pred_tokens = get_tokens(a_pred)

    if not gold_tokens or not pred_tokens:
        return float(gold_tokens == pred_tokens)

    # Multiset intersection: each token counts as often as it appears in both.
    common = Counter(gold_tokens) & Counter(pred_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(gold_tokens)

    return (2 * precision * recall) / (precision + recall)

def compute_exact_match(a_gold, a_pred):
    """Return 1 when both answers are identical after normalization, else 0."""
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def evaluate_qa_predictions(predictions, gold_standard_data):
    """Average exact-match and F1 per method over a list of predictions.

    Each prediction dict supplies 'question', 'predicted_answer' and
    'method'; each gold item supplies 'question' and 'reference_answers'.
    A prediction is scored against every reference answer and the best EM
    and F1 are kept. Predictions whose question has no reference answers
    are reported and skipped. Returns
    `{method: {'average_em': ..., 'average_f1': ...}}`, with 0 for a method
    that ended up with no scored predictions.
    """
    references_by_question = {}
    for gold_item in gold_standard_data:
        references_by_question[gold_item['question']] = gold_item['reference_answers']

    scores_by_method = {}
    for pred in predictions:
        question = pred['question']
        predicted_answer = pred['predicted_answer']
        method = pred['method'] # e.g., 'tfidf', 'bow', 'word_embedding', 'bert'

        method_scores = scores_by_method.setdefault(method, {'em_scores': [], 'f1_scores': []})

        reference_answers = references_by_question.get(question, [])
        if not reference_answers:
            print(f"Warning: No reference answers found for question: '{question}'. Skipping evaluation for this prediction.")
            continue

        # Best score over all references, never below the 0 baseline.
        em_candidates = [compute_exact_match(gold, predicted_answer) for gold in reference_answers]
        f1_candidates = [compute_f1(gold, predicted_answer) for gold in reference_answers]
        method_scores['em_scores'].append(max([0] + em_candidates))
        method_scores['f1_scores'].append(max([0] + f1_candidates))

    return {
        method: {
            'average_em': np.mean(collected['em_scores']) if collected['em_scores'] else 0,
            'average_f1': np.mean(collected['f1_scores']) if collected['f1_scores'] else 0,
        }
        for method, collected in scores_by_method.items()
    }

if __name__ == "__main__":
    # Gold standard used for scoring. 'context' is only informative here:
    # evaluation needs just the question and its reference answers.
    gold_data = [
        {'question': 'What is BERT?',
         'context': 'BERT (Bidirectional Encoder Representations from Transformers) is a language model developed by Google.',
         'reference_answers': ['BERT is a language model developed by Google', 'a language model developed by Google']},

        {'question': 'The capital of France?',
         'context': 'The capital of France is Paris. Paris is also known for its beautiful architecture like the Eiffel Tower.',
         'reference_answers': ['Paris', 'The capital of France is Paris']},

        {'question': 'How does quantum computing work?',
         'context': 'Quantum computing uses phenomena such as superposition and entanglement to perform computations.',
         'reference_answers': ['uses phenomena such as superposition and entanglement to perform computations', 'superposition and entanglement']},

        {'question': 'What is the best type of animal?',
         'context': 'Dogs are loyal. Cats are independent.',
         'reference_answers': ['']}
    ]

    # (label stored in the prediction, keyword arguments for the retriever),
    # in the order predictions are recorded for each question.
    classical_runs = [
        ('TF-IDF', {'method': "tfidf"}),
        ('BoW', {'method': "bow"}),
        ('Word_Embedding', {'method': "word_embedding", 'word_embedding_model_name': "glove-wiki-gigaword-50"}),
    ]

    all_predictions = []
    for item in gold_data:
        question = item['question']
        for method_label, qa_kwargs in classical_runs:
            outcome = get_answer_classical_nlp(question, all_documents_text, **qa_kwargs)
            all_predictions.append({
                'question': question,
                'predicted_answer': outcome['answer'],
                'method': method_label,
                'confidence': outcome['confidence'],
                'source_document': outcome['source_document']
            })

    # Hard-coded stand-ins for BERT output; no model is invoked for these.
    bert_simulated_predictions = [
        {'question': 'What is BERT?', 'predicted_answer': 'a language model developed by Google', 'method': 'BERT', 'confidence': 0.98, 'source_document': 'doc_qa_1'},
        {'question': 'The capital of France?', 'predicted_answer': 'Paris', 'method': 'BERT', 'confidence': 0.99, 'source_document': 'doc_qa_2'},
        {'question': 'How does quantum computing work?', 'predicted_answer': 'uses phenomena such as superposition and entanglement to perform computations', 'method': 'BERT', 'confidence': 0.97, 'source_document': 'doc_qa_3'},
        {'question': 'What is the best type of animal?', 'predicted_answer': 'Dogs are loyal.', 'method': 'BERT', 'confidence': 0.05, 'source_document': 'doc_qa_4'} # Low confidence for unanswerable
    ]
    all_predictions += bert_simulated_predictions

    # Aggregate scores per method.
    evaluation_scores = evaluate_qa_predictions(all_predictions, gold_data)

    for method, scores in evaluation_scores.items():
        print(f"\nMethod: {method}")
        print(f"  Average Exact Match (EM): {scores['average_em']:.4f}")
        print(f"  Average F1-score: {scores['average_f1']:.4f}")
        print("=" * 60)

    # Per-question breakdown of every prediction against its references.
    print("\n--- Detailed Predictions vs. Gold Standard ---")
    for item in gold_data:
        question = item['question']
        reference_answers = item['reference_answers']
        print(f"\nQuestion: '{question}'")
        print(f"  Reference Answers: {reference_answers}")

        predictions_for_question = [p for p in all_predictions if p['question'] == question]
        for pred in predictions_for_question:
            predicted = pred['predicted_answer']
            em = max(compute_exact_match(gold_ans, predicted) for gold_ans in reference_answers)
            f1 = max(compute_f1(gold_ans, predicted) for gold_ans in reference_answers)
            print(f"    - {pred['method']}: Predicted: '{predicted}' (EM: {em:.2f}, F1: {f1:.2f})")