# 1. Load SQUAD Dataset
 

In [1]:
from datasets import load_dataset

squad = load_dataset('squad', split='validation')

print(squad)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})


# 2. Imports for Base Model

In [2]:
import torch
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
import numpy as np
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from collections import Counter

# 3. Evaluate Base Model

In [5]:
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from datasets import load_dataset
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load the SentenceTransformer model for retrieval
retrieval_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the QA pipeline with the "deepset/roberta-base-squad2" model
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=0)

# Load the SQuAD dataset
squad = load_dataset('squad', split='validation')
squad = squad.select(range(100))

# Prepare storage for predictions and ground truth
predictions = []
references = []

# Evaluate the model on the SQuAD dataset
for example in tqdm(squad):
    question = example['question']
    context = example['context']
    reference_answer = example['answers']['text'][0]

    # Step 1: Retrieve relevant passages using SentenceTransformer
    query_embedding = retrieval_model.encode(question, convert_to_tensor=True)
    context_embedding = retrieval_model.encode(context, convert_to_tensor=True)

    # Compute cosine similarity between question and context
    similarity_score = util.pytorch_cos_sim(query_embedding, context_embedding).item()

    # Step 2: Use the QA pipeline to predict the answer
    if similarity_score > 0.5:  # Threshold to determine if context is relevant
        result = qa_pipeline({'question': question, 'context': context})
        predicted_answer = result['answer']
    else:
        predicted_answer = ""

    predictions.append(predicted_answer)
    references.append(reference_answer)

# Evaluation metrics computation (Accuracy, F1 Score, Precision, Recall, Jaccard Similarity, ROUGE Score, BLEU)
def compute_metrics(predictions, references):
    # Exact Match
    exact_matches = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    accuracy = sum(exact_matches) / len(exact_matches)

    # Precision and Recall
    precision = precision_score(references, predictions, average='macro', zero_division=0)
    recall = recall_score(references, predictions, average='macro', zero_division=0)
    
    # F1 Score
    f1 = f1_score(references, predictions, average='macro')


    # BLEU Score with smoothing
    smoothing_function = SmoothingFunction().method1
    bleu_scores = []
    for ref, pred in zip(references, predictions):
        reference = [ref.split()]  # BLEU expects a list of references
        candidate = pred.split()
        bleu = sentence_bleu(reference, candidate, weights=(0.5, 0.5), smoothing_function=smoothing_function)
        bleu_scores.append(bleu)
    avg_bleu = np.mean(bleu_scores)

    # ROUGE Score
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [rouge_scorer_instance.score(ref, pred) for ref, pred in zip(references, predictions)]
    rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    return accuracy, f1, precision, recall, avg_bleu, (rouge1, rouge2, rougeL)

# Compute metrics
accuracy, f1, precision, recall, avg_bleu, rouge_scores = compute_metrics(predictions, references)

# Print the results
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'BLEU Score: {avg_bleu}')
print(f'ROUGE-L: {rouge_scores[2]:.4f}')

100%|█████████████████████████████████████████| 100/100 [00:02<00:00, 43.25it/s]

Accuracy: 60.00%
F1 Score: 0.38
Precision: 0.41
Recall: 0.38
BLEU Score: 0.4937353411859368
ROUGE-L: 0.6577





# 4. Supplemental Imports for Hybrid Retrieval Model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import faiss

# 5. Evaluate Hybrid Retrieval Model

In [14]:
# Load the SQuAD dataset
squad = load_dataset('squad', split='validation')
squad = squad.select(range(100))

# SentenceTransformer model for embedding
retrieval_model = SentenceTransformer("all-MiniLM-L6-v2")

documents = squad['context']
document_embeddings = retrieval_model.encode(documents, show_progress_bar=True)

dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

# TF-IDF for Sparse Retrieval
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# QA pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=0)

# Hybrid concatenation mechanism
def hybrid_rag_answer(question, k=3, alpha=0.7, max_context_length=512):
    # Dense Retrieval
    question_embedding = retrieval_model.encode([question])
    distances, indices = index.search(question_embedding, k)
    dense_scores = 1 / (1 + distances)  # Convert distances to similarity scores
    
    # Sparse Retrieval
    query_tfidf = tfidf_vectorizer.transform([question])
    sparse_scores = (query_tfidf @ tfidf_matrix.T).toarray().flatten()
    
    # Combine Dense and Sparse Scores for the top k vector score retrievals
    sparse_scores_k = sparse_scores[indices.flatten()]
    combined_scores = alpha * dense_scores.flatten() + (1 - alpha) * sparse_scores_k
    
    # Retrieve the top k similar documents
    top_k_indices = indices.flatten()[np.argsort(combined_scores)[-k:][::-1]]
    retrieved_docs = [documents[i] for i in top_k_indices]
    
    context = " ".join(retrieved_docs)
    if len(context.split()) > max_context_length:
        context = " ".join(context.split()[:max_context_length])
    
    result = qa_pipeline(question=question, context=context)
    
    return result['answer']

predictions = []
references = []

# Evaluation metrics computation (Accuracy, F1 Score, Precision, Recall, MRR, Jaccard Similarity, ROUGE Score)
def compute_metrics(predictions, references):
    # Exact Match
    exact_matches = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    accuracy = sum(exact_matches) / len(exact_matches)

    # Precision and Recall
    precision = precision_score(references, predictions, average='macro', zero_division=0)
    recall = recall_score(references, predictions, average='macro', zero_division=0)
    
    # F1 Score
    f1 = f1_score(references, predictions, average='macro')

    # Mean Reciprocal Rank (MRR)
    reciprocal_ranks = []
    for ref, pred in zip(references, predictions):
        if pred in ref:
            reciprocal_ranks.append(1 / (references.index(ref) + 1))
        else:
            reciprocal_ranks.append(0)
    mrr = np.mean(reciprocal_ranks)

    # Jaccard Similarity
    jaccard_scores = []
    for ref, pred in zip(references, predictions):
        set_ref = set(ref.split())
        set_pred = set(pred.split())
        jaccard_score = len(set_ref.intersection(set_pred)) / len(set_ref.union(set_pred)) if len(set_ref.union(set_pred)) > 0 else 0
        jaccard_scores.append(jaccard_score)
    avg_jaccard = np.mean(jaccard_scores)

    # ROUGE Score
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [rouge_scorer_instance.score(ref, pred) for ref, pred in zip(references, predictions)]
    rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    return accuracy, f1, precision, recall, mrr, avg_jaccard, (rouge1, rouge2, rougeL)


# Hyperparameter Tuning (alpha, documents)
best_alpha = 0.5
best_k = 5
best_accuracy = 0
best_f1 = 0

# Variables to hold the best metric scores
best_metrics = None

for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
    for k in [3, 5, 7, 9]:
        predictions = []
        references = []

        for example in tqdm(squad):
            question = example['question']
            reference_answer = example['answers']['text'][0]
            predicted_answer = hybrid_rag_answer(question, k=k, alpha=alpha)
            
            predictions.append(predicted_answer)
            references.append(reference_answer)

        accuracy, f1, precision, recall, mrr, avg_jaccard, rouge_scores = compute_metrics(predictions, references)
        
        if accuracy > best_accuracy:
            best_alpha = alpha
            best_k = k
            best_accuracy = accuracy
            best_f1 = f1
            best_metrics = (accuracy, f1, precision, recall, mrr, avg_jaccard, rouge_scores)

print(f'Best Alpha: {best_alpha}, Best k: {best_k}')
# Print only the best metrics
if best_metrics:
    best_accuracy, best_f1, best_precision, best_recall, best_mrr, best_avg_jaccard, best_rouge = best_metrics
    best_rouge1, best_rouge2, best_rougeL = best_rouge
    print(f'Accuracy: {best_accuracy * 100:.2f}%')
    print(f'F1 Score: {best_f1:.2f}')
    print(f'Precision: {best_precision:.2f}')
    print(f'Recall: {best_recall:.2f}')
    print(f'ROUGE-L: {best_rougeL:.2f}')


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

100%|█████████████████████████████████████████| 100/100 [00:03<00:00, 32.17it/s]
100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 22.76it/s]
100%|█████████████████████████████████████████| 100/100 [00:08<00:00, 12.35it/s]
100%|█████████████████████████████████████████| 100/100 [00:06<00:00, 14.74it/s]
100%|█████████████████████████████████████████| 100/100 [00:03<00:00, 33.12it/s]
100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 23.30it/s]
100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 20.68it/s]
100%|█████████████████████████████████████████| 100/100 [00:05<00:00, 19.10it/s]
100%|█████████████████████████████████████████| 100/100 [00:03<00:00, 32.70it/s]
100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 23.21it/s]
100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 20.59it/s]
100%|█████████████████████████████████████████| 100/100 [00:05<00:00, 18.84it/s]
100%|███████████████████████

Best Alpha: 0.1, Best k: 3
Accuracy: 64.00%
F1 Score: 0.42
Precision: 0.42
Recall: 0.47
ROUGE-L: 0.71



