In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line in os.walk() traversal order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install PyPDF2 tqdm sentence-transformers numpy faiss-cpu

import os
import json
from tqdm import tqdm
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [None]:
# -------- CONFIG --------

# Directory scanned by load_documents() for .pdf / .txt source files.
DATA_DIR = "/kaggle/input/291-rag-dataset/Main"
# Output file for the list of evaluation queries (written in Step 2).
REQUESTS_PATH = "requests.json"
# Output file for the hand-labelled relevant documents per query (written in Step 4).
MANUAL_BASELINE_PATH = "manual_baseline.json"
# Output file for the document ids retrieved by FAISS per query (written in Step 5).
RESULTS_PATH = "faiss_results.json"
# Number of neighbours requested from the index and the cut-off k used by the evaluation.
TOP_K = 8

# -------- STEP 1: LOAD DOCUMENTS --------

def load_documents(data_dir):
    """Read every ``.pdf`` and ``.txt`` file located directly inside ``data_dir``.

    Args:
        data_dir: Directory to scan. Sub-directories are skipped (no recursion).

    Returns:
        A list of ``{"id": <file name>, "text": <extracted text>}`` dicts in sorted
        file-name order. Only documents whose stripped text is longer than 200
        characters are kept.

    A PDF that fails to parse is reported with ``print``; any text extracted before
    the failure is kept, so a single bad file does not abort the whole load.
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the document
    # positions (and therefore the index ids built from them) reproducible across runs.
    for file in tqdm(sorted(os.listdir(data_dir)), desc="Loading documents"):
        file_path = os.path.join(data_dir, file)
        if not os.path.isfile(file_path):
            continue  # a directory named like "x.txt" must not be opened as a file

        lowered_name = file.lower()  # accept ".PDF" / ".TXT" as well
        text = ""
        if lowered_name.endswith(".pdf"):
            page_texts = []
            try:
                reader = PdfReader(file_path)
                for page in reader.pages:
                    # extract_text() may return None for pages without a text layer
                    page_texts.append(page.extract_text() or "")
            except Exception as e:
                print(f"Error reading {file}: {e}")
            text = "".join(page_texts)
        elif lowered_name.endswith(".txt"):
            # Explicit encoding: the platform default is not guaranteed to be UTF-8.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

        if len(text.strip()) > 200:  # Filter trivial data
            docs.append({"id": file, "text": text})
    return docs

documents = load_documents(DATA_DIR)
print(f"Loaded {len(documents)} valid documents.")

# Fail fast with a clear message: without any documents there is nothing to embed or
# index, and the later steps would otherwise fail with a much less obvious error.
if not documents:
    raise RuntimeError(f"No usable .pdf/.txt documents were loaded from {DATA_DIR!r}")

In [None]:
# -------- STEP 2: CREATE REQUEST SET --------

# Evaluation queries. Each string is also used verbatim as a key of the manual baseline
# and of the retrieval results, so the text must match those keys character for character.
# (The second entry previously ended with a typographic quote, which left the string
# literal unterminated and made the whole cell a SyntaxError.)
requests = [
    "Which groups are considered at highest risk of severe influenza complications, and what medical or preventive interventions are recommended for them?",
    "What are the major risk factors and early symptoms of myocardial infarction, and how does public awareness impact treatment outcomes?",
    "How can early diagnosis, evolving clinical definitions, and global prevention strategies improve sepsis outcomes and reduce mortality worldwide?",
    "Based on the collective perspective of recent literature, what are the major biological, technological, and sociopolitical barriers that still hinder full malaria eradication despite modern advances?",
    "What evidence exists across sources that mass vaccination programs have transformed global mortality and morbidity patterns, and what indicators suggest this progress is at risk?",
    "How do logistical, social, and structural barriers affect vaccine delivery in rural or underserved regions, and what community-based interventions have shown promise in mitigating these obstacles?",
    "Which emerging therapies, including gene therapy and molecular targeted approaches, show promise in revolutionizing overall cancer treatment paradigms?",
    "Which innovations in smart healthcare systems are improving disease diagnosis and clinical decision-making?",
    "What key social and environmental factors have been identified as influencing mental health outcomes, and how do recent studies suggest addressing them?",
    "What role do digital technologies and mobile platforms play in improving diabetes management and patient engagement?",
    "What is insulin, how does it regulate blood glucose, and how has insulin therapy evolved over time?",
]

# Serialize the query list as 2-space-indented JSON and write it to REQUESTS_PATH.
with open(REQUESTS_PATH, "w") as requests_file:
    requests_file.write(json.dumps(requests, indent=2))

In [None]:
# -------- STEP 3: EMBEDDINGS & FAISS INDEX --------

# Sentence-embedding model used for both the documents and the queries.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Encode documents: one embedding per document, computed from its whole text string.
# NOTE(review): the model presumably truncates inputs beyond its maximum sequence length,
# so long documents may be represented only by their opening passage -- confirm, and
# consider chunking the documents if that is the case.
doc_texts = [d["text"] for d in documents]
doc_embeddings = model.encode(doc_texts, show_progress_bar=True, convert_to_numpy=True)
embedding_dim = doc_embeddings.shape[1]

# Build FAISS index (cosine similarity)
# The inner product of L2-normalized vectors equals their cosine similarity, so the
# embeddings are normalized (in place; the return value is not used) before being added
# to the flat inner-product index. Row i of the index corresponds to documents[i].
index = faiss.IndexFlatIP(embedding_dim)  # Inner product for cosine similarity
faiss.normalize_L2(doc_embeddings)        # Normalize embeddings for cosine similarity
index.add(doc_embeddings)

# Encode queries
# NOTE: Step 5 re-encodes every query itself inside its timing loop and does not read
# query_embeddings; this batch encoding is kept here but is not consumed by that step.
query_embeddings = model.encode(requests, convert_to_numpy=True)
faiss.normalize_L2(query_embeddings)

In [None]:
# -------- STEP 4: MANUAL BASELINE --------

# Hand-labelled ground truth: query text -> file names of the documents judged relevant.
# Keys are matched verbatim against the retrieval results and values against document ids.
# NOTE(review): several names contain ':' or '?' -- verify they match the file names
# actually present in the dataset directory.
manual_baseline = {
    "Based on the collective perspective of recent literature, what are the major biological, technological, and sociopolitical barriers that still hinder full malaria eradication despite modern advances?": [
        "WHO-guidelines-for-malaria.pdf",
        "Malaria - Past, Present and Future.pdf",
        "An update on Malaria.txt",
        "Malaria - An Overview.pdf",
        "Malaria Vaccines - Recent Advances and New Horizons.pdf",
    ],
    "What evidence exists across sources that mass vaccination programs have transformed global mortality and morbidity patterns, and what indicators suggest this progress is at risk?": [
        "Vaccines and immunization - WHO.txt",
        "A guide to vaccinology from basic principles to new developments.pdf",
        "Childhood Vaccination Has Saved Millions of Lives, but Rising Hesitancy Could Reverse Decades of Progress.txt",
        "Enrolling a rural community pharmacy as a Vaccines for Children provider to increase HPV vaccination: a feasibility study.pdf",
        "Vaccine Safety - Examine the Evidence.txt",
    ],
    "How do logistical, social, and structural barriers affect vaccine delivery in rural or underserved regions, and what community-based interventions have shown promise in mitigating these obstacles?": [
        "Enrolling a rural community pharmacy as a Vaccines for Children provider to increase HPV vaccination: a feasibility study.pdf",
        "A guide to vaccinology from basic principles to new developments.pdf",
        "Childhood Vaccination Has Saved Millions of Lives, but Rising Hesitancy Could Reverse Decades of Progress.txt",
        "Vaccines and immunization - WHO.txt",
        "Vaccine Safety - Examine the Evidence.txt",
    ],
    "Which groups are considered at highest risk of severe influenza complications, and what medical or preventive interventions are recommended for them?": [
        "Flu CDC.txt",
        "Influenza (seasonal) WHO.txt",
        "Influenza and Influenza Vaccine_ A Review.pdf",
        "Influenza Treatment and Prevention.txt",
        "Understanding the symptoms of the common cold and influenza.pdf",
    ],
    "What are the major risk factors and early symptoms of myocardial infarction, and how does public awareness impact treatment outcomes?": [
        "Analyzing_heart_attack.txt",
        "Heart_Attacks.txt",
        "Signs_Symptoms_Heart_Attack.txt",
        "Understanding myocardial infarction.pdf",
        "The global prevalence of myocardial infarction_ a systematic review and meta-analysis.pdf",
    ],
    "How can early diagnosis, evolving clinical definitions, and global prevention strategies improve sepsis outcomes and reduce mortality worldwide?": [
        "Sepsis WHO.txt",
        "SEPSIS DEFINITION - WHATS NEW  IN THE TREATMENT GUIDELINES.txt",
        "Immune dysregulation in sepsis experiences, lessons and perspectives.pdf",
        "Subtypes and Mimics of Sepsis.pdf",
        "Potential biomarker for diagnosis and therapy of sepsis - Lactylation.txt",
    ],
    "Which emerging therapies, including gene therapy and molecular targeted approaches, show promise in revolutionizing overall cancer treatment paradigms?": [
        "Revolutionizing cancer care strategies: immunotherapy, gene therapy, and molecular targeted therapy.pdf",
        "The emerging clinical relevance of genomics in cancer medicine.pdf",
        "Predictive, personalized, preventive, participatory (P4) cancer medicine.pdf",
        "Immunotherapy and prevention of pancreatic cancer.pdf",
        "Tumor microenvironment: recent advances in various cancer treatments.pdf",
    ],
    "Which innovations in smart healthcare systems are improving disease diagnosis and clinical decision-making?": [
        "Disease Diagnosis in Smart Healthcare: Innovation, Technologies and Applications.pdf",
        "Machine learning for precision medicine1.pdf",
        "Artificial intelligence in healthcare - transforming the practice of medicine.txt",
        "Harnessing the Digital Revolution: A Comprehensive Re- view of mHealth Applications for Remote Monitoring in  Transforming Healthcare Delivery.pdf",
        "An Ensemble of Deep Convolutional Neural Networks for Alzheimer’s Disease Detection and Classification.pdf",
    ],
    "What key social and environmental factors have been identified as influencing mental health outcomes, and how do recent studies suggest addressing them?": [
        "What is mental health? Evidence towards a new definition from a mixed methods multidisciplinary international survey.pdf",
        "What is good mental health? A scoping review.pdf",
        "MENTAL HEALTH AND MENTAL DISORDERS - A RURAL CHALLENGE: A LITERATURE REVIEW.pdf",
        "The Use of Social Networking Sites in Mental Health Interventions for Young People: Systematic Review.pdf",
        "No_health_without mental health.pdf",
    ],
    "What role do digital technologies and mobile platforms play in improving diabetes management and patient engagement?": [
        "Standards of Medical Care in Diabetes.pdf",
        "Accessibility and Openness to Diabetes Management Support With Mobile Phones-Survey Study of People With Type 1 Diabetes Using Advanced Diabetes Technologies.pdf",
        "Role_Of_Digital_Health_Technology_Interventions.txt",
        "Digital technologies for prediabetes_ A systematic review and meta-analysis.pdf",
        "Open-source Web Portal for Managing Self-reported Data and Real-world Data Donation in Diabetes Research- Platform Feasibility Study.pdf",
    ],
    "What is insulin, how does it regulate blood glucose, and how has insulin therapy evolved over time?": [
        "Evolution_of_Insulin.txt",
        "100 Years of Insulin.pdf",
        "Standards of Medical Care in Diabetes.pdf",
        "Diabetes_ Recent Advances and Future Perspectives.pdf",
        "Accessibility and Openness to Diabetes Management Support With Mobile Phones-Survey Study of People With Type 1 Diabetes Using Advanced Diabetes Technologies.pdf",
    ],
}

# Serialize the baseline mapping as 2-space-indented JSON and write it to MANUAL_BASELINE_PATH.
with open(MANUAL_BASELINE_PATH, "w") as baseline_file:
    baseline_file.write(json.dumps(manual_baseline, indent=2))

In [None]:
# -------- STEP 5: RETRIEVAL --------

import time

# results maps each query string to the ids of its retrieved documents, best match first.
results = {}
total_time = 0.0

# FAISS pads the result with index -1 when asked for more neighbours than the index holds.
# A -1 is a valid (but wrong) Python list index that would silently select the last
# document, so cap k at the index size and additionally drop any negative ids below.
search_k = min(TOP_K, index.ntotal)

for query in requests:
    # perf_counter() is a monotonic high-resolution clock meant for measuring intervals;
    # time.time() follows the wall clock and can jump when the system time is adjusted.
    start_time = time.perf_counter()

    # encode the query (deliberately inside the timed region: latency = encode + search)
    q_emb = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    # FAISS search
    if search_k > 0:
        _, neighbour_ids = index.search(q_emb, search_k)
        retrieved_ids = [documents[i]["id"] for i in neighbour_ids[0] if i >= 0]
    else:
        retrieved_ids = []

    total_time += time.perf_counter() - start_time
    results[query] = retrieved_ids

# Total time spent encoding and searching across all queries, in seconds.
print(total_time)

# Guard the division so an empty query list reports 0 instead of raising ZeroDivisionError.
avg_latency = total_time / len(requests) if requests else 0.0
print(f"Average retrieval latency per query: {avg_latency:.4f} seconds")

with open(RESULTS_PATH, "w") as f:
    json.dump(results, f, indent=2)

print(f"Saved FAISS retrieval results to {RESULTS_PATH}")

In [None]:
# -------- STEP 6: EVALUATION --------

import numpy as np
from sklearn.metrics import ndcg_score

def _mean_or_zero(values):
    """Return the arithmetic mean of ``values`` as a float, or 0.0 for an empty list."""
    return float(np.mean(values)) if values else 0.0


def _ndcg_at_k(hits, num_relevant, k):
    """Binary-relevance nDCG@k for one ranked hit list (1 = relevant, 0 = not relevant).

    DCG discounts each hit by log2(rank + 1). The ideal DCG assumes that
    min(num_relevant, k) relevant documents occupy the top ranks, so relevant
    documents that were never retrieved lower the score.
    """
    ideal_hits = min(num_relevant, k)
    if ideal_hits <= 0 or not any(hits):
        return 0.0
    discounts = 1.0 / np.log2(np.arange(2, max(len(hits), ideal_hits) + 2))
    dcg = float(np.dot(hits, discounts[:len(hits)]))
    idcg = float(discounts[:ideal_hits].sum())
    return dcg / idcg


def evaluate_retrieval_v2(results, manual_baseline, k):
    """Score ranked retrieval results against a hand-labelled baseline.

    Args:
        results: Mapping of query -> ranked list of retrieved document ids (best first).
            Queries missing from this mapping are scored as if nothing was retrieved.
        manual_baseline: Mapping of query -> list of relevant document ids. Only the
            queries present here are evaluated.
        k: Rank cut-off; only the first ``k`` retrieved ids of each query are considered.

    Returns:
        A dict with the macro-averaged metrics "Precision@k", "Recall@k", "F1@k", "MRR",
        "HitRate", "nDCG@k", "Accuracy@1", "Precision@1" and "Recall@1" (plain floats,
        0.0 when the baseline is empty), plus "examples": a per-query dict holding the
        retrieved ids, the relevant ids and the per-query metric values.
    """
    recall_scores, precision_scores, f1_scores, mrr_scores = [], [], [], []
    hit_rate_scores, ndcg_scores = [], []
    precision_at_1, recall_at_1 = [], []
    qualitative = {}

    for q, relevant_docs in manual_baseline.items():
        retrieved = results.get(q, [])[:k]
        relevant_set = set(relevant_docs)  # O(1) membership tests

        # Binary relevance of each retrieved document, in rank order.
        hits = [1 if doc in relevant_set else 0 for doc in retrieved]
        num_hits = sum(hits)

        # Recall@k, Precision@k and their harmonic mean.
        recall_k = num_hits / len(relevant_docs) if relevant_docs else 0.0
        precision_k = num_hits / len(retrieved) if retrieved else 0.0
        pr_sum = precision_k + recall_k
        f1_k = (2 * precision_k * recall_k) / pr_sum if pr_sum > 0 else 0.0

        # Reciprocal rank of the first relevant document; hit rate is 1 if there is one.
        rank = next((pos for pos, hit in enumerate(hits, start=1) if hit), None)
        mrr = 1.0 / rank if rank else 0.0
        hit = 1.0 if rank else 0.0

        # nDCG@k computed from the actual rank positions. Scoring every retrieved document
        # with the same constant would make the value independent of the ranking.
        ndcg = _ndcg_at_k(hits, len(relevant_set), k)

        # Precision@1 / Recall@1 depend only on whether the top result is relevant.
        top1_is_relevant = bool(hits) and hits[0] == 1
        precision1 = 1.0 if top1_is_relevant else 0.0
        recall1 = 1.0 / len(relevant_docs) if top1_is_relevant else 0.0

        recall_scores.append(recall_k)
        precision_scores.append(precision_k)
        f1_scores.append(f1_k)
        mrr_scores.append(mrr)
        hit_rate_scores.append(hit)
        ndcg_scores.append(ndcg)
        precision_at_1.append(precision1)
        recall_at_1.append(recall1)

        qualitative[q] = {
            "retrieved": retrieved,
            "relevant": relevant_docs,
            "recall@k": recall_k,
            "precision@k": precision_k,
            "f1@k": f1_k,
            "mrr": mrr,
            "hit": hit,
            "ndcg@k": ndcg,
            "precision@1": precision1,
            "recall@1": recall1
        }

    metrics = {
        "Precision@k": _mean_or_zero(precision_scores),
        "Recall@k": _mean_or_zero(recall_scores),
        "F1@k": _mean_or_zero(f1_scores),
        "MRR": _mean_or_zero(mrr_scores),
        "HitRate": _mean_or_zero(hit_rate_scores),
        "nDCG@k": _mean_or_zero(ndcg_scores),
        # Share of queries whose top result is relevant (same quantity as Precision@1).
        "Accuracy@1": _mean_or_zero(precision_at_1),
        "Precision@1": _mean_or_zero(precision_at_1),
        "Recall@1": _mean_or_zero(recall_at_1),
        "examples": qualitative
    }

    return metrics

# Score the retrieval results against the manual baseline at cut-off TOP_K,
# then render the metrics dict as 2-space-indented JSON and print it.
evaluation_v2 = evaluate_retrieval_v2(results=results, manual_baseline=manual_baseline, k=TOP_K)
evaluation_report = json.dumps(evaluation_v2, indent=2)
print(evaluation_report)