In [None]:
# Uncomment the following block to install required libraries
# """
# !pip install langchain chromadb sentence-transformers
# !pip install  openai tiktoken
# !pip install jq
# !pip install faiss
# !pip install pymilvus
# """

In [None]:
!pip install ujson
import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection


In [None]:
!pip install llama-index-core transformers torch
!pip install llama-index-postprocessor-colbert-rerank
!pip install llama-index-embeddings-huggingface

In [None]:
!pip install -qU bitsandbytes transformers accelerate

In [None]:
from transformers import BitsAndBytesConfig
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
# Mount Google Drive at /content/drive; the PubMed JSON loaded further down is
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')




* Load the PubMed articles from the JSON file. To prepare the JSON file, please refer to the script `download_pubmed.py`

In [None]:
!pip install -U langchain langchain-community
!pip install jq
#load public article from json
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the publication date parts and the article title into ``metadata``.

    Intended to be passed to ``JSONLoader`` as its ``metadata_func``.

    Args:
        record: One article record from the JSON file. ``pub_date`` is read as
            a mapping with optional ``year``/``month``/``day`` keys and
            ``article_title`` as the title.
        metadata: Metadata dict for the document being built; updated in place.

    Returns:
        The same ``metadata`` dict with ``year``, ``month``, ``day`` and
        ``title`` set (``None`` for any value the record does not provide).
    """
    # ``pub_date`` may be absent or null. Fall back to an empty mapping so one
    # incomplete record does not abort the whole load with AttributeError.
    pub_date = record.get("pub_date") or {}
    for part in ("year", "month", "day"):
        metadata[part] = pub_date.get(part)
    metadata["title"] = record.get("article_title")

    return metadata

# Build one document per article: `.[]` iterates the top-level JSON array,
# `article_abstract` becomes the page content, and metadata_func adds the
# date parts and title to each document's metadata.
loader = JSONLoader(
    file_path='/content/drive/MyDrive/finalproject/pubmed_article.json',
    jq_schema='.[]',
    content_key='article_abstract',
    metadata_func=metadata_func)
data = loader.load()
print(f"{len(data)} pubmed articles are loaded!")
# Display the second loaded document as a quick sanity check.
data[1]

- Chunk abstracts into small text passages so that retrieval stays efficient and the retrieved passages fit within the LLM context length.

In [None]:
!pip install tiktoken
# Chunk only the page content (the abstract text).
from langchain.text_splitter import TokenTextSplitter,CharacterTextSplitter
# 200-token chunks with a 64-token overlap between neighbouring chunks.
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=64)
chunks = text_splitter.split_documents(data)
print(f"{len(data)} pubmed articles are converted to {len(chunks)} text fragments!")
# Display the first chunk as a quick sanity check.
chunks[0]

In [None]:
# Option a: using all-mpnet from SentenceTransformer
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# # Option b: using e5-large-unspupervised from huggingface
# from langchain.embeddings import HuggingFaceEmbeddings
# modelPath = "intfloat/e5-large-unsupervised"
# embeddings = HuggingFaceEmbeddings(
#   model_name = modelPath,
#   model_kwargs = {'device':'cuda'},
#   encode_kwargs={'normalize_embeddings':False}
# )



# from langchain.embeddings import SentenceTransformerEmbeddings

# embeddings = SentenceTransformerEmbeddings(
#     model_name="intfloat/e5-large-v2",
#     model_kwargs={"device": "cuda"}
# )


In [None]:
!pip install faiss-gpu-cu12


In [None]:

!pip install chromadb
import os

from langchain.embeddings import HuggingFaceEmbeddings

# Option a: Using chroma database
from langchain.vectorstores import Chroma

# Load with the correct 1024-dimensional embedding model.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

CHROMA_DIR = "./chroma_db"

# Chroma.from_documents() adds to whatever is already stored in the persist
# directory, so re-running this cell would index every chunk again and
# retrieval would start returning duplicates. Reopen the persisted index when
# one exists; delete CHROMA_DIR to force a rebuild (e.g. after changing the
# chunking parameters or the embedding model).
if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
else:
    db = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=CHROMA_DIR
    )

# The alternatives below are kept as comments (not as a bare string literal,
# which the notebook would echo as the cell's output).

# Option b (disabled): Using Milvus database
# To run the following code, you should have a milvus instance up and running
# Follow the instructions in the following link: https://milvus.io/docs/install_standalone-docker.md
# from langchain.vectorstores import Milvus
# db = Milvus.from_documents(
#     chunks,
#     embeddings,
#     connection_args={"host": "127.0.0.1", "port": "19530"},
# )

# Option c (disabled): Using faiss index
# from langchain.vectorstores import FAISS
# db = FAISS.from_documents(chunks, embeddings)

In [None]:
# Install required packages
# First uninstall existing versions
# !pip install -U bitsandbytes

# # Install with CUDA 11.x support (works for most systems)
# !pip install -U bitsandbytes>=0.41.1
# !pip install -U transformers accelerate



# Log in to the Hugging Face Hub (interactive prompt).
# NOTE(review): presumably required to download the model below - confirm.
from huggingface_hub import notebook_login
notebook_login()

# # Verify installation
# import bitsandbytes
# print(f"bitsandbytes version: {bitsandbytes.__version__}")

# Import libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)
from langchain_community.llms import HuggingFacePipeline

# The 4-bit quantization config below is disabled and is not passed to
# from_pretrained(), so no quantization is applied when the model is loaded.
# # 4-bit quantization config
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4"
# )

# Load model (Mistral-7B)
model_id = "mistralai/Mistral-7B-v0.1"


# NOTE: the reranker cell later rebinds the names `tokenizer` and `model`;
# `pipe` below keeps the objects created here.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,

    device_map="auto"
)

# Create pipeline: sampled generation at temperature 0.1, capped at 64 new
# tokens per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,
    temperature=0.1,
    do_sample=True
)

# LangChain wrapper
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import time

# PROMPT 1: answer strictly from the retrieved context, rephrasing allowed.
PROMPT_TEMPLATE_1 = """Answer the question based only on the following context:
{context}
You are allowed to rephrase the answer based on the context.
Question: {question}
"""
PROMPT1 = PromptTemplate.from_template(PROMPT_TEMPLATE_1)

# # PROMPT 2 (earlier wording, superseded by the revised template below)
# PROMPT_TEMPLATE_2="Your are a medical assistant for question-answering tasks. Answer the Question using the provided Contex only. Your answer should be in your own words and be no longer than 128 words. \n\n Context: {context} \n\n Question: {question} \n\n Answer:"
# PROMPT2 = PromptTemplate.from_template(PROMPT_TEMPLATE_2)

# 1. Revised prompt: shorter instructions and an explicit 100-word cap.
PROMPT_TEMPLATE_2 = """Answer based ONLY on:
Context: {context}
Question: {question}
Concise medical answer (max 100 words):"""
PROMPT2 = PromptTemplate.from_template(PROMPT_TEMPLATE_2)


# # 3. Run the query
# start_time = time.time()
# result = qa_chain.invoke({"query": "Alzheimer's treatments"})
# print(f"Time: {time.time() - start_time:.2f}s")
# print(result["result"])
# PROMPT 3
# from langchain import hub
# PROMPT3 = hub.pull("rlm/rag-prompt", api_url="https://api.hub.langchain.com")

# # RAG pipeline (disabled; qa_chain is not defined anywhere else in this cell)
# qa_chain = RetrievalQA.from_chain_type(
#     llm,
#     retriever=db.as_retriever(k=4),
#     chain_type_kwargs={"prompt": PROMPT2},
#     return_source_documents=True
# )


In [None]:
# start_time = time.time()
# # query = "What are the safest cryopreservation methods?"
# query="What are the recent advancements in the treatment of Alzheimer’s disease?"
# result = qa_chain({"query": query})
# print(f"\n--- {time.time() - start_time} seconds ---")

In [None]:
# # Define the langchain pipeline for llm only
# from langchain.prompts import PromptTemplate
# PROMPT_TEMPLATE ="""Answer the given Question only. Your answer should be in your own words and be no longer than 100 words. \n\n Question: {question} \n\n
# Answer:
# """
# PROMPT = PromptTemplate.from_template(PROMPT_TEMPLATE)
# llm_chain = PROMPT | llm
# start_time = time.time()
# result = llm_chain.invoke({"question": query})
# print(f"\n--- {time.time() - start_time} seconds ---")
# print(result)

In [None]:
# import torch
# print(torch.cuda.is_available())


# #Indexing


# nbits = 2   # encode each dimension with 2 bits
# doc_maxlen = 128 # truncate passages at 128 tokens
# max_id = 20
# checkpoint = "colbert-ir/colbertv2.0"


# collection = [chunk.page_content for chunk in chunks[:max_id]]  # keep only the first max_id chunks (max_id = 20 above)


# index_name = 'pubmed_index_small'

# with Run().context(RunConfig(nranks=1, experiment="/content/drive/MyDrive/finalproject/exp_pubmed")):

#     config = ColBERTConfig(doc_maxlen=128, nbits=2, kmeans_niters=2)
#     indexer = Indexer(checkpoint=checkpoint, config=config)
#     indexer.index(name=index_name, collection=collection, overwrite=True)

In [None]:
!pip install -U sentence-transformers
from sentence_transformers import CrossEncoder


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings




# Cross-encoder that rerank_bge() uses to score (query, passage) pairs.
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`, which
# the LLM cell also assigns. The text-generation pipeline keeps its own
# references, but re-running the LLM cell after this one would make
# rerank_bge() pick up the causal LM instead of the reranker.
rerank_model_name = "BAAI/bge-reranker-base"
tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
# Fall back to EOS for padding only when the tokenizer has no pad token of its
# own; unconditionally overwriting it would make the padding id disagree with
# the one the model was configured with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
def rerank_bge(query, documents, top_k, verbose=True):
    """Rerank retrieved documents with the cross-encoder and keep the best ``top_k``.

    Every document is scored against ``query`` using the notebook-level
    ``tokenizer`` / ``model`` pair. The results are sorted by score (highest
    first) and de-duplicated by ``metadata["title"]`` so that at most one
    chunk per titled article is returned.

    Args:
        query: The question text.
        documents: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict), e.g. the documents returned by a retriever.
        top_k: Maximum number of documents to return. Values <= 0 yield an
            empty list.
        verbose: When True (the default), print rank, score, title and a
            300-character preview of each selected document.

    Returns:
        The selected documents, best score first.
    """
    if top_k <= 0:
        return []

    scored = []
    with torch.no_grad():
        for doc in documents:
            inputs = tokenizer(
                query, doc.page_content,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(model.device)
            # .item() needs a single-element tensor: one logit per pair,
            # used directly as the relevance score.
            scored.append((doc, model(**inputs).logits.squeeze().item()))

    # Sort by score, descending (stable, so ties keep retrieval order).
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Deduplicate by title (for this query).
    seen_titles = set()
    reranked_unique = []
    for doc, score in scored:
        title = doc.metadata.get("title", None)
        if title:
            if title in seen_titles:
                continue
            seen_titles.add(title)
        # Documents without a title cannot be de-duplicated, so they are kept
        # instead of being silently dropped from the results.
        reranked_unique.append((doc, score))

        if len(reranked_unique) >= top_k:
            break

    if verbose:
        for i, (doc, score) in enumerate(reranked_unique):
            print(f"Reranked #{i+1} | Score: {score:.4f}")
            print(f"Title: {doc.metadata.get('title')}")
            print(f"Content: {doc.page_content[:300]}...\n")

    return [doc for doc, _ in reranked_unique]


# query = "What are the recent advancements in the treatment of Alzheimer’s disease?"
# retrieved_docs = db.as_retriever(k=8).get_relevant_documents(query)

# # rerank
# reranked_docs = rerank_bge(query, retrieved_docs, top_k=7)


# print(f"\n Query: {query}")
# print(f" Top {len(reranked_docs)} reranked results:\n")
# for i, doc in enumerate(reranked_docs):
#     print(f"Reranked #{i+1}:")
#     print(f"Title: {doc.metadata.get('title')}")
#     print(f"Content: {doc.page_content[:300]}...\n")


In [None]:
# List the contents of ./chroma_db, the persist directory given to Chroma
# above, to check that the index was written to disk.
import os
os.listdir("./chroma_db")


In [None]:
!pip install ragas

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import Faithfulness
import asyncio


In [None]:
import ragas
import langchain_core
import importlib.metadata  # Python 3.8+

def installed_version(dist_name):
    """Return the installed version of ``dist_name``, or "not installed".

    Args:
        dist_name: Distribution name as published on the package index,
            e.g. ``"langchain-core"``.

    Returns:
        The version string, or the literal ``"not installed"`` when the
        distribution cannot be found (instead of raising).
    """
    try:
        return importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        return "not installed"


# Method 1: Using importlib (most reliable)
print(f"RAGAS version: {installed_version('ragas')}")
print(f"LangChain Core: {installed_version('langchain-core')}")
print(f"LangChain OpenAI: {installed_version('langchain-openai')}")

# Method 2: list every installed ragas / langchain distribution. This uses
# the standard library rather than pip's private internals, which are not a
# supported API, and needs no shell fallback.
print("\nAll installed packages:")
for dist in sorted(
    importlib.metadata.distributions(),
    key=lambda d: (d.metadata["Name"] or "").lower(),
):
    name = dist.metadata["Name"]
    if name and ('ragas' in name.lower() or 'langchain' in name.lower()):
        print(f"{name}=={dist.version}")

In [None]:
# Installation (if needed)
!pip install -U ragas langchain-openai datasets

import os
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import context_precision
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Retrieval evaluation: context precision of the Chroma retriever + BGE
# reranker on four benchmark questions, judged by GPT-4o through RAGAS.

queries = [
    "How does sonodynamic therapy (SDT) differ from conventional antibiotics in terms of combating multidrug-resistant bacterial infections?",
    "Does zebrafish possess an ortholog or paralog of mammalian calprotectin that exhibits antimicrobial activity and can activate inflammation via Toll-like receptor 4?",
    "What is the current understanding of gray matter alterations in patients with vestibular migraine?",
    "How do nickel clusters improve the hydrogen evolution reaction (HER) and what potential applications does this have in renewable energy conversion?",
]

# Ground-truth answers, aligned index-by-index with `queries`.
reference_answers = [
    " Sonodynamic therapy (SDT) leverages the generation of reactive oxygen species to inflict multifaceted damage on bacterial cells, reducing the likelihood of developing drug resistance, unlike conventional antibiotics.",
    "No, zebrafish do not have an ortholog of either mammalian S100A8 or S100A9, and none of the identified zebrafish s100 proteins exhibited antimicrobial activity comparable to mammalian calprotectin. Additionally, none of the zebrafish proteins activated inflammation via Toll-like receptor 4, suggesting that similar proteins have not convergently evolved analogous functions.",
    "The current understanding of gray matter alterations in patients with vestibular migraine remains lacking, despite the growing amount of neuroimaging data in recent decades.",
    "clusters considerably improve the hydrogen evolution reaction (HER), indicating their promise for renewable energy conversion."
]

# 1. Retrieve 15 candidates per query, rerank them and keep the best 4.
# The candidate count has to go through `search_kwargs`: a bare `k=` keyword
# is not applied by as_retriever(), so the retriever would fall back to its
# default number of hits and the reranker would have nothing extra to choose
# from. `invoke` replaces the deprecated `get_relevant_documents`.
retriever = db.as_retriever(search_kwargs={"k": 15})
all_contexts = []
for query in queries:
    retrieved_docs = retriever.invoke(query)
    reranked_docs = rerank_bge(query, retrieved_docs, top_k=4)
    all_contexts.append([doc.page_content for doc in reranked_docs])

# 2. Initialize wrappers
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# 3. Define the test data (no generated answers: this cell scores retrieval only)
test_data = {
    "question": queries,
    "contexts": all_contexts,
    "ground_truth": reference_answers
}

# 4. Convert to Dataset format
dataset = Dataset.from_dict(test_data)

# 5. Evaluate context precision
print("\n evaluating retrieval quality...")
results = evaluate(
    dataset,
    metrics=[context_precision],
    llm=evaluator_llm
)

# 6. Report the per-question score and a preview of the top two contexts
print("\n Evaluation Results:")
print("="*50)
for i, (query, score) in enumerate(zip(queries, results['context_precision'])):
    print(f"\nQuestions {i+1}: {query}...")
    print(f"Context Precision: {score:.2f}/1.0")
    print(f" Number of relevant documents:: {len(all_contexts[i])}")
    print(" Most relevant document summaries:")
    for j, ctx in enumerate(all_contexts[i][:2]):
        print(f"  {j+1}. {ctx[:80]}...")
print("="*50)



In [None]:
# Installation (if needed)
!pip install -U ragas langchain-openai datasets

import os
from ragas.metrics import context_recall
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Retrieval evaluation: context recall of the Chroma retriever + BGE reranker
# on four benchmark questions, judged by GPT-4o through RAGAS.

queries = [
    "How does sonodynamic therapy (SDT) differ from conventional antibiotics in terms of combating multidrug-resistant bacterial infections?",
    "Does zebrafish possess an ortholog or paralog of mammalian calprotectin that exhibits antimicrobial activity and can activate inflammation via Toll-like receptor 4?",
    "What is the current understanding of gray matter alterations in patients with vestibular migraine?",
    "How do nickel clusters improve the hydrogen evolution reaction (HER) and what potential applications does this have in renewable energy conversion?",
]

# Ground-truth answers, aligned index-by-index with `queries`.
reference_answers = [
    " Sonodynamic therapy (SDT) leverages the generation of reactive oxygen species to inflict multifaceted damage on bacterial cells, reducing the likelihood of developing drug resistance, unlike conventional antibiotics.",
    "No, zebrafish do not have an ortholog of either mammalian S100A8 or S100A9, and none of the identified zebrafish s100 proteins exhibited antimicrobial activity comparable to mammalian calprotectin. Additionally, none of the zebrafish proteins activated inflammation via Toll-like receptor 4, suggesting that similar proteins have not convergently evolved analogous functions.",
    "The current understanding of gray matter alterations in patients with vestibular migraine remains lacking, despite the growing amount of neuroimaging data in recent decades.",
    "clusters considerably improve the hydrogen evolution reaction (HER), indicating their promise for renewable energy conversion."
]

# 1. Retrieve 30 candidates per query, rerank them and keep the best 10.
# The candidate count has to go through `search_kwargs`: a bare `k=` keyword
# is not applied by as_retriever(), so the retriever would fall back to its
# default number of hits and top_k=10 could never be reached. `invoke`
# replaces the deprecated `get_relevant_documents`.
retriever = db.as_retriever(search_kwargs={"k": 30})
all_contexts = []
for query in queries:
    retrieved_docs = retriever.invoke(query)
    reranked_docs = rerank_bge(query, retrieved_docs, top_k=10)
    all_contexts.append([doc.page_content for doc in reranked_docs])

# 2. Initialize wrappers
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# 3. Define the test data (no generated answers: this cell scores retrieval only)
test_data = {
    "question": queries,
    "contexts": all_contexts,
    "ground_truth": reference_answers
}

# 4. Convert to Dataset format
dataset = Dataset.from_dict(test_data)

# 5. Evaluate context recall
print("\n 正在评估检索质量...")
results = evaluate(
    dataset,
    metrics=[context_recall],
    llm=evaluator_llm
)

# 6. Report the per-question score and a preview of the top two contexts
print("\n Evaluating Results :")
print("="*50)
for i, (query, score) in enumerate(zip(queries, results['context_recall'])):
    print(f"\nQuestion {i+1}: {query}...")
    print(f"• Context recall: {score:.2f}/1.0")
    print(f"• number of relevant documents: {len(all_contexts[i])}")
    print("• Most relevant document summaries :")
    for j, ctx in enumerate(all_contexts[i][:2]):  # show the first two documents
        print(f"  {j+1}. {ctx[:380]}...")
print("="*50)
# Scoring guide: 1.0 means a perfect match; below 0.5 the retrieval system
# needs improvement.
# print("\n💡 评分说明: 1.0表示完美匹配，0.5以下需改进检索系统")

In [None]:
# Installation (if needed)
!pip install -U ragas langchain-openai datasets

import os

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness  # main change versus the retrieval cells
from ragas.metrics import context_recall
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Generation evaluation: retrieve + rerank contexts, let GPT-4o answer from
# them, then score the faithfulness of each answer with RAGAS.

queries = [
    "How does sonodynamic therapy (SDT) differ from conventional antibiotics in terms of combating multidrug-resistant bacterial infections?",
    "Does zebrafish possess an ortholog or paralog of mammalian calprotectin that exhibits antimicrobial activity and can activate inflammation via Toll-like receptor 4?",
    "What is the current understanding of gray matter alterations in patients with vestibular migraine?",
    "How do nickel clusters improve the hydrogen evolution reaction (HER) and what potential applications does this have in renewable energy conversion?",
]

# Ground-truth answers, aligned index-by-index with `queries` (not needed by
# the faithfulness metric itself).
reference_answers = [
    " Sonodynamic therapy (SDT) leverages the generation of reactive oxygen species to inflict multifaceted damage on bacterial cells, reducing the likelihood of developing drug resistance, unlike conventional antibiotics.",
    "No, zebrafish do not have an ortholog of either mammalian S100A8 or S100A9, and none of the identified zebrafish s100 proteins exhibited antimicrobial activity comparable to mammalian calprotectin. Additionally, none of the zebrafish proteins activated inflammation via Toll-like receptor 4, suggesting that similar proteins have not convergently evolved analogous functions.",
    "The current understanding of gray matter alterations in patients with vestibular migraine remains lacking, despite the growing amount of neuroimaging data in recent decades.",
    "clusters considerably improve the hydrogen evolution reaction (HER), indicating their promise for renewable energy conversion."
]

# 1. Initialize the models before the loop that uses them. The chat model
# generates the answers; the RAGAS wrapper around it is the judge.
chat_llm = ChatOpenAI(model="gpt-4o")
evaluator_llm = LangchainLLMWrapper(chat_llm)
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# 2. Retrieval + generation.
# The candidate count has to go through `search_kwargs`: a bare `k=` keyword
# is not applied by as_retriever(). `invoke` replaces the deprecated
# `get_relevant_documents`.
retriever = db.as_retriever(search_kwargs={"k": 15})
all_contexts = []
generated_responses = []
for query in queries:
    retrieved_docs = retriever.invoke(query)
    reranked_docs = rerank_bge(query, retrieved_docs, top_k=4)
    contexts = [doc.page_content for doc in reranked_docs]
    all_contexts.append(contexts)

    # Build the prompt from the reranked passages.
    context_text = "\n\n".join(contexts)
    prompt = f"""Answer based ONLY on the context below.

Context:
{context_text}

Question: {query}
Answer:"""

    # Generate the RAG answer with the plain chat model; the RAGAS wrapper's
    # generate() is a coroutine that expects a prompt value, not a string.
    rag_response = chat_llm.invoke(prompt).content
    generated_responses.append(rag_response.strip())

# 3. Define the test data
test_data = {
    "question": queries,
    "contexts": all_contexts,
    "response": generated_responses,
}

# 4. Convert to Dataset format
dataset = Dataset.from_dict(test_data)

# 5. Evaluate faithfulness
print("\n evaluating generating quality...")
results = evaluate(
    dataset,
    metrics=[faithfulness],
    llm=evaluator_llm
)

# 6. Report the per-question score and a preview of the top two contexts
print("\nevaluating results:")
print("="*50)
for i, (query, score) in enumerate(zip(queries, results['faithfulness'])):
    print(f"\nquestion {i+1}: {query[:60]}...")
    print(f"• Faithfulness: {score:.2f}/1.0")
    print(f"• the number of documents used: {len(all_contexts[i])}")
    print("• document content verification:")
    for j, ctx in enumerate(all_contexts[i][:2]):
        print(f"  {j+1}. {ctx[:80]}...")
print("="*50)


In [None]:
#  Step 0: Installation (skip if already installed)
!pip install -U ragas langchain-openai datasets

#  Step 1: Imports & API Key
import nest_asyncio
nest_asyncio.apply()

import os
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import faithfulness  # Optional: add more metrics later

queries = [
   "How does sonodynamic therapy (SDT) differ from conventional antibiotics in terms of combating multidrug-resistant bacterial infections?",
   "Does zebrafish possess an ortholog or paralog of mammalian calprotectin that exhibits antimicrobial activity and can activate inflammation via Toll-like receptor 4?",
   "What is the current understanding of gray matter alterations in patients with vestibular migraine?",
   "How do nickel clusters improve the hydrogen evolution reaction (HER) and what potential applications does this have in renewable energy conversion?",



]

reference_answers = [
  " Sonodynamic therapy (SDT) leverages the generation of reactive oxygen species to inflict multifaceted damage on bacterial cells, reducing the likelihood of developing drug resistance, unlike conventional antibiotics.",
   "No, zebrafish do not have an ortholog of either mammalian S100A8 or S100A9, and none of the identified zebrafish s100 proteins exhibited antimicrobial activity comparable to mammalian calprotectin. Additionally, none of the zebrafish proteins activated inflammation via Toll-like receptor 4, suggesting that similar proteins have not convergently evolved analogous functions.",
   "The current understanding of gray matter alterations in patients with vestibular migraine remains lacking, despite the growing amount of neuroimaging data in recent decades.",
   "clusters considerably improve the hydrogen evolution reaction (HER), indicating their promise for renewable energy conversion."
]


from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage

# PROMPT_TEMPLATE_2 = """Answer based ONLY on:
# Context: {context}
# Question: {question}
# Concise medical answer (max 100 words):"""
# PROMPT2 = PromptTemplate.from_template(PROMPT_TEMPLATE_2)



# Step 3: Initialize LLM and embedding wrappers for RAGAS
llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Step 4: Retrieval + Generation (Replace with your real retrieval + reranker)
all_contexts = []
generated_responses = []

for query in queries:
    #  Replace these two lines with your real retrieval system (e.g., Chroma, FAISS)
    retrieved_docs = db.as_retriever(k=15).get_relevant_documents(query)
    reranked_docs = rerank_bge(query, retrieved_docs, top_k=4)

    #  Extract context text
    context_list = [doc.page_content for doc in reranked_docs]
    all_contexts.append(context_list)

    context_text = "\n\n".join(context_list)
    # Create proper prompt format for the LLM
    prompt = f"""Answer based ONLY on:
    Context: {context_text}
    Question: {query}
    Concise medical answer (max 200 words):"""


    messages = [HumanMessage(content=prompt)]
   
    response = await llm_wrapper.generate([messages])

    response = await llm_wrapper.generate(prompt)
    generated_response = response.generations[0][0].text
    generated_responses.append(generated_response.strip())

# Step 5: Prepare data in HuggingFace format
test_data = {
    "question": queries,
    "contexts": all_contexts,
    "response": generated_responses,
}

dataset = Dataset.from_dict(test_data)

# Step 6: Evaluate
print("\n evaluating Faithfulness...")
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness],
    llm=llm_wrapper
)

#  Step 7: Show Results
print("\n generating results:")
print("=" * 60)
for i, (q, score) in enumerate(zip(queries, results["faithfulness"])):
    print(f"\nquestion {i+1}: {q[:60]}...")
    print(f"• Faithfulness: {score:.2f}/1.0")
    print(f"• the number of documents used: {len(all_contexts[i])}")
    print("• context summary:")
    for j, ctx in enumerate(all_contexts[i][:2]):
        print(f"  - [{j+1}] {ctx[:100]}...")


In [None]:
# Step 0: Installation (skip if already installed)
!pip install -U ragas langchain-openai datasets

# Step 1: Imports & API Key
import nest_asyncio
nest_asyncio.apply()

import os
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import faithfulness, answer_relevancy  # Added answer_relevancy
from langchain_core.messages import HumanMessage
from ragas.metrics import ResponseRelevancy  # Import ResponseRelevancy

queries = [
   "How does sonodynamic therapy (SDT) differ from conventional antibiotics in terms of combating multidrug-resistant bacterial infections?",
   "Does zebrafish possess an ortholog or paralog of mammalian calprotectin that exhibits antimicrobial activity and can activate inflammation via Toll-like receptor 4?",
   "What is the current understanding of gray matter alterations in patients with vestibular migraine?",
   "How do nickel clusters improve the hydrogen evolution reaction (HER) and what potential applications does this have in renewable energy conversion?",



]

reference_answers = [
  " Sonodynamic therapy (SDT) leverages the generation of reactive oxygen species to inflict multifaceted damage on bacterial cells, reducing the likelihood of developing drug resistance, unlike conventional antibiotics.",
   "No, zebrafish do not have an ortholog of either mammalian S100A8 or S100A9, and none of the identified zebrafish s100 proteins exhibited antimicrobial activity comparable to mammalian calprotectin. Additionally, none of the zebrafish proteins activated inflammation via Toll-like receptor 4, suggesting that similar proteins have not convergently evolved analogous functions.",
   "The current understanding of gray matter alterations in patients with vestibular migraine remains lacking, despite the growing amount of neuroimaging data in recent decades.",
   "clusters considerably improve the hydrogen evolution reaction (HER), indicating their promise for renewable energy conversion."
]

# ✅ Step 3: Initialize LLM and embedding wrappers for RAGAS
llm = ChatOpenAI(model="gpt-4o")
llm_wrapper = LangchainLLMWrapper(llm)
embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Initialize ResponseRelevancy scorer
response_relevancy = ResponseRelevancy(
    llm=llm_wrapper,
    embeddings=embedding_wrapper
)

# ✅ Step 4: Retrieval + Generation
all_contexts = []
generated_responses = []
relevancy_scores = []

for query in queries:


    retrieved_docs = db.as_retriever(search_kwargs={"k": 15}).get_relevant_documents(query)
    reranked_docs = rerank_bge(query, retrieved_docs, top_k=4)

    context_list = [doc.page_content for doc in reranked_docs]
    all_contexts.append(context_list)


    context_text = "\n\n".join(context_list)
    prompt = f"""Answer based ONLY on:
    Context: {context_text}
    Question: {query}
    Concise medical answer (max 150 words):"""

    messages = [HumanMessage(content=prompt)]
    response = await llm.agenerate([messages])
    generated_response = response.generations[0][0].text.strip()
    generated_responses.append(generated_response)

    # Calculate response relevancy score
    sample = {
        "user_input": query,
        "response": generated_response,
        "retrieved_contexts": context_list
    }
    relevancy_score = await response_relevancy.ascore(sample)
    relevancy_scores.append(relevancy_score)

# ✅ Step 5: Prepare data in HuggingFace format
test_data = {
    "question": queries,
    "contexts": all_contexts,
    "response": generated_responses,
}

dataset = Dataset.from_dict(test_data)

# ✅ Step 6: Evaluate with multiple metrics
print("\n Evaluating Metrics...")
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy],  # Added answer_relevancy
    llm=llm_wrapper,
    embeddings=embedding_wrapper
)

# ✅ Step 7: Show Results
print("\n Evaluation Results")
print("=" * 60)
for i, (q, faith_score, rel_score, manual_rel_score) in enumerate(zip(
    queries,
    results["faithfulness"],
    results["answer_relevancy"],
    relevancy_scores
)):
    print(f"\n Question {i+1}: {q[:60]}...")
    print(f"• Faithfulness: {faith_score:.2f}/1.0")
    print(f"• Answer Relevancy (RAGAS): {rel_score:.2f}/1.0")
    # print(f"• Response Relevancy (Direct): {manual_rel_score:.2f}/1.0")
    print(f"• Documents used: {len(all_contexts[i])}")
    print("• Context summary:")
    for j, ctx in enumerate(all_contexts[i][:2]):
        print(f"  - [{j+1}] {ctx[:100]}...")
print("=" * 60)
print("\n💡 Interpretation:")
print("- Faithfulness: 1.0 = fully consistent with documents")
print("- Relevancy: 1.0 = perfectly addresses the question")
print("- <0.5 scores indicate potential issues")







# Enhanced Context Analysis
print("\n Deep Context Analysis")
print("=" * 80)
for i, (q, contexts) in enumerate(zip(queries, all_contexts)):
    print(f"\n Question {i+1}: {q}")
    print(f"\n Generated Answer ({len(generated_responses[i])} chars):")
    print(f"{generated_responses[i]}")

    print(f"\n Retrieved Contexts ({len(contexts)}):")
    for j, ctx in enumerate(contexts):
        print(f"\n  - Context {j+1} ({len(ctx)} chars):")
        print(ctx if len(ctx) <= 800 else ctx[:800] + "... [truncated]")

    print(f"\n⚡ Scores:")
    print(f"   Faithfulness: {results['faithfulness'][i]:.2f} | Answer Relevancy: {results['answer_relevancy'][i]:.2f} | Response Relevancy: {relevancy_scores[i]:.2f}")
    print(f"\n Problem Diagnosis:")
    if "Sample context about" in contexts[0]:
        print("    Critical: Using placeholder contexts instead of real documents")
    elif results['faithfulness'][i] < 0.5:
        print("    Low Faithfulness: Answer not grounded in contexts")
    elif results['answer_relevancy'][i] < 0.5:
        print("    Low Relevancy: Answer doesn't match question intent")
    else:
        print("    Good scores (but verify contexts are real)")
print("=" * 80)