In [2]:
import os
import sys
import json

# Ensure the project root is on the Python path so that our custom modules can be imported.
sys.path.insert(0, os.path.abspath("."))

# Import our LangChain and custom modules.
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import pipeline

from src.utils import setup_logger, save_json



# Notebook-wide logger, created through the project's setup_logger helper.
logger = setup_logger(
    "main_rag",
    level=20,  # 20 is the numeric value of logging.INFO
)

# Relative paths such as "data/..." used below are resolved against this directory.
print("Current working directory:", os.getcwd())


Current working directory: /Users/rahul/Desktop/fact_check/fact-check


In [3]:

# Folder holding the clinical source files, relative to the working directory.
clinical_docs_dir = os.path.join("data", "Clinical Files")

# Let DirectoryLoader read whatever it finds under that folder.
loader = DirectoryLoader(clinical_docs_dir)
documents = loader.load()

# Record how many documents came back so a bad path is noticed early.
logger.info(f"Loaded {len(documents)} clinical documents.")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


2025-04-11 19:44:22,704 - main_rag - INFO - Loaded 8 clinical documents.


In [6]:
# Cut the loaded documents into overlapping pieces for embedding.
# chunk_size and chunk_overlap are measured by the splitter's length function
# (presumably characters, the library default — confirm for the installed version).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

logger.info(f"Split documents into {len(chunks)} chunks.")

2025-04-11 19:44:47,197 - main_rag - INFO - Split documents into 676 chunks.


In [7]:
# Sentence-embedding model used to vectorise every chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed all chunks and build the FAISS index that backs retrieval.
vectorstore = FAISS.from_documents(chunks, embeddings)
logger.info("FAISS vectorstore created from document chunks.")

2025-04-11 19:44:57,106 - main_rag - INFO - FAISS vectorstore created from document chunks.


In [17]:
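# NOTE: disabled draft of the prompt, kept for reference only. It uses {query},
# whereas the live template defined in a later cell uses {question}.
# from langchain.prompts import PromptTemplate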

# my_prompt = PromptTemplate(
#     template="""
# You are a fact-checking assistant. Your task is to find the sentences, tables, or figures 
# in the context that best support the following claim.

# Claim: {query}

# Context:
# {context}

# Identify and return only the exact text from the context that supports the claim.
# """,
#     # The variables that will be substituted in the template
#     input_variables=["context", "query"],
# )

In [19]:

# Hugging Face model id used for answer generation.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Cap the number of *generated* tokens with max_new_tokens. max_length would count the
# prompt tokens too, and the prompt built below already contains five retrieved chunks
# of up to 1000 characters each, so a 256-token total leaves no room for an answer.
llm_pipeline = pipeline(
    "text-generation",
    model=model_id,
    max_new_tokens=128,
    device=-1,  # -1 runs on CPU
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Prompt for the RetrievalQA chain, defined here so this cell runs on a fresh kernel.
# The chain fills {context} with the retrieved chunks and {question} with the claim,
# so the template has to use exactly those two variable names.
my_prompt = PromptTemplate(
    template="""
You are a fact-checking assistant. Your task is to find the sentences, tables, or figures 
in the context that best support the following claim.

Claim: {question}

Context:
{context}

Identify and return only the exact text from the context that supports the claim.
""",
    input_variables=["context", "question"],
)

# Create the retriever once so every query reuses it; k=5 returns the five
# best-matching chunks per claim.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Wire the LLM, retriever and prompt together. return_source_documents=True makes the
# chain hand back the retrieved chunks alongside the generated answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": my_prompt},
)

logger.info("RetrievalQA chain set up.")


Device set to use cpu


2025-04-11 20:11:14,194 - main_rag - INFO - RetrievalQA chain set up.


In [25]:
# Rebuild the generation pipeline with a cap on newly generated tokens (max_new_tokens).
# device=-1 keeps inference on the CPU; device=0 would target a GPU if one is available.
# Any chain constructed before this cell still holds the previous `llm` object and has
# to be rebuilt to pick up this one.
llm_pipeline = pipeline("text-generation", model=model_id, max_new_tokens=128, device=-1)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

Device set to use cpu


In [26]:
# Load the marketing claims to check. The code below reads a top-level "claims" list
# whose items each carry a "claim" string.
claims_path = os.path.join("data", "Flublok_Claims.json")
# Explicit UTF-8 so non-ASCII characters decode the same way on every platform.
with open(claims_path, "r", encoding="utf-8") as f:
    claims_data = json.load(f)

logger.info(f"Loaded {len(claims_data['claims'])} marketing claims.")

# Accumulates one entry per claim in the output format written to disk later.
results = {"claims": []}

# Prompt for the RetrievalQA chain. The chain fills {context} with the retrieved chunks
# and {question} with the claim, so the template must use those variable names.
my_prompt = PromptTemplate(
    template="""
You are a fact-checking assistant. Your task is to find the sentences, tables, or figures 
in the context that best support the following claim.

Claim: {question}

Context:
{context}

Identify and return only the exact text from the context that supports the claim.
""",
    input_variables=["context", "question"],
)

# Rebuild the chain so it uses the current `llm` and the prompt defined above.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": my_prompt},
)
logger.info("RetrievalQA chain set up with prompt expecting 'question'.")

# Process each marketing claim.
for claim_item in claims_data["claims"]:
    claim = claim_item["claim"]

    # RetrievalQA takes its input under the "query" key; the chain itself forwards it
    # to the prompt as {question}.
    response = qa_chain({"query": claim})

    # NOTE(review): only response["source_documents"] is consumed below; the generated
    # answer in response["result"] is discarded. Generation dominates the runtime, so
    # either record the answer or query the retriever directly — confirm the intent.
    match_source = []
    for doc in response["source_documents"]:
        text = doc.page_content
        match_source.append({
            "document_name": doc.metadata.get("source", "Unknown Document"),
            # Keep the preview short: first 200 characters plus an ellipsis when truncated.
            "matching_text": (text[:200] + "...") if len(text) > 200 else text,
        })

    claim_result = {
        "claim": claim,
        "match_source": match_source,
    }
    results["claims"].append(claim_result)
    logger.info(f"Processed claim: {claim[:50]}... with {len(claim_result['match_source'])} matches.")

logger.info("Completed evidence retrieval for all claims.")


2025-04-11 20:19:13,501 - main_rag - INFO - Loaded 9 marketing claims.
2025-04-11 20:19:13,538 - main_rag - INFO - RetrievalQA chain set up with prompt expecting 'question'.
2025-04-11 21:20:45,272 - main_rag - INFO - Processed claim: Flublok ensures identical antigenic match with WHO... with 5 matches.
2025-04-11 22:21:50,525 - main_rag - INFO - Processed claim: Flublok contains 3x the hemagglutinin (HA) antigen... with 5 matches.
2025-04-11 23:24:26,187 - main_rag - INFO - Processed claim: Cell- and egg-based flu vaccines have the potentia... with 5 matches.
2025-04-12 00:34:06,596 - main_rag - INFO - Processed claim: Recombinant technology leads to a broader immune r... with 5 matches.
2025-04-12 01:43:52,287 - main_rag - INFO - Processed claim: Vaccination with a higher-dose recombinant flu vac... with 5 matches.
2025-04-12 02:43:06,981 - main_rag - INFO - Processed claim: Flublok (quadrivalent) was evaluated in the pivota... with 5 matches.
2025-04-12 03:39:49,167 - main_rag - INF

In [27]:
# Destination for the per-claim retrieval results.
output_json_path = os.path.join("results", "rag_results.json")

# Make sure the target folder exists; harmless if save_json already creates it.
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

# Save the results dictionary into a JSON file.
save_json(results, output_json_path)
logger.info(f"Results saved to {output_json_path}")

# Preview the first claim's result. Guarded so an empty claims list does not raise
# IndexError after the file has already been written.
if results["claims"]:
    print(json.dumps(results["claims"][0], indent=2))
else:
    print("No claim results to preview.")

2025-04-12 10:45:39,016 - main_rag - INFO - Results saved to results/rag_results.json
{
  "claim": "Flublok ensures identical antigenic match with WHO- and FDA-selected flu strains.",
  "match_source": [
    {
      "document_name": "data/Clinical Files/FlublokPI.pdf",
      "matching_text": "96% of the in\ufb02uenza isolates obtained from subjects in Study 1 were not antigenically matched to the strains represented in the vaccine. An exploratory analysis of VE of Flublok against all strains, r..."
    },
    {
      "document_name": "data/Clinical Files/Treanor et al. (2011).pdf",
      "matching_text": "Only 8 isolates in the study (<5% of the total) were antigeni- cally identical to the strains contained in the vaccine. All of these viruses were A/Wisconsin/67/2005-like H3N2 viruses. Two of these oc..."
    },
    {
      "document_name": "data/Clinical Files/FlublokPI.pdf",
      "matching_text": "The efficacy of Flublok Quadrivalent is relevant to Flublok because both vaccines are