In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Ensure the environment variables are set
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')

if not langchain_api_key:
    raise ValueError("LANGCHAIN_API_KEY is not set in the environment variables.")
if not huggingface_api_key:
    raise ValueError("HUGGINGFACE_API_KEY is not set in the environment variables.")

# Set environment variables for the application
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['HUGGINGFACE_API_KEY'] = huggingface_api_key

In [3]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [4]:
# Load Documents (use PyPDFLoader for PDF)
file_path = r"C:\Users\User\Desktop\NSU\CSE299 Materials\LLM\Dataset\Diabetes_Care_BADAS_guideline2019-3.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

docs[0].page_content[:1000]

'DIABETES CARE \nBADAS Guideline 2019 \n          \n  \n   \n  \n   P|) \nDAS GUELINE ON Man \nDELIT IGEMEN \n  \nA Joint Initiative of \nDiabetic Association of Bangladesh \nNCDC Program, Directorate General of Health Services'

In [5]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

In [6]:
# Make splits
splits = text_splitter.split_documents(docs)

In [7]:
# Index
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=HuggingFaceEmbeddings())

retriever = vectorstore.as_retriever() 

# Decomposition

In [27]:
from langchain_huggingface import ChatHuggingFace
from langchain_core.output_parsers import StrOutputParser


# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))

# Run
question = "What is Pathophysiology?"
questions = generate_queries_decomposition.invoke({"question":question})

In [28]:
questions

['Here are three potential search queries related to "Pathophysiology":',
 '',
 '1. "Definition of Pathophysiology in Medicine"',
 '2. "Examples of Common Diseases with Pathophysiological Mechanisms"',
 '3. "How Does the Human Body Respond Physiologically to Disease?"',
 '',
 'These sub-questions aim to provide a more detailed understanding of pathophysiology by addressing its definition, examples of diseases and their underlying mechanisms, and the physiological responses of the body to disease.',
 '',
 'Would you like me to generate more?']

# Answer Individually

In [30]:
# Answer each sub-question individually 

from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import ChatHuggingFace

# RAG prompt
prompt_rag = hub.pull("rlm/rag-prompt")

def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """RAG on each sub-question"""
    
    # Use our decomposition / 
    sub_questions = sub_question_generator_chain.invoke({"question":question})
    
    # Initialize a list to hold RAG chain results
    rag_results = []
    
    for sub_question in sub_questions:
        
        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)
        
        # Use retrieved documents and sub-question in RAG chain
        answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
                                                                "question": sub_question})
        rag_results.append(answer)
    
    return rag_results,sub_questions

# Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

  retrieved_docs = retriever.get_relevant_documents(sub_question)


In [31]:
def format_qa_pairs(questions, answers):
    """Format Q and A pairs"""
    
    formatted_string = ""
    for i, (question, answer) in enumerate(zip(questions, answers), start=1):
        formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
    return formatted_string.strip()

context = format_qa_pairs(questions, answers)

# Prompt
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":context,"question":question})

'Based on the provided Q&A pairs, we can synthesize an answer to the question "What is Pathophysiology?" as follows:\n\nPathophysiology refers to the study of diseases and abnormal functions that result from changes in normal physiological processes. It involves understanding how disease develops and progresses, often through analyzing the interactions between genetic, environmental, and lifestyle factors. By examining the underlying biological mechanisms, healthcare professionals can identify potential causes and develop effective treatments for various health conditions.\n\nIn essence, pathophysiology is the investigation of the disruptions or abnormalities that occur within an organism\'s normal physiological processes, leading to disease states. It encompasses understanding the interplay between genetic predispositions, environmental influences, and lifestyle choices in the development and progression of diseases.'

ValidationError: 1 validation error for ChatHuggingFace
llm
  Field required [type=missing, input_value={'temperature': 0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing