In [24]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Ensure the environment variables are set
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')

# Set environment variables for the application
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['HUGGINGFACE_API_KEY'] = huggingface_api_key

In [26]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [28]:
# Load Documents (use PyPDFLoader for PDF)
file_path = r"/Users/saifmohammed/Downloads/Diabetes_Care_BADAS_guideline2019-3.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

docs[0].page_content[:1000]

'DIABETES CARE \nBADAS Guideline 2019 \n          \n          P|) DAS GUELINE ON Man \nDELIT IGEMEN \n  A Joint Initiative of \nDiabetic Association of Bangladesh \nNCDC Program, Directorate General of Health Services'

In [30]:
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

for i, chunk in enumerate(splits[:3]):  # Show the first 3 chunks
    print(f"\n--- Chunk {i+1} ---")
    print(chunk.page_content[:1000])  # Print the first 1000 characters of the chunk
    print("\n" + "-"*70 + "\n")  # Separator between chunks


--- Chunk 1 ---
DIABETES CARE 
BADAS Guideline 2019 
          
          P|) DAS GUELINE ON Man 
DELIT IGEMEN 
  A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services

----------------------------------------------------------------------


--- Chunk 2 ---
DIABETES CARE 
BADAS Guideline 2019 
  A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services   
Diabetes Care: BADAS Guideline 2019 HEI! 1

----------------------------------------------------------------------


--- Chunk 3 ---
DIABETES CARE: BADAS GUIDELINE 2019 
Convener: Prof A K Azad Khan 
Chairman: Prof Hajera Mahtab 
Members of the steering committee 
Prof Dr AHM Enayet Hossain 
Prof Akhtar Hussain 
Prof Zafar Anmed Latif 
Prof Tofail Ahmed 
Prof Laique Ahmed Khan 
Prof Nazrul Islam Siddiqui 
Prof Md Hafizur Rahman 
Prof Abdus Saleque Mollah 
Prof Md Farid Uddin 
Prof M A Jalil Ansary 
Prof Dr MA Samad 
Prof SM

In [32]:
from langchain_huggingface import HuggingFaceEmbeddings

# Use HuggingFace Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
retriever = vectorstore.as_retriever()

In [36]:
from langchain_ollama import OllamaLLM

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = OllamaLLM(model="mistral")

In [38]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [40]:
# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [42]:
# Question
rag_chain.invoke("What is Pathophysiology?")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


' Pathophysiology is the study of the underlying mechanisms that cause a disease to occur, specifically in the context of Diabetes Mellitus. In Type 1 diabetes, it\'s characterized by marked impairment of insulin production due to autoimmune destruction of beta cells, while in Type 2 diabetes, it involves both insulin resistance and beta-cell failure. The development of glucose intolerance in Type 2 diabetes is influenced by eight key pathways involving muscle, liver, fat cells, gastrointestinal tract, alpha-cells, kidney, brain, and beta-cells collectively known as the "ominous octet."'