## Environment Setup

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file (python-dotenv) so that the
# os.getenv() lookups below can see the API keys defined there.
load_dotenv()

# Read the API keys. os.getenv() returns None when a variable is defined
# neither in the process environment nor in the loaded .env file.
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')

# Re-export only the keys that are actually present. Assigning None to
# os.environ raises "TypeError: str expected, not NoneType", which would
# abort this cell with an unhelpful message whenever a key is missing.
for _name, _value in (
    ('LANGCHAIN_API_KEY', langchain_api_key),
    ('HUGGINGFACE_API_KEY', huggingface_api_key),
):
    if _value is None:
        print(f"Warning: {_name} is not set; add it to your .env file.")
    else:
        os.environ[_name] = _value

# Enable LangChain tracing only when there is a key to authenticate with;
# otherwise switch it off explicitly so runs do not attempt unauthenticated
# trace uploads.
os.environ['LANGCHAIN_TRACING_V2'] = 'true' if langchain_api_key else 'false'

In [4]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM

## Load Multiple PDF Documents

In [7]:
import fitz  # PyMuPDF

# Directory holding the source PDFs. Set the DIABETIQ_PDF_DIR environment
# variable (e.g. in .env) to run the notebook on another machine; the default
# preserves the original location.
pdf_dir = os.environ.get(
    "DIABETIQ_PDF_DIR",
    "/Users/saifmohammed/Desktop/DiabetIQ/LLM-1/PDFs",
)
pdf_files = [
    os.path.join(pdf_dir, pdf_name)
    for pdf_name in (
        "Bright_Spots_and_Landmines_by_Adam_Brown.pdf",
        "Diabetes_Care_BADAS_guideline2019-3.pdf",
        "Textbook-of-Diabetes-2024.pdf",
    )
]
pdf_texts = []

for pdf in pdf_files:
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf) as doc:
        # Join once instead of growing a string with += for every page.
        pdf_texts.append("".join(page.get_text() for page in doc))

# Print only the first 500 characters of each PDF
for pdf, text in zip(pdf_files, pdf_texts):
    print(f"Text from {pdf} (first 500 chars):\n", text[:500], "\n" + "="*50)


Text from /Users/saifmohammed/Desktop/DiabetIQ/LLM-1/PDFs/Bright_Spots_and_Landmines_by_Adam_Brown.pdf (first 500 chars):
 “A superb and compelling read... In these pages you will find years of 
lessons learned and tips you can immediately apply in your own life with 
diabetes. Adam’s personal journey can teach us all to find our Bright 
Spots, no matter what diabetes throws at us.”
DR .  F R A N CIN E KAUFMAN | Author of Diabesity; Endocrinologist at 
Children’s Hospital Los Angeles; Chief Medical Officer at Medtronic Diabetes
“For me, this is the most anticipated book ever for people living with 
diabetes.”
J OHN  
Text from /Users/saifmohammed/Desktop/DiabetIQ/LLM-1/PDFs/Diabetes_Care_BADAS_guideline2019-3.pdf (first 500 chars):
   
DIABETES CARE 
BADAS Guideline 2019 
    
  
    
  
   
  
   
P|) DAS GUELINE ON Man 
DELIT 
IGEMEN 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services
  
  
DIABETES CARE 
BADAS Guideline

## Split All the Documents into Chunks

In [10]:
# Merge the extracted text of every PDF into one newline-separated string
full_text = "\n".join(pdf_texts)

# Cut the merged text into chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(full_text)

# Preview the first three chunks
for chunk_number, chunk_text in enumerate(chunks[:3], start=1):
    print(f"\n--- Chunk {chunk_number} ---")
    print(chunk_text[:1000])  # At most the first 1000 characters
    print("\n" + "-" * 70 + "\n")


--- Chunk 1 ---
“A superb and compelling read... In these pages you will find years of 
lessons learned and tips you can immediately apply in your own life with 
diabetes. Adam’s personal journey can teach us all to find our Bright 
Spots, no matter what diabetes throws at us.”
DR .  F R A N CIN E KAUFMAN | Author of Diabesity; Endocrinologist at 
Children’s Hospital Los Angeles; Chief Medical Officer at Medtronic Diabetes
“For me, this is the most anticipated book ever for people living with 
diabetes.”
J OHN  S JÖL UN D  | Founder/CEO of Timesulin; 32 years living with diabetes
“When I picked up this book, my A1c was 9.3%. After just a month, I’m 
down to 8.3% and still falling! Every person with diabetes, young or old, 
should read Bright Spots & Landmines. I plan to be one of Adam’s biggest 
success stories.” 
S TEV E M A L L IN SON | 24 years living with diabetes
“Living with diabetes, we all have good days and not-so-good days.

--------------------------------------------------

## Embedding

In [None]:
# Step 3: Embedding model loaded through HuggingFace
# NOTE(review): e5-family models are documented as expecting "query: " /
# "passage: " prefixes on their inputs; none are added here -- confirm.
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")

# Step 4: Embed the chunks and index them in a Chroma vector store
vectorstore = Chroma.from_texts(texts=chunks, embedding=embedding_model)

# Step 5: Expose the vector store through a retriever
retriever = vectorstore.as_retriever()

2025-03-15 16:01:01.759 python[1732:12720] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-1732-2025-03-15_16_01_01-1563232731” because the volume “Macintosh HD” is out of space.


## Chatbot Prompt & LLM Model

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt
# The template must contain {context} and {question}: the RAG chain feeds the
# prompt a dict with exactly those two keys. Without the placeholders the
# model would receive only the static instructions and never see the
# retrieved passages or the user's question.
prompt = PromptTemplate.from_template("""
You are a diabetes assistant for patients in Bangladesh. Provide concise, textbook-based advice on diabetes management. 
Keep responses short, direct, and actionable. Consider local diet, lifestyle, and healthcare practices. Avoid unnecessary details. 
If medical consultation is needed, advise accordingly.

Use the retrieved context below to answer the question. If the context does not contain the answer, say so instead of guessing.

Context:
{context}

Question: {question}

Answer:
""")

# LLM
llm = OllamaLLM(model="mistral")

In [None]:
# Post-processing
def format_docs(docs):
    """Render documents as a numbered list, one entry per line.

    Each entry is "<n>. <text>", where <n> counts from 1 and <text> is the
    document's ``page_content`` with surrounding whitespace stripped.
    Returns an empty string when ``docs`` is empty.
    """
    numbered_entries = []
    for position, doc in enumerate(docs, start=1):
        numbered_entries.append(f"{position}. {doc.page_content.strip()}")
    return "\n".join(numbered_entries)

## Retrieval & Generation

In [None]:
from langchain.schema.runnable import RunnableMap

# Chain
# Stage 1: build the prompt inputs -- "context" is the retriever output run
# through format_docs, "question" is the raw input passed through unchanged.
prompt_inputs = RunnableMap(
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
)

# Stage 2: fill the prompt, call the LLM, and parse the result to a string.
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

In [None]:
# Question to send through the RAG chain; the cell displays the answer
question = "What steps do I need to take for control my diabetics?"
rag_chain.invoke(question)