# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Read the variables defined in the .env file
load_dotenv()

# Look up both API keys before validating either of them
langchain_api_key = os.environ.get('LANGCHAIN_API_KEY')
huggingface_api_key = os.environ.get('HUGGINGFACE_API_KEY')

# Fail fast, naming the first key that is missing or empty
for key_name, key_value in (
    ('LANGCHAIN_API_KEY', langchain_api_key),
    ('HUGGINGFACE_API_KEY', huggingface_api_key),
):
    if not key_value:
        raise ValueError(f"{key_name} is not set in the environment variables.")

# Publish the settings the rest of the notebook relies on
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_API_KEY': langchain_api_key,
    'HUGGINGFACE_API_KEY': huggingface_api_key,
})

In [2]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Load Document

In [4]:
# Load Documents (use PyPDFLoader for PDF)
# The PDF location can be overridden with the BADAS_GUIDELINE_PDF environment
# variable (for example from the .env file) so the notebook is not tied to one
# machine; when it is unset or empty, the original local path is used.
DEFAULT_PDF_PATH = r"C:\Users\User\Desktop\NSU\CSE299 Materials\LLM\Dataset\Diabetes_Care_BADAS_guideline2019-3.pdf"
file_path = os.environ.get("BADAS_GUIDELINE_PDF") or DEFAULT_PDF_PATH
loader = PyPDFLoader(file_path)
docs = loader.load()

# Preview the first 1000 characters of the first loaded document
docs[0].page_content[:1000]

'DIABETES CARE \nBADAS Guideline 2019 \n          \n  \n   \n  \n   P|) \nDAS GUELINE ON Man \nDELIT IGEMEN \n  \nA Joint Initiative of \nDiabetic Association of Bangladesh \nNCDC Program, Directorate General of Health Services'

# Splitting

In [5]:
# Break the loaded documents into 1000-character chunks that overlap by 200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Preview the first three chunks, each followed by a dashed separator
chunk_separator = "\n" + "-"*70 + "\n"
for chunk_number, chunk in enumerate(splits[:3], start=1):
    print(f"\n--- Chunk {chunk_number} ---")
    print(chunk.page_content[:1000])  # at most the first 1000 characters
    print(chunk_separator)


--- Chunk 1 ---
DIABETES CARE 
BADAS Guideline 2019 
          
  
   
  
   P|) 
DAS GUELINE ON Man 
DELIT IGEMEN 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services

----------------------------------------------------------------------


--- Chunk 2 ---
DIABETES CARE 
BADAS Guideline 2019 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services 
  
Diabetes Care: BADAS Guideline 2019 HEI! 1

----------------------------------------------------------------------


--- Chunk 3 ---
DIABETES CARE: BADAS GUIDELINE 2019 
Convener: Prof A K Azad Khan 
Chairman: Prof Hajera Mahtab 
Members of the steering committee 
Prof Dr AHM Enayet Hossain 
Prof Akhtar Hussain 
Prof Zafar Anmed Latif 
Prof Tofail Ahmed 
Prof Laique Ahmed Khan 
Prof Nazrul Islam Siddiqui 
Prof Md Hafizur Rahman 
Prof Abdus Saleque Mollah 
Prof Md Farid Uddin 
Prof M A Jalil Ansary 
Prof Dr MA Samad 


# Embedding

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embed the chunks with the all-MiniLM-L6-v2 sentence-transformers model,
# index them in a Chroma vector store, and expose the store as a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = Chroma.from_documents(splits, embedding=embedding_model)
retriever = vectorstore.as_retriever()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Retrieval and Generation

In [8]:
#### RETRIEVAL and GENERATION ####
from langchain_community.llms import Ollama

# LLM: the llama3.2 model through the Ollama wrapper
llm = Ollama(model="llama3.2")

# Prompt: the "rlm/rag-prompt" template pulled from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")

  llm = Ollama(model="llama3.2")


In [9]:
# Post-processing
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string with the page contents joined by two newlines.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [10]:
# Chain
# First stage: map the incoming question to the two inputs the prompt expects.
# "context" is the retriever output formatted as text; "question" is passed through.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Remaining stages: fill the prompt, call the LLM, parse the result to a string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [11]:
# Question
# Run the RAG chain on one sample question; the chain ends in StrOutputParser,
# so the value displayed by this cell is a plain string.
rag_chain.invoke("What is Pathophysiology?")

'Pathophysiology refers to the study of the physiological changes that occur within an organism or system as a result of disease. In the context of diabetes, pathophysiology examines the underlying mechanisms and processes that lead to hyperglycemia. It involves understanding how various factors such as insulin production, resistance, and secretion contribute to glucose intolerance in diabetic individuals.'

# Indexing

## Count tokens with tiktoken

In [13]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to `tiktoken.get_encoding`, e.g. "cl100k_base".

    Returns:
        The length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


# Sample question reused by the following cells
question = "What is Pathophysiology?"

num_tokens_from_string(question, "cl100k_base")

6

## Text Embedding Models

In [17]:
# Import from langchain_huggingface, the same package the Embedding section
# uses, instead of the deprecated langchain.embeddings.huggingface path.
from langchain_huggingface import HuggingFaceEmbeddings

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed a single query
query_result = embedding_model.embed_query(question)

# Embed multiple documents (convert to text first)
document_result = embedding_model.embed_documents([doc.page_content for doc in docs])

# Check the length (dimensionality) of the query embedding
print(len(query_result))


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


384


## Cosine Similarity

In [20]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Cosine similarity between one query vector and one or more other vectors.

    Args:
        vec1: Query vector, array-like of shape (d,).
        vec2: Either a matrix of shape (n, d) holding one vector per row, or a
            single vector of shape (d,).

    Returns:
        A NumPy array of shape (n,) when `vec2` is 2-D, or a scalar when `vec2`
        is 1-D. A pair involving a zero-norm vector scores 0.0 instead of
        producing NaN from a division by zero.
    """
    query = np.asarray(vec1, dtype=float)
    candidates = np.asarray(vec2, dtype=float)

    # Remember whether a single vector was passed so the result can be unwrapped
    single_vector = candidates.ndim == 1
    candidates = np.atleast_2d(candidates)

    dot_product = np.dot(candidates, query)
    norm_vec1 = np.linalg.norm(query)
    norm_vec2 = np.linalg.norm(candidates, axis=1)  # one norm per row
    denominator = norm_vec1 * norm_vec2

    # Divide only where the denominator is non-zero; other entries stay 0.0
    similarities = np.divide(
        dot_product,
        denominator,
        out=np.zeros_like(dot_product),
        where=denominator != 0,
    )
    return similarities[0] if single_vector else similarities

# Stack the per-document embeddings into a single NumPy array
document_embeddings = np.array(document_result)

# Score every document against the query embedding
similarities = cosine_similarity(query_result, document_embeddings)

# Rank from most to least similar, then keep the five best
ranked_indices = np.argsort(similarities)[::-1]
top_indices = ranked_indices[:5]

# Report rank, document index, and similarity score
print("Top 5 Similar Documents:")
for rank, idx in enumerate(top_indices, start=1):
    print(f"{rank}. Document {idx} - Similarity: {similarities[idx]:.4f}")


Top 5 Similar Documents:
1. Document 15 - Similarity: 0.4113
2. Document 11 - Similarity: 0.2673
3. Document 34 - Similarity: 0.2346
4. Document 13 - Similarity: 0.2219
5. Document 8 - Similarity: 0.2194


# Retrieval

In [63]:
# Import from langchain_huggingface, the same package the Embedding section
# uses, instead of the deprecated langchain.embeddings.huggingface path.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a Chroma vector store
# NOTE(review): `splits` was already indexed with Chroma.from_documents in the
# Embedding section. If both calls end up in the same default in-memory
# collection, every chunk may be stored twice and retrieval may return
# duplicates - confirm, and consider reusing the earlier `vectorstore`.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Convert to a retriever
retriever = vectorstore.as_retriever()


In [None]:
# Retrievers are Runnables: use `invoke`, the replacement for the deprecated
# `get_relevant_documents`. Note that this rebinds `docs`, which previously
# held the documents loaded from the PDF, to the retrieved documents.
docs = retriever.invoke("What is Pathophysiology?")

In [53]:
# Number of documents currently bound to `docs` (the retriever result above)
len(docs)

4

# Generation

In [54]:
from langchain_huggingface import ChatHuggingFace
from langchain.prompts import ChatPromptTemplate

# Prompt: instruct the model to answer strictly from the supplied context.
# The template is assembled line by line; each piece ends with an explicit newline.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

# Rebinds `prompt` to this custom template and displays it
prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [55]:
# LLM
# Same model and wrapper as the `llm` created in the Retrieval and Generation
# section; re-created here so this section defines its own `llm`.
llm = Ollama(model="llama3.2")

In [56]:
# Chain
# Pipe the filled-in prompt straight into the LLM (no output parser in this chain)
chain = prompt | llm

In [57]:
# Run
# Join the retrieved documents into plain text with format_docs before filling
# the prompt. Passing the raw list would put the string form of the Document
# objects (metadata included) into {context}.
chain.invoke({"context": format_docs(docs), "question": "What is Pathophysiology?"})

'The text refers to "Pathophysiology" as the main topic for section 1.2 on two separate occasions, describing it as:\n\n- The "hall mark" of Type 1 diabetes.\n- A description of the main pathophysiologic defects in type 2 diabetes.\n\nIn both cases, Pathophysiology is described as the study of the normal functions and abnormal function of living organisms and their parts, particularly in relation to disease.'

In [58]:
from langchain import hub
# Pull the "rlm/rag-prompt" template from the LangChain hub under its own name,
# leaving the custom `prompt` defined above unchanged.
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [59]:
# Display the pulled hub prompt for inspection
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

# RAG chain

In [60]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Format the retrieved documents into one text block before they reach the
# prompt, as the first RAG chain in this notebook does. Without format_docs the
# string form of the Document objects (metadata included) is placed in {context}.
# NOTE(review): this chain uses the custom `prompt`, not `prompt_hub_rag`
# pulled above - confirm that is intended.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Pathophysiology?")

'According to the document, Pathophysiology refers to:\n\n"Pathophysiology: Type 1 diabetes: Marked impairment of insulin production due to cellular-mediated autoimmune destruction of beta cells is the hall mark. Some of type 1 diabetes cases are of idiopathic in nature. Type 2 diabetes: Insulin resistance and 6-cell failure represent the main pathophysiologic defects in type 2 diabetes."\n\nAdditionally, it mentions that these eight pathways comprise "the ominous octet" for the development of glucose intolerance in type 2 diabetic individuals.'