# Environment Setup

In [39]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Ensure the environment variables are set
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')

if not langchain_api_key:
    raise ValueError("LANGCHAIN_API_KEY is not set in the environment variables.")
if not huggingface_api_key:
    raise ValueError("HUGGINGFACE_API_KEY is not set in the environment variables.")

# Set environment variables for the application
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['HUGGINGFACE_API_KEY'] = huggingface_api_key

In [40]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Load Document

In [41]:
# Load Documents (use PyPDFLoader for PDF)
file_path = r"C:\Users\User\Desktop\NSU\CSE299 Materials\LLM\Dataset\Diabetes_Care_BADAS_guideline2019-3.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

docs[0].page_content[:1000]

'DIABETES CARE \nBADAS Guideline 2019 \n          \n  \n   \n  \n   P|) \nDAS GUELINE ON Man \nDELIT IGEMEN \n  \nA Joint Initiative of \nDiabetic Association of Bangladesh \nNCDC Program, Directorate General of Health Services'

# Splitting

In [42]:
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

for i, chunk in enumerate(splits[:3]):  # Show the first 3 chunks
    print(f"\n--- Chunk {i+1} ---")
    print(chunk.page_content[:1000])  # Print the first 1000 characters of the chunk
    print("\n" + "-"*70 + "\n")  # Separator between chunks


--- Chunk 1 ---
DIABETES CARE 
BADAS Guideline 2019 
          
  
   
  
   P|) 
DAS GUELINE ON Man 
DELIT IGEMEN 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services

----------------------------------------------------------------------


--- Chunk 2 ---
DIABETES CARE 
BADAS Guideline 2019 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services 
  
Diabetes Care: BADAS Guideline 2019 HEI! 1

----------------------------------------------------------------------


--- Chunk 3 ---
DIABETES CARE: BADAS GUIDELINE 2019 
Convener: Prof A K Azad Khan 
Chairman: Prof Hajera Mahtab 
Members of the steering committee 
Prof Dr AHM Enayet Hossain 
Prof Akhtar Hussain 
Prof Zafar Anmed Latif 
Prof Tofail Ahmed 
Prof Laique Ahmed Khan 
Prof Nazrul Islam Siddiqui 
Prof Md Hafizur Rahman 
Prof Abdus Saleque Mollah 
Prof Md Farid Uddin 
Prof M A Jalil Ansary 
Prof Dr MA Samad 


# Count token

In [43]:
import tiktoken

# Function to count tokens in a string using a specified encoding
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Use the same text_splitter from before (already done)
# Now apply tokenization to each chunk

encoding_name = "cl100k_base"  # The encoding used by OpenAI models like GPT-4, GPT-3.5

# Tokenize each chunk and print the number of tokens
for i, chunk in enumerate(splits[:3]):  # Show the first 3 chunks as an example
    print(f"\n--- Chunk {i+1} ---")
    chunk_text = chunk.page_content  # Get the text content of the chunk
    num_tokens = num_tokens_from_string(chunk_text, encoding_name)  # Tokenize and count tokens
    print(f"Text snippet (first 1000 characters):\n{chunk_text[:1000]}")  # Show part of the chunk
    print(f"Number of tokens in this chunk: {num_tokens}")
    print("\n" + "-"*70 + "\n")  # Separator between chunks



--- Chunk 1 ---
Text snippet (first 1000 characters):
DIABETES CARE 
BADAS Guideline 2019 
          
  
   
  
   P|) 
DAS GUELINE ON Man 
DELIT IGEMEN 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services
Number of tokens in this chunk: 57

----------------------------------------------------------------------


--- Chunk 2 ---
Text snippet (first 1000 characters):
DIABETES CARE 
BADAS Guideline 2019 
  
A Joint Initiative of 
Diabetic Association of Bangladesh 
NCDC Program, Directorate General of Health Services 
  
Diabetes Care: BADAS Guideline 2019 HEI! 1
Number of tokens in this chunk: 51

----------------------------------------------------------------------


--- Chunk 3 ---
Text snippet (first 1000 characters):
DIABETES CARE: BADAS GUIDELINE 2019 
Convener: Prof A K Azad Khan 
Chairman: Prof Hajera Mahtab 
Members of the steering committee 
Prof Dr AHM Enayet Hossain 
Prof Akhtar Hussain 
Prof Zafar Anmed Latif 


# Sentence-Transformers Models (Hugging Face)

## all-MiniLM-L6-v2

In [44]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import tiktoken

question = "What is Pathophysiology?"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Embed a single query
query_result = embedding_model.embed_query(question)  

# Embed multiple documents (convert to text first)
document_result = embedding_model.embed_documents([doc.page_content for doc in docs])

# Check the length of query embedding
print(len(query_result))  



384


# Cosine Similarity for all-MiniLM-L6-v2

In [45]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec2, vec1)  # (79, 384) @ (384,) → (79,)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2, axis=1)  # Compute norms for each document
    return dot_product / (norm_vec1 * norm_vec2)

# Convert document embeddings to NumPy array
document_embeddings = np.array(document_result)  # Shape: (79, 384)

# Compute cosine similarity for each document
similarities = cosine_similarity(query_result, document_embeddings)

# Get the top 5 most similar documents
top_indices = np.argsort(similarities)[::-1][:5]  # Sort in descending order and take top 5

# Print results
print("Top 5 Similar Documents:")
for i, idx in enumerate(top_indices):
    print(f"{i+1}. Document {idx} - Similarity: {similarities[idx]:.4f}")


Top 5 Similar Documents:
1. Document 15 - Similarity: 0.4113
2. Document 11 - Similarity: 0.2673
3. Document 34 - Similarity: 0.2346
4. Document 13 - Similarity: 0.2219
5. Document 8 - Similarity: 0.2194


## paraphrase-MiniLM-L6-v2

In [46]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import tiktoken

question = "What is Pathophysiology?"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")


# Embed a single query
query_result = embedding_model.embed_query(question)  

# Embed multiple documents (convert to text first)
document_result = embedding_model.embed_documents([doc.page_content for doc in docs])

# Check the length of query embedding
print(len(query_result))  



384


# Cosine Similarity for paraphrase-MiniLM-L6-v2

In [47]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec2, vec1)  # (79, 384) @ (384,) → (79,)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2, axis=1)  # Compute norms for each document
    return dot_product / (norm_vec1 * norm_vec2)

# Convert document embeddings to NumPy array
document_embeddings = np.array(document_result)  # Shape: (79, 384)

# Compute cosine similarity for each document
similarities = cosine_similarity(query_result, document_embeddings)

# Get the top 5 most similar documents
top_indices = np.argsort(similarities)[::-1][:5]  # Sort in descending order and take top 5

# Print results
print("Top 5 Similar Documents:")
for i, idx in enumerate(top_indices):
    print(f"{i+1}. Document {idx} - Similarity: {similarities[idx]:.4f}")


Top 5 Similar Documents:
1. Document 11 - Similarity: 0.3470
2. Document 63 - Similarity: 0.3361
3. Document 15 - Similarity: 0.3287
4. Document 32 - Similarity: 0.2957
5. Document 35 - Similarity: 0.2831


# OpenAI-Compatible Open-Source Embeddings

## BAAI/bge-small-en

In [48]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import tiktoken

question = "What is Pathophysiology?"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")


# Embed a single query
query_result = embedding_model.embed_query(question)  

# Embed multiple documents (convert to text first)
document_result = embedding_model.embed_documents([doc.page_content for doc in docs])

# Check the length of query embedding
print(len(query_result))  



384


# Cosine Similarity for BAAI/bge-small-en

In [49]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec2, vec1)  # (79, 384) @ (384,) → (79,)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2, axis=1)  # Compute norms for each document
    return dot_product / (norm_vec1 * norm_vec2)

# Convert document embeddings to NumPy array
document_embeddings = np.array(document_result)  # Shape: (79, 384)

# Compute cosine similarity for each document
similarities = cosine_similarity(query_result, document_embeddings)

# Get the top 5 most similar documents
top_indices = np.argsort(similarities)[::-1][:5]  # Sort in descending order and take top 5

# Print results
print("Top 5 Similar Documents:")
for i, idx in enumerate(top_indices):
    print(f"{i+1}. Document {idx} - Similarity: {similarities[idx]:.4f}")


Top 5 Similar Documents:
1. Document 9 - Similarity: 0.8366
2. Document 15 - Similarity: 0.8105
3. Document 10 - Similarity: 0.8023
4. Document 11 - Similarity: 0.8023
5. Document 46 - Similarity: 0.7908


## intfloat/e5-small-v2

In [50]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import tiktoken

question = "What is Pathophysiology?"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")


# Embed a single query
query_result = embedding_model.embed_query(question)  

# Embed multiple documents (convert to text first)
document_result = embedding_model.embed_documents([doc.page_content for doc in docs])

# Check the length of query embedding
print(len(query_result))  



384


# Cosine Similarity for intfloat/e5-small-v2

In [51]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec2, vec1)  # (79, 384) @ (384,) → (79,)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2, axis=1)  # Compute norms for each document
    return dot_product / (norm_vec1 * norm_vec2)

# Convert document embeddings to NumPy array
document_embeddings = np.array(document_result)  # Shape: (79, 384)

# Compute cosine similarity for each document
similarities = cosine_similarity(query_result, document_embeddings)

# Get the top 5 most similar documents
top_indices = np.argsort(similarities)[::-1][:5]  # Sort in descending order and take top 5

# Print results
print("Top 5 Similar Documents:")
for i, idx in enumerate(top_indices):
    print(f"{i+1}. Document {idx} - Similarity: {similarities[idx]:.4f}")


Top 5 Similar Documents:
1. Document 9 - Similarity: 0.8518
2. Document 11 - Similarity: 0.8379
3. Document 15 - Similarity: 0.8342
4. Document 46 - Similarity: 0.8193
5. Document 59 - Similarity: 0.8179
