In [2]:
# RAG implementation using LangChain with real world data


import os
import glob
import tiktoken
import numpy as np
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from pathlib import Path

from markitdown import MarkItDown

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from langchain_core.messages import SystemMessage, HumanMessage
import gradio as gr

In [3]:
# file input and output paths to convert and output in markdown format

input_dir = "knowledge-base-cihp"
output_dir="output"

In [4]:
# convert files to markdown

def convert_to_markdown(
    input_dir,
    output_dir=None,  # optional
    target_formats=(".docx", ".xlsx", ".pdf", ".pptx"),
):
    input_path = Path(input_dir)
    
    md = MarkItDown()

    for file_path in input_path.rglob("*"):
        if file_path.suffix in target_formats:
            try:
                result = md.convert(file_path)
            except Exception as e:
                print(f"✗ Error converting {file_path.name}: {e}")
                continue

            output_file = file_path.parent / f"{file_path.stem}{file_path.suffix}.md"
            
            output_file.write_text(result.markdown, encoding="utf-8")
            print(f"✓ Converted {file_path.name} → {output_file.name}")

In [5]:
convert_to_markdown(input_dir,output_dir)

✓ Converted Cool girls concept note for evaluation .docx → Cool girls concept note for evaluation .docx.md
✓ Converted Concept Note on Cool Girls Online Radio.docx → Concept Note on Cool Girls Online Radio.docx.md
✓ Converted employees.xlsx → employees.xlsx.md
✓ Converted NDR.pptx → NDR.pptx.md
✓ Converted CIHP Employee Policy Procedure Manual.pdf → CIHP Employee Policy Procedure Manual.pdf.md
✓ Converted SOP - Codes of Conduct.pdf → SOP - Codes of Conduct.pdf.md
✓ Converted SOP - HUMAN RESOURCES.pdf → SOP - HUMAN RESOURCES.pdf.md
✓ Converted SOP - IT Policy.pdf → SOP - IT Policy.pdf.md
✓ Converted SOP - Recruitment Policy Oct 2012.pdf → SOP - Recruitment Policy Oct 2012.pdf.md
✓ Converted SOP-PROCUREMENT GUIDELINES.pdf → SOP-PROCUREMENT GUIDELINES.pdf.md


In [6]:
MODEL = "gpt-4.1-nano"
DB_NAME = "cihp_vector_db"
load_dotenv(override=True)

True

In [7]:
# OpenAI API key

openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

OpenAI API Key exists and begins sk-proj-


In [8]:
# How many characters in all the documents?

knowledge_base_path = input_dir + "/**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)
print(f"Found {len(files)} files in the knowledge base")

entire_knowledge_base = ""

for file_path in files:
    with open(file_path, 'r', encoding='utf-8') as f:
        entire_knowledge_base += f.read()
        entire_knowledge_base += "\n\n"

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

Found 10 files in the knowledge base
Total characters in knowledge base: 397,902


In [9]:
# How many tokens in all the documents?

encoding = tiktoken.encoding_for_model(MODEL)
tokens = encoding.encode(entire_knowledge_base)
token_count = len(tokens)
print(f"Total tokens for {MODEL}: {token_count:,}")

Total tokens for gpt-4.1-nano: 97,528


In [10]:
# Load in everything in the knowledgebase using LangChain's loaders

folders = glob.glob(input_dir + "/*")

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

Loaded 10 documents


In [11]:
documents[0]

Document(metadata={'source': 'knowledge-base-cihp\\coolgirls\\Cool girls concept note for evaluation .docx.md', 'doc_type': 'coolgirls'}, page_content="Outline and guide for short concept note on evaluation of new program interventions proposed for implementation\n\n# **Introduction**\n\n·\xa0\xa0\xa0\xa0\xa0\xa0\xa0*Include the problem statement –*\xa0*what is the program issue or gap to be addressed by the intervention? Highlight who is affected, where, and why is it important to focus on the identified group or sub-population?*\n\n*Sub-Saharan Africa is grappling with a significant HIV/AIDS burden, particularly among adolescent girls and young women (AGYW). Despite comprising only 10% of the total population, AGYW account for about 25% of all HIV infections. UNAIDS reported that 6,200 young women aged 15-24 are infected with HIV on a weekly basis worldwide, highlighting the urgent need for targeted interventions. Various factors contribute to the heightened vulnerability of AGYW to 

In [12]:
# Divide into chunks using the RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Divided into {len(chunks)} chunks")
print(f"First chunk:\n\n{chunks[1]}")

Divided into 537 chunks
First chunk:

page_content='*Sub-Saharan Africa is grappling with a significant HIV/AIDS burden, particularly among adolescent girls and young women (AGYW). Despite comprising only 10% of the total population, AGYW account for about 25% of all HIV infections. UNAIDS reported that 6,200 young women aged 15-24 are infected with HIV on a weekly basis worldwide, highlighting the urgent need for targeted interventions. Various factors contribute to the heightened vulnerability of AGYW to HIV, including socio-cultural barriers to comprehensive sexuality education, inter-generational and transactional sex practices, lack of schooling and economic empowerment, gender-based violence, and harmful traditional practices.*

* *What is the proposed solution? Has it been used before? If so, how (and where) was it used and how effective was it? (Useful to provide evidence for this - references). Why do you think the proposed intervention would be effective for the problem descr

In [13]:
# Pick an embedding model

#embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

if os.path.exists(DB_NAME):
    Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_NAME)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 537 documents


In [14]:
# Let's investigate the vectors

collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 537 vectors with 3,072 dimensions in the vector store


In [15]:
# Visualizing  - Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]
colors = [['blue', 'green', 'red', 'orange', 'darkblue'][['coolgirls', 'ndr', 'sop', 'employees', 'coolgirls-radio'].index(t)] for t in doc_types]

In [16]:
# Visualizing in 2D
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [17]:
# Visualzing in 3D!

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=10, b=10, l=10, t=40)
)

fig.show()

In [18]:
# Connect to Chroma; use Hugging Face all-MiniLM-L6-v2

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)

In [19]:
# Seting up the 2 key LangChain objects: retriever and llm
# Also, the temperature

retriever = vectorstore.as_retriever()
llm = ChatOpenAI(temperature=0, model_name=MODEL)

In [20]:
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, friendly assistant representing the company CIHP.
You are chatting with a user about CIHP.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [23]:
def answer_question(question: str, history):
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context,history=history)
    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=question)])
    return response.content

In [24]:
# UI implementation with gradio

gr.ChatInterface(answer_question).launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


