In [22]:
#!chmod +x ../../../setup.sh
#!cd ../../.. && bash setup.sh OpenAI
#!pip install langchain-community
#!pip install beautifulsoup4
#!pip install langchain_openai

#!pip install langchain-postgres
#!pip install "unstructured[all-docs]"
#!pip install sentence-transformers

## RAG pipeline for summarizing internal documentation

Goal:
1. Load the confluence documentation which are in markdown format
2. Chunk and embed the confluence documentation
3. Generate vector embeddings and store in vectordb
4. Similarity search from vectordb

### Load documentation

#### Load and pre-process confluence markdown pages

**Why should we clean before generating embeddings?**

1. Remove Noise That Confuses the Model - LLMs are trained on well structured text, so these MD semantics doesn't make sense and also confuse which leads to in correct results
2. Avoid Wasting Tokens - Markdown/HTML tags count as tokens when embedding — and most of the time they don’t add value.
3. Improve Retrieval Quality - If you chunk and embed content with markdown noise, your RAG system will 
- Return less relevant results
- Match on formatting, not concepts (e.g., # instead of “Login Flow”)
- Generate awkward completions
4. Better User Experience - Final response with markdown artifacts will confuse users and looks unpolished
5. Avoid Chunking Mistakes - Markdown headers, code blocks, and lists can interfere with chunk splitting. Cleaning ensures each chunk is text-rich and meaningful.

In [1]:
import re

# Function to clean markdown content
def clean_markdown(text: str) -> str:
    text = re.sub(r'^#+\s*', 'Section: ', text, flags=re.MULTILINE)  # headers
    text = re.sub(r'[*_`]+', '', text)  # bold, italics, inline code
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # links
    text = re.sub(r'^\s*[-*+]\s+', '- ', text, flags=re.MULTILINE)  # bullet points
    text = re.sub(r'^\s*\d+\.\s+', '- ', text, flags=re.MULTILINE)  # numbered lists
    text = re.sub(r'\n{2,}', '\n\n', text)  # extra newlines
    return text.strip()

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def process_md_file(file_path: str) -> list[Document]:
    with open(file_path, "r", encoding="utf-8") as f:
        raw_md = f.read()

    cleaned_text = clean_markdown(raw_md)

    # Step 3: Split using RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "Section:", "\n", " ", ""]
    )
    chunks = splitter.create_documents([cleaned_text])

    # Step 4: Attach filename metadata
    for doc in chunks:
        doc.metadata["source"] = file_path

    return chunks

In [3]:
from pathlib import Path

def process_md_folder(folder_path: str) -> list[Document]:
    all_docs = []
    for md_file in Path(folder_path).rglob("*.md"):
        docs = process_md_file(str(md_file))
        all_docs.extend(docs)
    return all_docs

In [4]:
confluence_docs = process_md_folder("../../../data/confluence")
confluence_docs

[Document(metadata={'source': '../../../data/confluence/authentication_overview.md'}, page_content='markdown\nSection: Authentication Overview\n\nThis document outlines the authentication mechanisms employed within the platform.\n\nSection: Authentication Methods\n\n   Password-Based Authentication: Utilizes standard username/password credentials. Password hashing implemented using bcrypt with a salt factor of 12. Password complexity requirements enforced (minimum 8 characters, one uppercase, one lowercase, one number, one special character).'),
 Document(metadata={'source': '../../../data/confluence/authentication_overview.md'}, page_content='Multi-Factor Authentication (MFA): Supports Time-based One-Time Password (TOTP) via Google Authenticator or similar. MFA is optional but strongly encouraged for privileged accounts. Recovery codes are generated and stored securely upon MFA enablement.'),
 Document(metadata={'source': '../../../data/confluence/authentication_overview.md'}, page_co

#### Load and pre-process proprofs HTML pages

- Parses full HTML
- Extracts meaningful sections by header tags (h1, h2, h3)
- Skips script, style tags etc.
- Preserves document hierarchy
- Adds metadata with header path

You may need to write your own HTML preprocessor by using beautifulSoup if the HTML code has lots of custom tags specific to your app

In [5]:
from langchain.text_splitter import HTMLHeaderTextSplitter
from langchain.schema import Document

def process_html_file(file_path: str) -> list[Document]:
    with open(file_path, "r", encoding="utf-8") as f:
        raw_md = f.read()

    headers_to_split_on = [
        ("h1", "Section"),
        ("h2", "Subsection"),
        ("h3", "Detail")
    ]

    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    raw_chunks = splitter.split_text_from_file(file_path)
    
    proprofs_docs = [
        Document(page_content=chunk.page_content, metadata={"source": file_path})
        for chunk in raw_chunks
    ]

    return proprofs_docs

In [6]:
from pathlib import Path

def process_html_folder(folder_path: str) -> list[Document]:
    all_docs = []
    for md_file in Path(folder_path).rglob("*.html"):
        docs = process_html_file(str(md_file))
        all_docs.extend(docs)
    return all_docs

In [7]:
proprofs_docs = process_html_folder("../../../data/proprofs")
proprofs_docs

[Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content="We're Working Hard to Keep Things Running Smoothly"),
 Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content="We know it's important that our service is always available when you need it.  Here's how we ensure things run smoothly:"),
 Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content='Keeping a Close Watch'),
 Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content="We have a team that constantly monitors our systems for any problems. We're alerted immediately if anything goes wrong."),
 Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content='Fixing Problems Quickly'),
 Document(metadata={'source': '../../../data/proprofs/user_monitoring.html'}, page_content='If we detect an issue, our team gets to work right away to fix it. Our goal is to minimize any 

In [8]:
all_docs = confluence_docs + proprofs_docs

In [14]:
question = "how to use feature flags?"

### Generate Text Embeddings, Save to Vector DB and Similarity Search

- Using OpenAI : Generate embeddings is not free, you need to have funded openai account and configure OpenAI key beforehand..
- Using HuggingFace: Free

If you are planning to use HF in production then you need to manage deployment and infra costs, but managed services like OpenAI you pay as you use and infra and deployment cost is included.

In [9]:
use_embeddings_model_provider = "OpenAI"

#### Using OpenAI - with 1536 dimensions

In [10]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [11]:
from langchain_openai import OpenAIEmbeddings
if (use_embeddings_model_provider == 'OpenAI'):
    openai_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    openai_model

In [12]:
from langchain_chroma import Chroma
from langchain_core.documents import Document

if (use_embeddings_model_provider == 'OpenAI'):

    vector_store = Chroma.from_documents(
        documents=all_docs,
        embedding=openai_model,
        collection_name="all_docs",
        persist_directory="../../chromadb_open_ai"
    )

In [15]:
if (use_embeddings_model_provider == 'OpenAI'):
    results = vector_store.similarity_search(question, k=10)

    seen_sources = set()
    unique_docs = []

    for doc in results:
        source = doc.metadata.get("source")
        if source and source not in seen_sources:
            seen_sources.add(source)
            unique_docs.append(doc)

#### Using HuggingFace - with 384 dimensions

In [16]:
from langchain.embeddings import HuggingFaceEmbeddings
if (use_embeddings_model_provider == 'HuggingFace'):
    hf_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [17]:
from langchain_chroma import Chroma
from langchain_core.documents import Document

if (use_embeddings_model_provider == 'HuggingFace'):
    vector_store = Chroma.from_documents(
        documents=all_docs,
        embedding=hf_model,
        collection_name="all_docs",
        persist_directory="../../chromadb_huggingface"
    )
vector_store

<langchain_chroma.vectorstores.Chroma at 0x115940050>

In [18]:
if (use_embeddings_model_provider == 'HuggingFace'):
    results = vector_store.similarity_search(question, k=10)

    seen_sources = set()
    unique_docs = []

    for doc in results:
        source = doc.metadata.get("source")
        if source and source not in seen_sources:
            seen_sources.add(source)
            unique_docs.append(doc)

In [None]:
unique_docs

### Ask GenAI to Summarize

In [19]:
from langchain.prompts import ChatPromptTemplate

template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that summarizes internal documentation to answer user questions. Can you summarize in bullet points?"),
    ("human", "Context:\n{context}"),
    ("human", "Question:\n{question}")
])


In [20]:
# Build context
context = "\n".join([doc.page_content for doc in unique_docs])


In [21]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0)  # or your preferred LLM

# Build prompt
messages = template.format_messages(context=context, question=question)  # limit if needed

# Run inference
response = llm.invoke(messages)

print(response.content)


  llm = ChatOpenAI(temperature=0)  # or your preferred LLM


- Feature flags (or feature toggles) allow enabling or disabling features in production without deploying new code.
- To use feature flags:
  - Test new features in production with a limited number of users.
  - Roll out features gradually to reduce risk.
  - Quickly disable features if they cause problems.
  - Experiment with different feature implementations.
- Start using key features by testing users with small changes before releasing the feature live, similar to a controlled real-world test.
