In [1]:
# installing everythin inside requirements -> also need it for deploying into streamlit 


!pip install -r requirements.txt




In [2]:
from dotenv import load_dotenv
import os


# Load variables from a local .env file (if any) into the process environment,
# then read the Google API key used by both the Gemini client and the embedding function.
load_dotenv()
api_key_google = os.getenv("GOOGLE_API_KEY")

# Fail fast with a clear message instead of an opaque error from a later API call.
if not api_key_google:
    raise ValueError("GOOGLE_API_KEY not found in environment. Check your .env file.")

In [3]:
# The MkDocs repository was cloned from the terminal beforehand (its docs are read from mkdocs/docs below).

In [4]:
pip install google-genai


Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install langchain_text_splitters



In [8]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from google import genai
from langchain_text_splitters import RecursiveCharacterTextSplitter

import glob
from langchain_core.documents import Document
import pickle


import time
from tqdm import trange

In [9]:
# Gemini API client; used further down by answer_question() to generate the final answer.
client = genai.Client(api_key=api_key_google)

In [11]:
pip install google-generativeai

Collecting google-generativeai
  Using cached google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Using cached google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Using cached google_api_python_client-2.187.0-py3-none-any.whl.metadata (7.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Using cached proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf (from google-generativeai)
  Using cached protobuf-5.29.5-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client->google-generativeai)
  Using cached httplib2-0.31.0-py3-none-any.whl.metadata (2.2 kB)
Collecti

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.26.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.5 which is incompatible.


In [13]:
# Chroma store persisted on disk under mkdocs_db/, plus the Google embedding function.
chroma_client = chromadb.PersistentClient(path="mkdocs_db/")

google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=api_key_google,
    model_name="models/text-embedding-004",
)

# The embedding function is attached to the collection, so later add()/query() calls pass raw text.
collection = chroma_client.get_or_create_collection(
    name="MkDocs",
    embedding_function=google_ef,
)

In [14]:
# Text splitter
# RecursiveCharacterTextSplitter (LangChain) is used because the MkDocs docs are Markdown organised
# in paragraphs: the separators below go from paragraph breaks to line breaks to spaces.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
)

MKDOCS_DOCS_ROOT = "mkdocs/docs"

# glob returns paths in arbitrary, OS-dependent order. Sort them so the chunk order, and the
# positional chunk IDs ("mkdocs-<idx>") built from it later, are reproducible between runs.
md_files = sorted(glob.glob(f"{MKDOCS_DOCS_ROOT}/**/*.md", recursive=True))
print(f"Found {len(md_files)} markdown files.")



Found 19 markdown files.


In [15]:
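# 2- Cleaning
# - strip leading/trailing whitespace from the whole document
# - remove trailing spaces from every line
# - collapse runs of blank lines into a single blank line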


def clean_markdown(text: str) -> str:
    """Normalise the whitespace of a Markdown document.

    The document is stripped of leading/trailing whitespace, every line loses its
    trailing whitespace, line endings become ``\\n``, and any run of blank lines is
    reduced to a single blank line.

    Args:
        text: Raw Markdown content.

    Returns:
        The cleaned text; an empty string when ``text`` is empty or whitespace-only.
    """
    trimmed_rows = [row.rstrip() for row in text.strip().splitlines()]

    kept_rows = []
    for row in trimmed_rows:
        # A blank row right after another blank row would produce "\n\n\n" once joined: drop it.
        if not row and kept_rows and not kept_rows[-1]:
            continue
        kept_rows.append(row)

    return "\n".join(kept_rows)

# Read every Markdown file, clean it, and wrap it in a LangChain Document.
docs = []

for md_path in md_files:
    with open(md_path, "r", encoding="utf-8") as handle:
        cleaned_text = clean_markdown(handle.read())  # cleaning happens here

    # Store the path relative to the docs root so it can be shown as the source later.
    doc_metadata = {
        "source": "mkdocs",
        "path": os.path.relpath(md_path, MKDOCS_DOCS_ROOT),
    }
    docs.append(Document(page_content=cleaned_text, metadata=doc_metadata))

print(f"Loaded {len(docs)} cleaned documents")

Loaded 19 cleaned documents


In [16]:
# Chunking
from tqdm import tqdm

# Split the cleaned documents into chunks using the splitter configured above.
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks.")

Split into 764 chunks.


In [17]:
# Flatten the chunks into three parallel lists: texts, metadatas and IDs.
all_texts = [chunk.page_content for chunk in chunks]

all_metadatas = [
    {
        "source": chunk.metadata.get("source", "mkdocs"),
        "path": chunk.metadata.get("path", ""),
    }
    for chunk in chunks
]

# IDs are positional ("mkdocs-0", "mkdocs-1", ...), so they depend on the order of the chunks.
all_ids = [f"mkdocs-{position}" for position in range(len(chunks))]

print("  texts:", len(all_texts))
print("  metadatas:", len(all_metadatas))
print("  ids:", len(all_ids))

  texts: 764
  metadatas: 764
  ids: 764


In [18]:
# Save the split data to disk as a pickle (the same approach was used in the MedicalGPT project).
split_payload = (all_texts, all_metadatas, all_ids)

with open("mkdocs_split_data.pkl", "wb") as pickle_file:
    pickle.dump(split_payload, pickle_file)


In [19]:
# MkDocs chunks into Chroma
#
# Chunks are added in batches. IDs that are already stored are skipped, so this cell can be
# re-run safely. A failing batch is retried once after a pause, and batches that still fail
# are collected and reported once the loop has finished.

BATCH_SIZE = 32
RETRY_DELAY_SECONDS = 30
total = len(all_texts)


def _add_batch(batch_texts, batch_metas, batch_ids):
    """Add one batch of chunks to the collection; any exception propagates to the caller."""
    collection.add(
        documents=batch_texts,
        metadatas=batch_metas,
        ids=batch_ids,
    )


# Start offsets of the batches that could not be indexed even after the retry.
failed_batches = []

for i in trange(0, total, BATCH_SIZE, desc="Indexing MkDocs"):
    texts = all_texts[i:i+BATCH_SIZE]
    metadatas = all_metadatas[i:i+BATCH_SIZE]
    ids = all_ids[i:i+BATCH_SIZE]

    # Find out which IDs of this batch are already stored.
    try:
        existing_ids = set(collection.get(ids=ids)["ids"])
    except Exception as lookup_error:
        # Best effort: treat the whole batch as new, but report the problem instead of hiding it.
        print(f"Could not check existing IDs for batch {i}: {lookup_error}")
        existing_ids = set()

    filtered_texts = []
    filtered_metas = []
    filtered_ids = []

    for t, m, id_ in zip(texts, metadatas, ids):
        if id_ not in existing_ids:
            filtered_texts.append(t)
            filtered_metas.append(m)
            filtered_ids.append(id_)

    # Nothing new in this batch.
    if not filtered_ids:
        continue

    try:
        _add_batch(filtered_texts, filtered_metas, filtered_ids)
    except Exception as e:
        print(f"Error on batch {i}: {e}. Retrying in {RETRY_DELAY_SECONDS}s...")
        time.sleep(RETRY_DELAY_SECONDS)
        try:
            _add_batch(filtered_texts, filtered_metas, filtered_ids)
        except Exception as e2:
            print(f"Permanent failure on batch {i}: {e2}")
            failed_batches.append(i)

# Make incomplete indexing impossible to miss; re-running the cell only retries the missing IDs.
if failed_batches:
    print(
        f"{len(failed_batches)} batch(es) were NOT indexed "
        f"(start offsets: {failed_batches}). Re-run this cell to retry them."
    )


Indexing MkDocs: 100%|██████████| 24/24 [00:00<00:00, 220.20it/s]


In [20]:
# Retrieval smoke test: ask the collection for the top three chunks for a sample question.
query = "How do I deploy MkDocs to GitHub Pages?"
result = collection.query(query_texts=[query], n_results=3)

print("=== Retrieved texts (truncated) ===")
retrieved_docs = result["documents"][0]
retrieved_metas = result["metadatas"][0]
for position in range(min(len(retrieved_docs), len(retrieved_metas))):
    print("Source path:", retrieved_metas[position].get("path"))
    # Only the first 300 characters of each chunk are shown.
    print(retrieved_docs[position][:300], "\n---\n")

=== Retrieved texts (truncated) ===
Source path: user-guide\deploying-your-docs.md
```sh
mkdocs gh-deploy
```

That's it! Behind the scenes, MkDocs will build your docs and use the
[ghp-import] tool to commit them to the `gh-pages` branch and push the
`gh-pages` branch to GitHub.

Use `mkdocs gh-deploy --help` to get a full list of options available for the
`gh-deploy` command. 
---

Source path: user-guide\deploying-your-docs.md
After making and verifying updates to your project you need to change
directories to the `orgname.github.io` repository and call the
`mkdocs gh-deploy` command from there:

```sh
cd ../orgname.github.io/
mkdocs gh-deploy --config-file ../my-project/mkdocs.yml --remote-branch master
``` 
---

Source path: user-guide\deploying-your-docs.md
### Project Pages

Project Pages sites are simpler as the site files get deployed to a branch
within the project repository (`gh-pages` by default). After you `checkout` the
primary working branch (usually `master`) of the git

In [21]:
# === RAG QA setup ===

# Behavioural rules for the model: answer only from the retrieved MkDocs context, admit when
# the context is insufficient, and refuse questions that are not about MkDocs.
# answer_question() sends this text to Gemini together with the per-question prompt.
SYSTEM_PROMPT = """
You are an AI assistant specialized ONLY in answering questions about MkDocs,
the static site generator for project documentation.

Rules:
- Use ONLY the information given in the retrieved context from the MkDocs docs.
- If the answer is not clearly supported by the context, say:
  "I don't know based on the MkDocs documentation I have."
- Do NOT answer questions about anything outside MkDocs (no general chat, no other frameworks).
- If the user asks about something unrelated to MkDocs, politely refuse and say that you only support MkDocs documentation questions.
- Prefer short, clear, step-by-step explanations when appropriate.
"""

def build_context_block(result):
    """Format a Chroma query result as the numbered context block placed in the prompt.

    Only the hits of the first query are read (``result["documents"][0]`` and
    ``result["metadatas"][0]``).

    Args:
        result: Mapping with ``"documents"`` and ``"metadatas"`` keys, each holding one
            list of hits per query, as returned by ``collection.query``.

    Returns:
        str: For every hit, a ``Source <n> (path: <path>):`` header, the chunk text and a
        blank line. Empty string when there are no hits.
    """
    docs = result["documents"][0]
    metas = result["metadatas"][0]

    sections = []
    for i, (text, meta) in enumerate(zip(docs, metas), start=1):
        # A hit without stored metadata may come back as None: treat it as "no path".
        path = (meta or {}).get("path", "")
        sections.append(f"Source {i} (path: {path}):\n{text}\n\n")
    return "".join(sections)

def answer_question(query: str, k: int = 4):
    """
    Answer a MkDocs question from the chunks retrieved out of the Chroma collection.

    k = 4 neighbors:
    - Enough different chunks to cover related parts of the same topic.
    - Not too many to overwhelm the model with irrelevant text.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve and include as context.

    Returns:
        tuple: ``(answer, source_paths)`` where ``answer`` is the model's text and
        ``source_paths`` lists the ``path`` metadata of each retrieved chunk, in
        retrieval order (the same order as the "Source <n>" numbering in the prompt).
    """

    # 1) Retrieve from Chroma
    result = collection.query(
        query_texts=[query],
        n_results=k,
    )

    context_str = build_context_block(result)

    # 2) Human prompt with context and user question
    human_prompt = f"""
You are given some context taken from the official MkDocs documentation:

{context_str}

User question: {query}

Instructions:
- Answer ONLY using the context above.
- If the context is not enough, say you don't know.
- If the question is not about MkDocs, refuse and say you only answer MkDocs questions.
- When possible, mention which source paths you used.
"""

    # 3) Call Gemini chat model
    # The rules are passed as a system instruction rather than as an extra piece of user
    # content, so the model treats them as instructions and not as part of the question.
    response = client.models.generate_content(
        model="gemini-2.5-flash",   # or another chat model you have access to
        contents=human_prompt,
        config={"system_instruction": SYSTEM_PROMPT},
    )

    # response.text may be None when the model returns no text; always hand back a string.
    answer = response.text or "The model did not return an answer."
    # A chunk stored without metadata may come back as None: report an empty path for it.
    source_paths = [(m or {}).get("path", "") for m in result["metadatas"][0]]

    return answer, source_paths


In [22]:
# End-to-end check of the RAG pipeline with five retrieved chunks.
q = "How do I deploy my MkDocs site to GitHub Pages?"
answer, paths = answer_question(q, k=5)

# One print call yields the same layout: each heading, then its value on the next line.
print("QUESTION:", q, "\nANSWER:", answer, "\nSOURCES:", sep="\n")
for source_path in paths:
    print("-", source_path)


QUESTION:
How do I deploy my MkDocs site to GitHub Pages?

ANSWER:
To deploy your MkDocs site to GitHub Pages:

1.  **Checkout the primary working branch** of the git repository where you maintain your source documentation (usually `master`). (Source 2)
2.  **Run the `mkdocs gh-deploy` command**. (Source 1, Source 2)

This command will build your docs, commit them to the `gh-pages` branch, and push that branch to GitHub. (Source 1)

For user or organization pages, you might need to navigate to the `orgname.github.io` repository and use a command like:

```sh
cd ../orgname.github.io/
mkdocs gh-deploy --config-file ../my-project/mkdocs.yml --remote-branch master
```
(Source 3)

You can use `mkdocs gh-deploy --help` to see all available options for the command. (Source 1)

SOURCES:
- user-guide\deploying-your-docs.md
- user-guide\deploying-your-docs.md
- user-guide\deploying-your-docs.md
- getting-started.md
- user-guide\deploying-your-docs.md


In [23]:
pip install google-genai

Note: you may need to restart the kernel to use updated packages.


In [24]:
%%writefile rag.py


import os

import streamlit as st
from dotenv import load_dotenv

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from google import genai


# === 1. Load API key ===
# Variables from a local .env file (if any) are loaded into the environment first.
load_dotenv()
api_key_google = os.getenv("GOOGLE_API_KEY")

# Stop right away when the key is missing or empty; nothing below works without it.
if api_key_google is None or api_key_google == "":
    raise ValueError("GOOGLE_API_KEY not found in environment. Check your .env file.")

# Gemini client used by answer_question() to generate the final answer.
client = genai.Client(api_key=api_key_google)

# === 2. Chroma client & collection (load existing index) ===
# Opens the on-disk store in mkdocs_db/ (path is relative to the working directory).
chroma_client = chromadb.PersistentClient(path="mkdocs_db/")

# Same embedding model name as the one used in the notebook when the index was built.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=api_key_google,
    model_name="models/text-embedding-004",
)

# NOTE(review): get_or_create presumably creates an empty "MkDocs" collection when none exists,
# so a missing index would show up as empty retrieval rather than as an error — confirm.
collection = chroma_client.get_or_create_collection(
    name="MkDocs",
    embedding_function=google_ef,
)

# === 3. RAG logic (same as in notebook) ===

# Behavioural rules for the model: answer only from the retrieved MkDocs context, admit when
# the context is insufficient, and refuse questions that are not about MkDocs.
# answer_question() sends this text to Gemini together with the per-question prompt.
SYSTEM_PROMPT = """
You are an AI assistant specialized ONLY in answering questions about MkDocs,
the static site generator for project documentation.

Rules:
- Use ONLY the information given in the retrieved context from the MkDocs docs.
- If the answer is not clearly supported by the context, say:
  "I don't know based on the MkDocs documentation I have."
- Do NOT answer questions about anything outside MkDocs (no general chat, no other frameworks).
- If the user asks about something unrelated to MkDocs, politely refuse and say that you only support MkDocs documentation questions.
- Prefer short, clear, step-by-step explanations when appropriate.
"""

def build_context_block(result):
    """Format a Chroma query result as the numbered context block placed in the prompt.

    Only the hits of the first query are read (``result["documents"][0]`` and
    ``result["metadatas"][0]``).

    Args:
        result: Mapping with ``"documents"`` and ``"metadatas"`` keys, each holding one
            list of hits per query, as returned by ``collection.query``.

    Returns:
        str: For every hit, a ``Source <n> (path: <path>):`` header, the chunk text and a
        blank line. Empty string when there are no hits.
    """
    docs = result["documents"][0]
    metas = result["metadatas"][0]

    sections = []
    for i, (text, meta) in enumerate(zip(docs, metas), start=1):
        # A hit without stored metadata may come back as None: treat it as "no path".
        path = (meta or {}).get("path", "")
        sections.append(f"Source {i} (path: {path}):\n{text}\n\n")
    return "".join(sections)

def answer_question(query: str, k: int = 4):
    """Answer a MkDocs question from the chunks retrieved out of the Chroma collection.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve and include as context.

    Returns:
        tuple: ``(answer, sources)`` where ``answer`` is the model's text and ``sources``
        is a list of ``(path, chunk_text)`` pairs in retrieval order (the same order as
        the "Source <n>" numbering in the prompt).
    """
    # 1) retrieve from Chroma
    result = collection.query(
        query_texts=[query],
        n_results=k,
    )

    context_str = build_context_block(result)

    # 2) human prompt
    human_prompt = f"""
You are given some context taken from the official MkDocs documentation:

{context_str}

User question: {query}

Instructions:
- Answer ONLY using the context above.
- If the context is not enough, say you don't know.
- If the question is not about MkDocs, refuse and say you only answer MkDocs questions.
- When possible, mention which source paths you used.
"""

    # 3) call Gemini
    # The rules are passed as a system instruction rather than as an extra piece of user
    # content, so the model treats them as instructions and not as part of the question.
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=human_prompt,
        config={"system_instruction": SYSTEM_PROMPT},
    )

    # response.text may be None when the model returns no text; always hand back a string.
    answer = response.text or "The model did not return an answer."
    # A chunk stored without metadata may come back as None: report an empty path for it.
    source_paths = [(m or {}).get("path", "") for m in result["metadatas"][0]]

    # also return the raw chunks so the UI can show them next to their path
    docs = result["documents"][0]

    return answer, list(zip(source_paths, docs))


# === 4. Streamlit UI ===

# Page setup comes first, before any other Streamlit element is rendered.
st.set_page_config(page_title="MkDocs RAG Assistant", page_icon="📚")

st.title("📚 MkDocs RAG Assistant")
st.write("Ask questions about MkDocs documentation. The assistant only answers using the official MkDocs docs embedded in ChromaDB.")

query = st.text_area("Your question about MkDocs:", height=80, placeholder="Example: How do I deploy MkDocs to GitHub Pages?")

col1, col2 = st.columns(2)
with col1:
    # Number of chunks retrieved from Chroma and passed to the model as context.
    k = st.slider("Number of neighbors (k)", min_value=1, max_value=8, value=4, step=1)
with col2:
    run_button = st.button("Ask", type="primary")

if run_button:
    question = query.strip()
    if not question:
        # Give feedback instead of silently ignoring a click with an empty question.
        st.warning("Please enter a question about MkDocs first.")
    else:
        with st.spinner("Thinking..."):
            try:
                answer, sources = answer_question(question, k=k)
            except Exception as e:
                # Retrieval or generation failed: show the error in the page instead of crashing.
                st.error(f"Error while answering: {e}")
            else:
                st.subheader("Answer")
                st.write(answer)

                # One collapsible panel per retrieved chunk, labelled with its source path.
                st.subheader("Sources from MkDocs docs")
                for i, (path, text) in enumerate(sources, start=1):
                    with st.expander(f"Source {i}: {path}"):
                        st.write(text)


Overwriting rag.py
