In [36]:
import os
import re
import json
import yaml
import sys

def parse_k8s_doc(file_path: str) -> dict:
    """
    Parse a Kubernetes doc file into a JSON-friendly dict.
    YAML front matter is optional.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    metadata = {}
    body = content

    # Detect YAML front matter
    parts = re.split(r"^---\s*$", content, flags=re.MULTILINE)
    if len(parts) >= 3:
        yaml_block = parts[1]
        body = parts[2]
        try:
            metadata = yaml.safe_load(yaml_block) or {}
        except Exception:
            metadata = {}

    # Clean markdown body (remove Hugo shortcodes like {{< ... >}})
    cleaned_body = re.sub(r"\{\{<.*?>\}\}", "", body)
    cleaned_body = re.sub(r"\{\{%.*?%\}\}", "", cleaned_body)
    cleaned_body = re.sub(r"\{\{<\s*/.*?>\}\}", "", cleaned_body)
    cleaned_body = cleaned_body.strip()

    # Build structured JSON
    doc_json = {
        "title": metadata.get("title") or os.path.basename(file_path),
        "reviewers": metadata.get("reviewers", []),
        "api_metadata": metadata.get("api_metadata", []),
        "feature": metadata.get("feature", {}),
        "description": metadata.get("description", ""),
        "content_type": metadata.get("content_type", ""),
        "weight": metadata.get("weight", ""),
        "body": cleaned_body,
        "source_file": os.path.basename(file_path),
    }

    return doc_json


def ingest_directory(input_dir: str, output_file: str):
    """
    Ingest all Kubernetes doc files in a directory and write them into one JSON file.
    """
    all_docs = []

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith((".md", ".markdown", ".txt")):
                try:
                    doc = parse_k8s_doc(os.path.join(root, file))
                    all_docs.append(doc)
                except Exception as e:
                    print(f"⚠️ Skipping {file}: {e}")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_docs, f, indent=2, ensure_ascii=False)

    print(f"✅ Ingested {len(all_docs)} documents into {output_file}")


def running_in_jupyter() -> bool:
    """Check if the script is running inside a Jupyter notebook."""
    try:
        from IPython import get_ipython
        if "IPKernelApp" in get_ipython().config:
            return True
    except Exception:
        return False
    return False


if __name__ == "__main__":
    if running_in_jupyter():
        # Default paths for notebook runs
        input_dir = "../../glossary/"
        output_file = "../data_gloss/k8s_docs.json"
        ingest_directory(input_dir, output_file)
    else:
        import argparse

        parser = argparse.ArgumentParser(description="Ingest Kubernetes docs into JSON")
        parser.add_argument("--input_dir", required=True, help="../../glossary/")
        parser.add_argument("--output_file", required=True, help="/Users/sindhu/Documents/llm-zoomcamp/KubeRAG/k8s-assistant/data_gloss/")

        args = parser.parse_args()
        ingest_directory(args.input_dir, args.output_file)


✅ Ingested 159 documents into ../data_gloss/k8s_docs.json


In [37]:
import json
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure text splitter (approx. ~800 chars per chunk, with overlap for context)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)

def transform_doc(doc: dict) -> dict:
    """Clean up one doc: drop unused fields and chunk body."""
    body = doc.get("body", "")

    # Clean out Hugo/shortcode tags (defensive, in case not cleaned before)
    cleaned_body = re.sub(r"\{\{<.*?>\}\}", "", body)
    cleaned_body = re.sub(r"\{\{%.*?%\}\}", "", cleaned_body)
    cleaned_body = re.sub(r"\{\{<\s*/.*?>\}\}", "", cleaned_body)
    cleaned_body = cleaned_body.strip()

    # Split into chunks
    chunks = splitter.split_text(cleaned_body) if cleaned_body else []

    return {
        "title": doc.get("title", ""),
        "content_type": doc.get("content_type", ""),
        "source_file": doc.get("source_file", ""),
        "weight": doc.get("weight", ""),
        "chunks": chunks
    }

def transform_json(input_file: str, output_file: str):
    """Read JSON, transform all docs, and save new JSON."""
    with open(input_file, "r", encoding="utf-8") as f:
        docs = json.load(f)

    transformed = [transform_doc(doc) for doc in docs]

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(transformed, f, indent=2, ensure_ascii=False)

    print(f"✅ Transformed {len(transformed)} docs into {output_file}")


if __name__ == "__main__":
    # Default paths for notebook runs
    input_dir = "../data_gloss/k8s_docs.json"
    output_file = "../data_gloss/k8s_docs_parsed.json"
    transform_json(input_dir, output_file)

✅ Transformed 159 docs into ../data_gloss/k8s_docs_parsed.json


In [26]:
import json
from pathlib import Path

# Paths
input_file = Path("../data_sample/k8s_docs_parsed.json")    # your input JSON file (list of dicts)
output_file = Path("../data_sample/k8s_docs_cleaned2.json") # output file

# Read the input list of dicts
with input_file.open("r", encoding="utf-8") as f:
    data = json.load(f)

# Convert 'chunks' from list to string for each dict
for item in data:
    if "chunks" in item and isinstance(item["chunks"], list):
        item["chunks"] = "\n\n".join(item["chunks"])

# Write the updated list back as a single JSON array
with output_file.open("w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ Updated JSON saved to {output_file}")


✅ Updated JSON saved to ../data_sample/k8s_docs_cleaned2.json


In [27]:
import json
import re
import textwrap

MAX_CHUNK_SIZE = 800  # configurable

def process_text(text: str) -> str:
    """Replace escaped \n with real newlines."""
    text = text.replace("\\n\\n", "\n\n")
    text = text.replace("\\n", "\n")
    return text

def smart_chunk(text: str, max_size: int = MAX_CHUNK_SIZE):
    """
    Group text into semantically meaningful chunks:
    - Headings, lists, and code blocks stay grouped.
    - Wrap long sections if too big.
    """
    lines = text.splitlines()
    chunks, current_chunk = [], []

    for line in lines:
        stripped = line.strip()

        # Start a new chunk if line is a heading
        if re.match(r"^#{1,6}\s", stripped) or re.match(r"^[A-Z].*$", stripped) and len(stripped.split()) < 10:
            if current_chunk:
                chunks.append("\n".join(current_chunk).strip())
                current_chunk = []
            current_chunk.append(stripped)

        # Lists and numbered items stay with their block
        elif re.match(r"^(\d+\.|\-)\s", stripped):
            current_chunk.append(stripped)

        # Code block or indented text
        elif stripped.startswith("```") or line.startswith("    ") or line.startswith("\t"):
            current_chunk.append(line)

        # Normal text
        else:
            current_chunk.append(line)

    if current_chunk:
        chunks.append("\n".join(current_chunk).strip())

    # Now enforce max_size
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > max_size:
            wrapped = textwrap.wrap(chunk, width=max_size, break_long_words=False, break_on_hyphens=False)
            final_chunks.extend(wrapped)
        else:
            final_chunks.append(chunk)

    return final_chunks

def transform_records(input_file: str, output_file: str):
    with open(input_file, "r", encoding="utf-8") as f:
        records = json.load(f)

    for record in records:
        raw_text = record.get("chunks", "")
        processed_text = process_text(raw_text)
        split_chunks = smart_chunk(processed_text, MAX_CHUNK_SIZE)

        record["chunks"] = [
            {"id": f"{record['source_file']}_{i}", "text": chunk}
            for i, chunk in enumerate(split_chunks)
        ]

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    transform_records("../data_sample/k8s_docs_cleaned2.json", "../data_sample/k8s_docs_chunked.json")


In [28]:
with open('../data_sample/k8s_docs_chunked.json', 'rt') as f_in:
    docss = json.load(f_in)

In [29]:
len(docss)

410

In [30]:
docs = []
i = 0
for record in docss:
    for chunk in record["chunks"]:
        docs.append({
            "id": i,
            "title": record["title"],
            "content_type": record["content_type"],
            "source_file": record["source_file"],
            "weight": record["weight"],
            "chunk_id": chunk["id"],
            "chunk_text": chunk["text"]
        })
        i += 1

In [31]:
len(docs)

10097

In [33]:
with open("../data_sample/k8s_docs_chunked_complete.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, indent=2, ensure_ascii=False)