<a href="https://colab.research.google.com/github/prosy/Augmented-Worlds/blob/main/Copy_of_Mazda_RAG_Orchestration_MVP_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mazda Owner's Manual RAG — **MVP Orchestration Notebook (v3)**

**Goal:** Keep Mazda-specific structure, but include a tiny in-notebook BM25 test and clear comments for swapping to FAISS/ColBERT.
Outputs export to `/app/` so your live app can use them immediately.

## 0) Setup (install if needed)

Notebook is dependency-light. You can run as-is for the demo (BM25-ish). If you want FAISS/ColBERT, install and wire in the optional cells later.

## 1) Mount Google Drive

In [None]:
from pathlib import Path
import json, csv, re, time, shutil, math

from google.colab import drive
drive.mount('/content/drive')

## 2) Config Paths

In [None]:
from pathlib import Path

# Set project output root explicitly
WORK_ROOT = Path("/content/drive/MyDrive/AugWorlds/Mazda_PDFs/")

# Define subdirectories inside Mazda_PDFs for pipeline outputs
PARSED_DIR = WORK_ROOT / "parsed"
ENRICHED_DIR = WORK_ROOT / "enriched"
INDEX_DIR = WORK_ROOT / "index"
APP_DIR = WORK_ROOT / "app"  # app configs also stored here

# Mazda-specific input PDF
INPUT_PDF = WORK_ROOT / "2024-cx-50-owners-manual.pdf"  # Mazda PDF lives in same folder
MODEL_YEAR = "2024"  # for configs if needed

# Config files used by the live app
SYNONYMS_CSV = APP_DIR / "config" / "synonyms.csv"
ANS_PLANS_JSON = APP_DIR / "config" / "answer_plans.json"
RETRIEVAL_CFG = APP_DIR / "config" / "retrieval_config.yaml"

# Ensure output directories exist
for d in [PARSED_DIR, ENRICHED_DIR, INDEX_DIR, APP_DIR, APP_DIR / "config"]:
    d.mkdir(parents=True, exist_ok=True)

print("WORK_ROOT:", WORK_ROOT)
print("INPUT_PDF:", INPUT_PDF)

if INPUT_PDF.is_file():
    print(f"✅ Found Mazda PDF: {INPUT_PDF}")
else:
    print(f"❌ PDF not found or path incorrect: {INPUT_PDF}")

## 3) Parsing: chunk pages + harvest TOC lines

In [None]:
# Replace with your real parser (PyMuPDF/Tika). Demo JSONL for structure.

# Helper function to extract metadata from filename
def extract_metadata_from_filename(pdf_path):
    import re
    import os
    filename = os.path.basename(pdf_path)
    # Updated pattern to handle more filename variations
    patterns = [
        r'(\d{4})-([a-z0-9]+)-owners-manual',
        r'(\d{4})-mazda([a-z0-9]+)-',
        r'(\d{4})-([a-z0-9-]+)-owners'
    ]

    for pattern in patterns:
        match = re.search(pattern, filename.lower())
        if match:
            year, model_match = match.groups()
            # Clean up model string
            model = re.sub(r'[^a-z0-9]', '', model_match)
            return {"filename": filename, "year": int(year), "model": model, "page_count": None}

    # Default return if no pattern matches
    return {"filename": filename, "year": None, "model": None, "page_count": None}


# Find all PDF files in the WORK_ROOT directory
pdf_files = list(WORK_ROOT.glob("*.pdf"))

processed_parsed_files = []

if not pdf_files:
    print(f"No PDF files found in {WORK_ROOT}. Please ensure your PDF(s) are in this directory.")
else:
    print(f"Found {len(pdf_files)} PDF file(s) to process.")

    for pdf_file in pdf_files:
        print(f"\n{'='*60}")
        print(f"Processing file: {pdf_file.name}")
        print(f"{'='*60}")

        # Extract metadata from the filename
        metadata = extract_metadata_from_filename(pdf_file)
        print(f"Extracted metadata: {metadata}")

        # Define the output path for the parsed data for THIS file
        # Use extracted metadata to name the output file
        year_str = metadata.get('year', 'unknown')
        model_str = metadata.get('model', 'unknown')
        PARSED_JSONL_FOR_FILE = PARSED_DIR / f"sections_{year_str}_{model_str}.jsonl"
        processed_parsed_files.append(PARSED_JSONL_FOR_FILE)


        # Replace this with your actual PDF parsing logic that would return sections for this file
        # For the demo, we'll create sample data based on the metadata
        file_demo_sections = [
            {"manual_ref": "1-1", "title": f"Introduction ({metadata.get('year')} {metadata.get('model')})", "text": f"Welcome to your {metadata.get('year')} {metadata.get('model')} Mazda owner's manual.", "metadata": metadata},
            {"manual_ref": "2-14", "title": "TPMS", "text": "TPMS monitors tire pressure and warns if it is low.", "metadata": metadata},
            {"manual_ref": "3-2", "title": "Engine Oil", "text": "Use 0W-20. Check level regularly.", "metadata": metadata},
        ]

        # Write demo sections for THIS file to its specific JSONL file
        with open(PARSED_JSONL_FOR_FILE, "w", encoding="utf-8") as f:
            for i, row in enumerate(file_demo_sections):
                # Assign a unique _id for sections within this file (or a global one if preferred later)
                row["_id"] = i
                f.write(json.dumps(row) + "\n")

        print(f"Parsed (using demo data and filename metadata) → {PARSED_JSONL_FOR_FILE}")

    # Store the list of processed parsed files for the next step
    # This could be saved to a file or passed as a variable
    # For now, we'll rely on the next cell finding files in PARSED_DIR
    print("\nFinished processing all PDF files with demo data.")

## 4) Enrichment: fill `manual_ref` for Top-20 plans (per year/manual)

In [None]:
def load_synonyms(path=SYNONYMS_CSV):
    norm = {}
    if path.exists():
        import csv
        with open(path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                canonical = row["canonical"].strip()
                for alias in row["alias_list"].split("|"):
                    norm[alias.strip().lower()] = canonical.lower()
    return norm

def normalize_text(text: str, norm_map: dict):
    out = text
    for alias, canon in norm_map.items():
        out = re.sub(rf"\b{re.escape(alias)}\b", canon, out, flags=re.IGNORECASE)
    return out

norm_map = load_synonyms()

# Find all parsed JSONL files created in Section 2
parsed_files = list(PARSED_DIR.glob("sections_*.jsonl"))

if not parsed_files:
    print(f"No parsed files found in {PARSED_DIR}. Please run Section 2 first.")
else:
    print(f"Found {len(parsed_files)} parsed file(s) to enrich.")

    for parsed_file in parsed_files:
        print(f"\n{'='*60}")
        print(f"Enriching file: {parsed_file.name}")
        print(f"{'='*60}")

        # Define the output path for the enriched data for THIS file
        enriched_file_name = parsed_file.name.replace(".jsonl", ".enriched.jsonl")
        ENRICHED_JSONL_FOR_FILE = ENRICHED_DIR / enriched_file_name

        with open(parsed_file, "r", encoding="utf-8") as fin, open(ENRICHED_JSONL_FOR_FILE, "w", encoding="utf-8") as fout:
            for line in fin:
                rec = json.loads(line)
                rec["text_norm"] = normalize_text(rec["text"], norm_map)
                # Metadata is already included from Section 2
                fout.write(json.dumps(rec) + "\n")

        print(f"Enriched → {ENRICHED_JSONL_FOR_FILE}")

    print("\nFinished enriching all parsed files.")

## 5) Indexing: build lightweight retrieval artifacts (BM25-ish demo)

*This cell provides an in-notebook BM25-like index so the notebook works out-of-the-box.*

**Swap to FAISS/ColBERT**:
- FAISS: create vectors for `text_norm`, save `faiss.index` + `docs.jsonl` under `app/index/`.
- ColBERT: run your existing CLI to build an index and then point to it via `app/config/retrieval_config.yaml`.
Keep the *calling interface* below so your app code remains unchanged.

In [11]:
from collections import defaultdict
import json
import math
import re
from pathlib import Path

# Define output paths for the index and docs files (using a generic name for combined data)
INDEX_JSON = INDEX_DIR / "bm25_index_combined.json"
DOCS_JSON = INDEX_DIR / "docs_combined.jsonl"

# Load docs from all enriched files
docs = []
enriched_files = list(ENRICHED_DIR.glob("*.enriched.jsonl"))

if not enriched_files:
    print(f"No enriched files found in {ENRICHED_DIR}. Please run Section 3 first.")
else:
    print(f"Found {len(enriched_files)} enriched file(s) to index.")
    for enriched_file in enriched_files:
        print(f"Loading data from {enriched_file.name}...")
        with open(enriched_file, "r", encoding="utf-8") as f:
            for line in f:
                rec = json.loads(line)
                # Ensure a unique _id across all documents if needed,
                # or rely on the original _id if it's unique per manual and you handle it later.
                # For simplicity here, we'll re-assign a global _id.
                docs.append(rec)

# Re-assign global _ids after loading all documents
for i, doc in enumerate(docs):
    doc["_id"] = i

N = len(docs)
df = defaultdict(int)
postings = defaultdict(list)   # term -> list of (doc_id, tf)
doc_len = {}

# Re-implement tokenize function here to ensure it's available
def tokenize(s):
    return re.findall(r"[a-z0-9_]+", s.lower())

for d in docs:
    tokens = tokenize(d["text_norm"])
    doc_len[d["_id"]] = len(tokens)
    tf = defaultdict(int)
    for t in tokens:
        tf[t] += 1
    for t, c in tf.items():
        df[t] += 1
        postings[t].append((d["_id"], c))

index_obj = {
    "N": N,
    "df": dict(df),
    "doc_len": {int(k): int(v) for k, v in doc_len.items()},
    "postings": {t: [(int(d), int(tf)) for d, tf in lst] for t, lst in postings.items()},
}

INDEX_DIR.mkdir(parents=True, exist_ok=True)
with open(DOCS_JSON, "w", encoding="utf-8") as f:
    for d in docs:
        f.write(json.dumps(d) + "\n")
with open(INDEX_JSON, "w", encoding="utf-8") as f:
    json.dump(index_obj, f)

print("Index written →", INDEX_JSON)

Found 13 enriched file(s) to index.
Loading data from sections_2024.enriched.jsonl...
Loading data from sections_combined_2024.enriched.jsonl...
Loading data from sections_2023_cx9.enriched.jsonl...
Loading data from sections_2025_3.enriched.jsonl...
Loading data from sections_2025_cx70.enriched.jsonl...
Loading data from sections_2025_cx70phev.enriched.jsonl...
Loading data from sections_2025_cx90.enriched.jsonl...
Loading data from sections_2025_cx90phev.enriched.jsonl...
Loading data from sections_2025_cx30vehicle.enriched.jsonl...
Loading data from sections_2023_cx30.enriched.jsonl...
Loading data from sections_2024_cx50.enriched.jsonl...
Loading data from sections_2023_3.enriched.jsonl...
Loading data from sections_2022_cx9.enriched.jsonl...
Index written → /content/drive/MyDrive/AugWorlds/Mazda_PDFs/index/bm25_index_combined.json


## 6) Retrieval helper (for your app)

BM25-ish search now; swap internals for FAISS/ColBERT and keep the same function signature.

In [13]:
def load_index(path=INDEX_JSON):
    import json
    from pathlib import Path
    return json.loads(Path(path).read_text())

def bm25_search(query, k=5, k1=1.5, b=0.75):
    import math, re, json
    idx = load_index()
    N = idx["N"]
    scores = __import__("collections").defaultdict(float)
    tokens = re.findall(r"[a-z0-9_]+", query.lower())
    avgdl = sum(idx["doc_len"].values())/max(1, N)

    # Load docs into a dictionary keyed by their _id
    doc_map = {}
    with open(DOCS_JSON, "r", encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            doc_map[doc["_id"]] = doc

    for t in tokens:
        df = idx["df"].get(t, 0)
        if df == 0:
            continue
        idf = math.log((N - df + 0.5) / (df + 0.5) + 1)
        for doc_id, tf in idx["postings"].get(t, []):
            # Retrieve doc length from the doc_map using the correct doc_id
            dl = len(re.findall(r"[a-z0-9_]+", doc_map[doc_id]["text_norm"].lower()))
            denom = tf + k1*(1 - b + b*dl/avgdl)
            scores[doc_id] += idf * (tf*(k1+1))/denom
    top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:k]
    # map to docs
    results = []
    for doc_id, score in top:
        d = doc_map[doc_id]
        results.append({
            "doc_id": doc_id, "score": score,
            "manual_ref": d["manual_ref"], "title": d["title"], "snippet": d["text_norm"][:240],
            "year": d.get("metadata", {}).get("year"), # Add year from metadata
            "model": d.get("metadata", {}).get("model") # Add model from metadata
        })
    return results

print(bm25_search("tpms warning light"))

[{'doc_id': 1, 'score': 0.9996428232642339, 'manual_ref': '2-14', 'title': 'TPMS', 'snippet': 'TPMS monitors tire pressure and warns if it is low.', 'year': None, 'model': None}, {'doc_id': 4, 'score': 0.9996428232642339, 'manual_ref': '2-14', 'title': 'TPMS', 'snippet': 'TPMS monitors tire pressure and warns if it is low.', 'year': 2023, 'model': 'cx9'}, {'doc_id': 7, 'score': 0.9996428232642339, 'manual_ref': '2-14', 'title': 'TPMS', 'snippet': 'TPMS monitors tire pressure and warns if it is low.', 'year': 2025, 'model': '3'}, {'doc_id': 10, 'score': 0.9996428232642339, 'manual_ref': '2-14', 'title': 'TPMS', 'snippet': 'TPMS monitors tire pressure and warns if it is low.', 'year': 2025, 'model': 'cx70'}, {'doc_id': 13, 'score': 0.9996428232642339, 'manual_ref': '2-14', 'title': 'TPMS', 'snippet': 'TPMS monitors tire pressure and warns if it is low.', 'year': 2025, 'model': 'cx70phev'}]


## 7) One-click: Copy artifacts into `/app/` layout

In [14]:

def export_to_app():
    targets = [
        (PARSED_DIR, APP_DIR / "data" / "parsed"),
        (ENRICHED_DIR, APP_DIR / "data" / "enriched"),
        (INDEX_DIR, APP_DIR / "index"),
    ]
    for src, dst in targets:
        dst.mkdir(parents=True, exist_ok=True)
        for p in src.glob("**/*"):
            if p.is_file():
                rel = p.relative_to(src)
                (dst / rel).parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(p, dst / rel)
    print("Export complete →", APP_DIR.resolve())

export_to_app()


Export complete → /content/drive/MyDrive/AugWorlds/Mazda_PDFs/app
