# Notebook Reducer
This notebook lets us take text files from one folder,
annotate them, and move them to another folder as Markdown.

We should not delete the original notes, but we should definitely keep an index of the notes already processed so that we can go back and redo them if something fails.

The folders will be `inbox` (the files that were put in), `knowledge` (the files coming out), and `config` (the configuration files needed).

The system will categorize the notes, and if it finds an existing note with the same category, it adds the new note to it.
To save money and space, we will probably do all of the note reduction first, and then write something else to refine the output folder.



In [None]:
"""
organize_notes_chatgpt.py
Summarize, categorize, and organize .txt files into Markdown folders using ChatGPT (via LangChain).
ChatGPT generates the ENTIRE Markdown (with YAML front-matter) for both new notes and merges.

Setup:
  pip install langchain langchain-openai pydantic PyYAML
  export OPENAI_API_KEY=...
Run:
  python organize_notes_chatgpt.py --src ./inbox --dest ./knowledge --dry-run
"""


In [1]:
import argparse
import asyncio
import datetime as dt
import glob
import hashlib
import os
import re
import shutil
from pathlib import Path
from typing import List, Optional, Tuple

In [2]:
import yaml

# LangChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [3]:

# ---------------------------
# Helpers
# ---------------------------

def now_iso() -> str:
    """Return the current local time as an ISO 8601 string.

    The value is timezone-aware (it carries the local UTC offset) and is
    truncated to whole seconds.
    """
    local_now = dt.datetime.now().astimezone()
    whole_seconds = local_now.replace(microsecond=0)
    return whole_seconds.isoformat()

def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

def read_text(path: Path) -> str:
    """Read *path* as UTF-8 text and return its contents.

    Bytes that are not valid UTF-8 are silently dropped rather than raising.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def write_text(path: Path, text: str):
    """Write *text* to *path* as UTF-8, creating missing parent directories.

    An existing file at *path* is overwritten.
    """
    parent_dir = path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    path.write_text(text, encoding="utf-8")

def ensure_dir(p: Path):
    """Create directory *p* and any missing ancestors.

    Does nothing if the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

def list_markdown_files(root: Path) -> List[Path]:
    """Return every ``*.md`` path found under *root*, searched recursively.

    Args:
        root: Directory to search. It does not have to exist; a missing
            directory simply yields an empty list.

    Returns:
        The matching paths in sorted order, so results are reproducible
        across runs and platforms.
    """
    # Escape the root so glob metacharacters in a directory name such as
    # "notes [2024]" are matched literally instead of being read as a pattern.
    literal_root = Path(glob.escape(str(root)))
    pattern = str(literal_root / "**" / "*.md")
    return sorted(Path(match) for match in glob.glob(pattern, recursive=True))

# A line consisting only of '---' (optionally followed by whitespace).
FM_BOUNDARY = re.compile(r"^---\s*$", re.M)

def parse_front_matter(md_text: str) -> Tuple[dict, str]:
    """Split a Markdown document into its YAML front-matter and its body.

    Front-matter is recognized only when the first '---' boundary line is
    preceded by nothing but whitespace (or a BOM) and a second boundary line
    closes it. Later '---' lines are left untouched in the body, so Markdown
    horizontal rules survive parsing.

    Args:
        md_text: Full text of the Markdown file.

    Returns:
        ``(front_matter, body)``. ``front_matter`` is always a dict; it is
        empty when there is no front-matter block, when the YAML cannot be
        parsed, or when the YAML is not a mapping. When no front-matter block
        is found, ``body`` is *md_text* unchanged; otherwise it is the text
        after the closing boundary with leading newlines removed.
    """
    opening = FM_BOUNDARY.search(md_text)
    # Anything other than whitespace before the first boundary means that
    # boundary is a horizontal rule in the body, not the start of front-matter.
    if opening is None or md_text[:opening.start()].lstrip("\ufeff").strip():
        return {}, md_text

    closing = FM_BOUNDARY.search(md_text, opening.end())
    if closing is None:
        return {}, md_text

    yml = md_text[opening.end():closing.start()]
    body = md_text[closing.end():].lstrip("\n")
    try:
        fm = yaml.safe_load(yml)
    except Exception:
        # Deliberately best-effort: malformed front-matter is treated as absent
        # rather than aborting the whole run.
        fm = {}
    # safe_load returns None for empty input and may return a str/list for
    # YAML that is not a mapping; callers expect a dict.
    if not isinstance(fm, dict):
        fm = {}
    return fm, body

def path_for(dest: Path, folder: str, slug: str) -> Path:
    """Build the output path ``dest/<folder>/<slug>.md`` from untrusted names.

    Args:
        dest: Root directory of the knowledge base.
        folder: Slash-separated folder name, e.g. ``"Domain/Subdomain"``.
        slug: Desired file stem; lower-cased and reduced to ``[a-z0-9-]``.

    Returns:
        A path that always stays below *dest*. Folder components that are
        empty after sanitizing (``".."``, ``"!!!"``, empty segments) are
        dropped, and a slug that sanitizes to nothing becomes ``"untitled"``.
    """
    components = []
    for raw in folder.split("/"):
        # Keep only filesystem-safe characters, then trim spaces and dots so
        # "." / ".." and names with trailing dots or spaces cannot survive.
        cleaned = re.sub(r"[^A-Za-z0-9._ -]", "", raw).strip(" .")
        # Filter AFTER sanitizing: an empty leading component would otherwise
        # yield a leading "/" and make the joined path absolute, escaping dest.
        if cleaned:
            components.append(cleaned)

    safe_slug = re.sub(r"[^a-z0-9-]", "-", slug.lower()).strip("-")
    if not safe_slug:
        # Avoid producing a hidden, nameless ".md" file.
        safe_slug = "untitled"
    return dest.joinpath(*components, f"{safe_slug}.md")

In [None]:
# ---------------------------
# LLM Chains (ChatGPT writes markdown)
# ---------------------------

# System prompt for creating a brand-new note: it tells the model to return a
# complete Markdown document with YAML front-matter (title, slug, folder,
# tags, merge_key, created_at, updated_at) followed by a fixed body layout.
# NOTE(review): the text contains a literal '{title}'. If this string is passed
# to ChatPromptTemplate (imported above) with brace-style templating, that
# would presumably be treated as an input variable -- confirm it is either
# supplied by the caller or should be escaped as '{{title}}'.
SYSTEM_NEW = """You are a meticulous knowledge base editor.
Create a COMPLETE Markdown document with YAML front-matter for a personal knowledge base.
Strict requirements:
- Begin with YAML front-matter delimited by '---' on its own lines.
- Front-matter MUST include:
  title (string),
  slug (kebab-case, <=64 chars, filesystem-safe),
  folder (e.g., "Domain/Subdomain" with 1–2 levels, no leading slash),
  tags (string list),
  merge_key (stable canonical key for this topic),
  created_at (ISO 8601),
  updated_at (ISO 8601).
- Body structure:
  '# {title}'
  '## Summary' (3–6 sentences)
  '## Key Points' (5–10 bullets, terse, no duplicates)
  Optionally '## Source Text' in fenced block if requested.
- Avoid PII in title/tags. Prefer specific, concrete titles. If no clear category, folder='Misc'.
- Return ONLY the Markdown file content.
"""

# User prompt for creating a new note. Placeholders: {content} (raw note
# text), {include_fulltext} (whether to add the source-text section),
# {created_at} and {updated_at} (ISO timestamps to put in the front-matter).
USER_NEW = """Create the complete Markdown from this raw text.

Raw text:
---
{content}
---

Include full source text section: {include_fulltext}
Timestamps to set (ISO): created_at={created_at}, updated_at={updated_at}
"""

# System prompt for merging new raw text into an existing note: it tells the
# model to return the fully updated Markdown file, keeping merge_key stable,
# refreshing 'updated_at', deduplicating content and appending an update
# section.
# NOTE(review): the text contains a literal '{timestamp}', while USER_MERGE
# names its timestamp placeholder '{updated_at_iso}'. If this string is used as
# a brace-style template, '{timestamp}' would presumably need to be supplied
# separately or escaped as '{{timestamp}}' -- confirm against the caller.
SYSTEM_MERGE = """You are a careful editor merging a new source into an existing Markdown note.
Return a FULLY UPDATED Markdown file that:
- Preserves and updates YAML front-matter. Keep existing title/slug/folder/merge_key unless the new source clearly improves them; always keep merge_key the same.
- Update 'updated_at' to the provided ISO timestamp.
- Deduplicate bullets and facts across versions; keep content crisp.
- Keep existing sections; append an '## Update — {timestamp}' section summarizing new information in 2–5 bullets.
- If '## Key Points' exists, refresh the list to include the most important, deduped points (max 12 bullets).
- If requested, include a 'New Source Text' fenced block under the update.
- Maintain clean Markdown. Return ONLY the Markdown file.
"""

# User prompt for merging. Placeholders: {current_md} (existing Markdown
# file), {new_text} (raw text to merge in), {include_fulltext} (whether to add
# the 'New Source Text' block) and {updated_at_iso} (new 'updated_at' value).
USER_MERGE = """Here is the current Markdown file:

<current_md>
{current_md}
</current_md>

Here is the new raw text to merge:
<new_text>
{new_text}
</new_text>

Include 'New Source Text' block: {include_fulltext}
New updated_at to set (ISO): {updated_at_iso}
"""