# Notebook Reducer
This notebook allows us to grab text files in one folder, 
annotate them, and move them to another folder in markdown.

We should not delete the original notes, but we should definitely keep an index of the notes already processed so we can go back and redo them if something fails.

The folders will be: inbox (files that were put in), knowledge (the files coming out), and config (configuration files needed).

The system will categorize the notes, and if it finds an existing note with the same category, it adds the new note to it.
To save money and space, we will probably do all the reduction of notes first, then write something else to refine the output folder.



In [None]:
"""
organize_notes_chatgpt.py
Summarize, categorize, and organize .txt files into Markdown folders using ChatGPT (via LangChain).
ChatGPT generates the ENTIRE Markdown (with YAML front-matter) for both new notes and merges.

Setup:
  pip install langchain langchain-openai pydantic PyYAML
  export OPENAI_API_KEY=...
Run:
  python organize_notes_chatgpt.py --src ./inbox --dest ./knowledge --dry-run
"""


In [1]:
import argparse
import asyncio
import datetime as dt
import glob
import hashlib
import os
import re
import shutil
from pathlib import Path
from typing import List, Optional, Tuple

In [2]:
import yaml

# LangChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [3]:

# ---------------------------
# Helpers
# ---------------------------

def now_iso() -> str:
    """Return the current local time as a timezone-aware ISO 8601 string.

    The value is truncated to whole seconds, so no fractional part appears
    in the output.
    """
    local_now = dt.datetime.now().astimezone()
    return local_now.isoformat(timespec="seconds")

def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def read_text(path: Path) -> str:
    """Read *path* as UTF-8 text, silently dropping bytes that fail to decode."""
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def write_text(path: Path, text: str):
    """Write *text* to *path* as UTF-8, creating missing parent directories first."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)

def ensure_dir(p: Path):
    """Create directory *p* and any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def list_markdown_files(root: Path) -> List[Path]:
    """Return every ``*.md`` file below *root* (searched recursively), sorted by path.

    Args:
        root: Directory to search. A directory that does not exist yields an
            empty list.

    Returns:
        The matching paths in sorted order, so callers that pick the first
        hit behave the same way on every run.
    """
    # Escape glob metacharacters in the root itself: without this a directory
    # such as "notes[2024]" is read as a character class and matches nothing.
    pattern = os.path.join(glob.escape(str(root)), "**", "*.md")
    return sorted(Path(hit) for hit in glob.glob(pattern, recursive=True))

# A front-matter delimiter: a line consisting of '---' plus optional trailing whitespace.
FM_BOUNDARY = re.compile(r"^---\s*$", re.M)

def parse_front_matter(md_text: str) -> Tuple[dict, str]:
    """Split a Markdown document into its YAML front-matter and its body.

    Args:
        md_text: Full Markdown text, expected to contain a front-matter block
            delimited by two ``---`` lines.

    Returns:
        ``(front_matter, body)``. ``front_matter`` is always a dict; it is
        empty when the YAML cannot be parsed or is not a mapping. When fewer
        than two delimiters are present the result is ``({}, md_text)``.
    """
    # Split on the first two delimiters only. Any later '---' line is a
    # horizontal rule that belongs to the body and must survive a
    # parse/re-serialize round trip.
    parts = FM_BOUNDARY.split(md_text, maxsplit=2)
    if len(parts) < 3:
        return {}, md_text

    yml, rest = parts[1], parts[2]
    body = rest.lstrip("\n")
    try:
        fm = yaml.safe_load(yml) or {}
    except Exception:
        # Deliberately broad and best-effort: the front-matter is model
        # generated, and a malformed block should degrade to "no metadata"
        # rather than abort the run.
        fm = {}
    if not isinstance(fm, dict):
        # Valid YAML that is a list or scalar is not usable as front-matter.
        fm = {}
    return fm, body

def path_for(dest: Path, folder: str, slug: str) -> Path:
    """Build the output path ``dest/<folder>/<slug>.md`` from untrusted parts.

    Args:
        dest: Root directory of the knowledge base.
        folder: '/'-separated folder path. Each component is reduced to
            letters, digits, '.', '_', ' ' and '-', then stripped of
            surrounding spaces and dots.
        slug: File stem. It is lower-cased and every character outside
            ``[a-z0-9-]`` becomes '-'.

    Returns:
        A path that always stays below *dest*. An empty folder places the
        file directly in *dest*; an empty slug falls back to ``untitled``.
    """
    sanitized = (
        re.sub(r"[^A-Za-z0-9._ -]", "", part).strip(" .")
        for part in folder.split("/")
    )
    # Drop components that sanitize to nothing ("..", non-ASCII names, ...).
    # Keeping them would leave a leading '/' in the joined folder, and
    # `dest / "/x"` resolves to the absolute path "/x" outside dest.
    components = [part for part in sanitized if part]

    # An empty stem would produce a hidden ".md" file that "*.md" globbing skips.
    safe_slug = re.sub(r"[^a-z0-9-]", "-", slug.lower()).strip("-") or "untitled"
    return dest.joinpath(*components, f"{safe_slug}.md")

In [4]:
# ---------------------------
# LLM Chains (ChatGPT writes markdown)
# ---------------------------

# System prompt for creating a brand-new note.
# This text is used as a prompt template, so a literal brace pair must be
# doubled: '{{title}}' renders as '{title}'. Left single, it is treated as a
# required input variable that generate_markdown never supplies.
SYSTEM_NEW = """You are a meticulous knowledge base editor.
Create a COMPLETE Markdown document with YAML front-matter for a personal knowledge base.
Strict requirements:
- Begin with YAML front-matter delimited by '---' on its own lines.
- Front-matter MUST include:
  title (string),
  slug (kebab-case, <=64 chars, filesystem-safe),
  folder (e.g., "Domain/Subdomain" with 1–2 levels, no leading slash),
  tags (string list),
  merge_key (stable canonical key for this topic),
  created_at (ISO 8601),
  updated_at (ISO 8601).
- Body structure:
  '# {{title}}'
  '## Summary' (3–6 sentences)
  '## Key Points' (5–10 bullets, terse, no duplicates)
  Optionally '## Source Text' in fenced block if requested.
- Avoid PII in title/tags. Prefer specific, concrete titles. If no clear category, folder='Misc'.
- Return ONLY the Markdown file content.
"""

# User-message template for creating a new note.
# Placeholders filled by generate_markdown: content, include_fulltext,
# created_at, updated_at.
USER_NEW = """Create the complete Markdown from this raw text.

Raw text:
---
{content}
---

Include full source text section: {include_fulltext}
Timestamps to set (ISO): created_at={created_at}, updated_at={updated_at}
"""

# System prompt for merging a new source into an existing note.
# The update heading uses the 'updated_at_iso' variable that merge_markdown
# already passes; the earlier '{timestamp}' placeholder was never supplied,
# which made the prompt fail on a missing input variable.
SYSTEM_MERGE = """You are a careful editor merging a new source into an existing Markdown note.
Return a FULLY UPDATED Markdown file that:
- Preserves and updates YAML front-matter. Keep existing title/slug/folder/merge_key unless the new source clearly improves them; always keep merge_key the same.
- Update 'updated_at' to the provided ISO timestamp.
- Deduplicate bullets and facts across versions; keep content crisp.
- Keep existing sections; append an '## Update — {updated_at_iso}' section summarizing new information in 2–5 bullets.
- If '## Key Points' exists, refresh the list to include the most important, deduped points (max 12 bullets).
- If requested, include a 'New Source Text' fenced block under the update.
- Maintain clean Markdown. Return ONLY the Markdown file.
"""

# User-message template for merging into an existing note.
# Placeholders filled by merge_markdown: current_md, new_text,
# include_fulltext, updated_at_iso.
USER_MERGE = """Here is the current Markdown file:

<current_md>
{current_md}
</current_md>

Here is the new raw text to merge:
<new_text>
{new_text}
</new_text>

Include 'New Source Text' block: {include_fulltext}
New updated_at to set (ISO): {updated_at_iso}
"""

In [5]:
def make_llm(model: str, api_key: Optional[str]) -> ChatOpenAI:
    """Build the chat model client used by the pipeline.

    The temperature is kept low (0.2) for more repeatable output; raising it
    to around 0.3 gives more creative tags and folders.
    """
    settings = {"model": model, "temperature": 0.2, "api_key": api_key}
    return ChatOpenAI(**settings)

async def generate_markdown(llm: ChatOpenAI, content: str, include_fulltext: bool) -> str:
    """Ask the model to write a complete Markdown note for *content*.

    Args:
        llm: Chat model to invoke.
        content: Raw source text of the note.
        include_fulltext: Whether the prompt asks for a full source-text section.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # A brand-new note is created and last-updated at the same instant.
    timestamp = now_iso()
    variables = {
        "content": content,
        "include_fulltext": "true" if include_fulltext else "false",
        "created_at": timestamp,
        "updated_at": timestamp,
    }
    messages = [("system", SYSTEM_NEW), ("user", USER_NEW)]
    chain = ChatPromptTemplate.from_messages(messages) | llm
    reply = await chain.ainvoke(variables)
    return reply.content.strip()

async def merge_markdown(llm: ChatOpenAI, current_md: str, new_text: str, include_fulltext: bool) -> str:
    """Ask the model to merge *new_text* into the existing note *current_md*.

    Args:
        llm: Chat model to invoke.
        current_md: Full text of the Markdown note being updated.
        new_text: Raw source text to fold into the note.
        include_fulltext: Whether the prompt asks for a 'New Source Text' block.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    variables = {
        "current_md": current_md,
        "new_text": new_text,
        "include_fulltext": "true" if include_fulltext else "false",
        "updated_at_iso": now_iso(),
    }
    messages = [("system", SYSTEM_MERGE), ("user", USER_MERGE)]
    chain = ChatPromptTemplate.from_messages(messages) | llm
    reply = await chain.ainvoke(variables)
    return reply.content.strip()

In [6]:
# ---------------------------
# Pipeline
# ---------------------------

class Pipeline:
    """Turn raw ``.txt`` notes under ``src`` into Markdown notes under ``dest``.

    For each source file the model writes a candidate note; its ``merge_key``
    front-matter field decides whether a new note is created or the text is
    merged into an existing note. Processed sources are moved to
    ``dest/_processed_src``.

    Attributes are set from the constructor arguments:
        src: Directory searched recursively for ``*.txt`` files.
        dest: Root directory of the Markdown knowledge base.
        llm: Chat model client built by ``make_llm``.
        dry_run: When true, only print the planned actions.
        include_fulltext: Passed through to the prompt builders.
        dup_action: What to do with extra notes sharing a merge_key:
            'archive', 'delete' or 'none'.
    """

    # Bookkeeping directories below ``dest``; their contents are not live notes.
    _ARCHIVE_DIRNAME = "_archive"
    _PROCESSED_DIRNAME = "_processed_src"

    def __init__(self, src: Path, dest: Path, model: str, api_key: Optional[str],
                 dry_run: bool, include_fulltext: bool, dup_action: str):
        self.src = src
        self.dest = dest
        self.llm = make_llm(model, api_key)
        self.dry_run = dry_run
        self.include_fulltext = include_fulltext
        self.dup_action = dup_action  # 'archive' | 'delete' | 'none'
        # Created lazily so the lock is built inside the running event loop.
        self._write_lock: Optional[asyncio.Lock] = None
        # A dry run must not touch the filesystem, so only create dest for real runs.
        if not dry_run:
            ensure_dir(dest)

    def _lock(self) -> asyncio.Lock:
        """Return the lock that serializes lookups and writes in ``dest``."""
        lock = getattr(self, "_write_lock", None)
        if lock is None:
            lock = self._write_lock = asyncio.Lock()
        return lock

    @staticmethod
    def _unique_path(path: Path) -> Path:
        """Return *path*, or a '-N' suffixed sibling if *path* already exists."""
        if not path.exists():
            return path
        counter = 1
        while True:
            candidate = path.with_name(f"{path.stem}-{counter}{path.suffix}")
            if not candidate.exists():
                return candidate
            counter += 1

    @staticmethod
    def _render(fm: dict, body: str) -> str:
        """Serialize front-matter and body back into one Markdown document."""
        yml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True).strip()
        return f"---\n{yml}\n---\n\n{body}"

    def _find_by_merge_key(self, merge_key: str) -> List[Path]:
        """Return the live notes under ``dest`` whose merge_key equals *merge_key*."""
        hits = []
        for md in list_markdown_files(self.dest):
            try:
                rel_parts = md.relative_to(self.dest).parts
            except ValueError:
                rel_parts = md.parts
            # Archived duplicates keep their merge_key; skipping them stops
            # every later lookup from rediscovering and re-archiving them.
            if self._ARCHIVE_DIRNAME in rel_parts[:-1]:
                continue
            fm, _ = parse_front_matter(read_text(md))
            if isinstance(fm, dict) and fm.get("merge_key") == merge_key:
                hits.append(md)
        return hits

    def _extract_route(self, md_text: str) -> Tuple[str, str, str]:
        """Read ``(folder, slug, merge_key)`` from a note's front-matter.

        Missing values fall back to 'Misc', 'untitled' and the slug respectively.
        """
        fm, _ = parse_front_matter(md_text)
        if not isinstance(fm, dict):
            fm = {}
        # Folder and slug feed path building, which needs strings even when
        # the YAML value parsed as a number or date.
        folder = str(fm.get("folder") or "Misc")
        slug = str(fm.get("slug") or "untitled")
        merge_key = fm.get("merge_key") or slug
        return folder, slug, merge_key

    def _create_note(self, md_candidate: str, folder: str, slug: str, chk: str, txt_path: Path):
        """Write the candidate as a new note, recording its checksum and source path."""
        # No note shares this merge_key, so a file already at this path is a
        # different note; pick a free name instead of overwriting it.
        out_path = self._unique_path(path_for(self.dest, folder, slug))
        if self.dry_run:
            print(f"[create] {out_path} (dry-run)")
            return
        fm, body = parse_front_matter(md_candidate)
        if not isinstance(fm, dict):
            fm = {}
        fm["checksum"] = chk
        fm["source_path"] = str(txt_path.resolve())
        write_text(out_path, self._render(fm, body))
        print(f"[created] {out_path}")

    async def _merge_note(self, primary: Path, raw: str, chk: str, txt_path: Path):
        """Merge *raw* into the note at *primary* and rewrite it in place."""
        if self.dry_run:
            # The merged text is never used in a dry run, so skip the paid model call.
            print(f"[update] {primary} (dry-run)")
            return
        current_md = read_text(primary)
        merged_md = await merge_markdown(self.llm, current_md, raw, self.include_fulltext)

        # Persist the checksum of the latest source; updated_at is set by the model.
        fm, body = parse_front_matter(merged_md)
        if not isinstance(fm, dict):
            fm = {}
        fm["checksum_last_source"] = chk

        recorded = fm.get("source_paths")
        if recorded is None:
            paths = set()
        elif isinstance(recorded, str):
            # A bare string must stay one path, not be split into characters.
            paths = {recorded}
        elif isinstance(recorded, (list, tuple, set)):
            paths = {str(item) for item in recorded}
        else:
            paths = set()
        paths.add(str(txt_path.resolve()))
        fm["source_paths"] = sorted(paths)

        write_text(primary, self._render(fm, body))
        print(f"[updated] {primary}")

    def _handle_duplicates(self, existing: List[Path], primary: Path):
        """Archive or delete notes that share the merge_key with *primary*."""
        for mdp in existing:
            if mdp.resolve() == primary.resolve():
                continue
            if self.dup_action == "none":
                continue
            if self.dry_run:
                print(f"[dup-{self.dup_action}] {mdp} -> {primary} (dry-run)")
                continue
            if self.dup_action == "delete":
                mdp.unlink(missing_ok=True)
                print(f"[deleted-dup] {mdp}")
            else:
                # archive
                archive_dir = primary.parent / self._ARCHIVE_DIRNAME
                ensure_dir(archive_dir)
                # Never overwrite an earlier archived note with the same file name.
                dst = self._unique_path(archive_dir / mdp.name)
                shutil.move(str(mdp), str(dst))
                print(f"[archived-dup] {mdp} -> {dst}")

    def _move_source(self, txt_path: Path):
        """Move a processed source file into ``dest/_processed_src``."""
        processed_dir = self.dest / self._PROCESSED_DIRNAME
        if self.dry_run:
            print(f"[move-src] {txt_path} -> {processed_dir / txt_path.name} (dry-run)")
            return
        ensure_dir(processed_dir)
        # Sources are found recursively, so two inbox files can share a name;
        # keep both rather than letting the later move replace the earlier one.
        target = self._unique_path(processed_dir / txt_path.name)
        shutil.move(str(txt_path), str(target))

    async def _process_new_or_merge(self, txt_path: Path):
        """Process one source file: create a new note or merge into an existing one.

        Empty files are reported and left in place.
        """
        raw = read_text(txt_path).strip()
        if not raw:
            print(f"[skip empty] {txt_path}")
            return
        chk = sha256(raw)

        # First, create a candidate MD so we can read merge_key/folder/slug.
        # This call runs concurrently across files.
        md_candidate = await generate_markdown(self.llm, raw, self.include_fulltext)
        folder, slug, merge_key = self._extract_route(md_candidate)

        # Everything from the lookup to the final write is serialized.
        # Otherwise two sources with the same merge_key can both read the
        # note, await the model, and the later write discards the earlier merge.
        async with self._lock():
            existing = self._find_by_merge_key(merge_key)
            if not existing:
                self._create_note(md_candidate, folder, slug, chk, txt_path)
            else:
                # Choose primary: prefer one whose folder/slug match candidate route
                target_path = path_for(self.dest, folder, slug).resolve()
                primary = next(
                    (mdp for mdp in existing if mdp.resolve() == target_path),
                    existing[0],
                )
                await self._merge_note(primary, raw, chk, txt_path)
                self._handle_duplicates(existing, primary)

            self._move_source(txt_path)

    async def run(self):
        """Process every ``*.txt`` file under ``src``, at most four at a time.

        Every file is attempted even when another one fails. Failures are
        printed, and the first one (in file order) is re-raised afterwards.
        """
        txts = sorted(p for p in self.src.glob("**/*.txt") if p.is_file())
        if not txts:
            print(f"[no-input] No .txt files under {self.src}")
            return
        # Fresh lock per run so it belongs to the event loop that is running now.
        self._write_lock = asyncio.Lock()
        sem = asyncio.Semaphore(4)

        async def task(p: Path):
            async with sem:
                await self._process_new_or_merge(p)

        results = await asyncio.gather(*(task(p) for p in txts), return_exceptions=True)
        failures = [(p, res) for p, res in zip(txts, results) if isinstance(res, BaseException)]
        for p, err in failures:
            print(f"[error] {p}: {err!r}")
        if failures:
            raise failures[0][1]


In [None]:



# Build the pipeline from notebook-level settings.
# NOTE(review): src, dest, model, openai_api_key, dry_run, include_fulltext and
# dup_action are not defined in the cells visible here — presumably another
# cell sets them; confirm before running this cell.
pipeline = Pipeline(
    src=src,
    dest=dest,
    model=model,
    api_key=openai_api_key,
    dry_run=dry_run,
    include_fulltext=include_fulltext,
    dup_action=dup_action,
)