In [None]:
from pathlib import Path
import re
import yaml
import json
from collections import defaultdict
from markdown import markdown
from bs4 import BeautifulSoup

# === CONFIGURATION ===
VAULT_PATH = Path("C:/Users/RhysL/Desktop/Data-Archive/content/standardised")
OUTPUT_PATH = "vault_index.json"

# === HELPERS ===
def extract_frontmatter(md_text):
    """
    Extract YAML frontmatter and content from markdown text.
    """
    match = re.match(r'^---\n(.*?)\n---\n(.*)', md_text, re.DOTALL)
    if match:
        frontmatter = yaml.safe_load(match.group(1))
        content = match.group(2)
    else:
        frontmatter = {}
        content = md_text
    return frontmatter, content

def extract_links(content):
    """
    Extract links in the format [[link|display_name]] from markdown content.
    Returns only the link part before the pipe character.
    """
    return [match.split('|')[0] for match in re.findall(r'\[\[([^\]]+)\]\]', content)]

def markdown_to_text(md_content):
    """
    Convert markdown content to plain text by first converting it to HTML and then extracting the text.
    """
    html = markdown(md_content)
    soup = BeautifulSoup(html, features="html.parser")
    return soup.get_text()

def normalize_title(title):
    """
    Normalize title by converting to lowercase and replacing spaces with underscores.
    """
    try:
        return title.lower().replace(" ", "_")
    except AttributeError:
        print(f"Error normalizing title: {title}")
        return title

def summarise_text(text, word_limit=100):
    """
    Summarize text by limiting it to a specified word count.
    """
    words = text.strip().split()
    return " ".join(words[:word_limit]) + ("..." if len(words) > word_limit else "")

# === MAIN FUNCTION ===
def index_vault(vault_path):
    """
    Index all markdown files in the vault, extracting metadata, links, and generating summaries.
    This function now also restructures the vault_index.
    """
    vault_index = {}
    outlink_map = defaultdict(list)

    for note_path in vault_path.rglob("*.md"):
        with open(note_path, 'r', encoding='utf-8') as f:
            raw_md = f.read()
        
        # Extract frontmatter and content
        frontmatter, content = extract_frontmatter(raw_md)
        plain_text = markdown_to_text(content)

        # Get title and normalize it
        raw_title = frontmatter.get("title")
        title = raw_title if raw_title else note_path.stem
        # note_id = normalize_title(title)
        note_id = normalize_title(note_path.stem)  # Use filename (not title) for ID


        # Extract tags, aliases, and outlinks
        tags = frontmatter.get("tags", [])
        aliases = frontmatter.get("aliases", [])
        outlinks_raw = extract_links(content)
        outlinks = [normalize_title(link) for link in outlinks_raw]

        # Store note metadata in vault_index
        vault_index[note_id] = {
            "title": title,
            "tags": tags,
            "aliases": aliases,
            "outlinks": outlinks,
            "inlinks": [],  # Will be filled later
            "summary": summarise_text(plain_text, word_limit=25)
        }

        # Add the note's ID to the outlink_map for each link it references
        for link_id in outlinks:
            outlink_map[link_id].append(note_id)

    # Now update inlinks for each target note, ensuring no repeats
    for target_id, sources in outlink_map.items():
        if target_id in vault_index:
            # Remove duplicates by converting to a set, then back to a list
            vault_index[target_id]["inlinks"] = list(set(sources))

    # Restructure vault_index: remove duplicate outlinks and create final structure
    new_vault_index = {}
    for note_id, note_data in vault_index.items():
        # Create a new structure where the note_id is the key, excluding the redundant note_id key
        new_vault_index[note_id] = {
            "title": note_data["title"],
            "tags": note_data["tags"],
            "aliases": note_data["aliases"],
            "outlinks": list(set(note_data["outlinks"])),  # Remove duplicates from outlinks
            "inlinks": note_data["inlinks"],  # Inlinks are already handled for uniqueness
            "summary": note_data["summary"]
        }

    return new_vault_index

# === EXECUTION ===
vault_index = index_vault(VAULT_PATH)

# Write the final structured output to a JSON file
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(vault_index, f, indent=2)


Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error normalizing title: None
Error norm

In [None]:
# # read json file
# with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
#     vault_index = json.load(f)

## Investigation

In [57]:
list(vault_index.keys())[:5]  # Show a sample of indexed note IDs

['1-on-1_template',
 'ab_testing',
 'accessing_gen_ai_generated_content',
 'accuracy',
 'acid_transaction']

In [None]:
# # get detail for a specific note
id="views"
vault_index[id]
# vault_index[id]['tags']

{'title': 'Views',
 'tags': ['database'],
 'aliases': [],
 'outlinks': ['soft_deletion',
  'sqlite',
  'querying',
  'common_table_expression',
  'de_tools',
  'view_use_case',
  'database_schema'],
 'inlinks': ['common_table_expression'],
 'summary': 'Views are virtual tables defined by SQL [[Querying|Query]] that ==simplify complex data representation.== They can remove unnecessary columns, aggregate results, partition data, and secure sensitive...'}

In [59]:
# # get detail for a specific note
id="common_table_expression"
vault_index[id]

{'title': 'Common Table Expression',
 'tags': ['database', 'querying'],
 'aliases': ['CTE'],
 'outlinks': ['de_tools', 'views', 'recursive_algorithm', 'querying'],
 'inlinks': ['views'],
 'summary': 'A Common Table Expression (CTE) is a temporary named result set that you can reference within a SELECT, INSERT, UPDATE, or DELETE statement. The CTE...'}

In [None]:
# # count the number of entries
print(len(vault_index))
# # count the number of files in:
VAULT_PATH = Path("C:/Users/RhysL/Desktop/Data-Archive/content/standardised")
print(len(list(VAULT_PATH.rglob("*.md"))))

795