### ✅ Goal with `vault_index.json`

You want to use `vault_index.json` as a **structured representation of your Obsidian vault** to:

1. **Explore** your notes as a graph — nodes are notes, edges are links.
2. **Query** and extract relevant subgraphs based on vague prompts.
3. **Use an LLM** to:
   - Interpret vague user input
   - Identify relevant notes, tags, and links (subgraph)
   - Generate questions or summaries from the selected subgraph
4. **Feed selected notes into downstream prompts** for content generation, idea development, or question formulation.

Optionally, you want to experiment with this interactively in a **Jupyter notebook**, including visualisation, semantic search, and prompt composition.


### Ideas for Additional Experiments

| Task | Goal |
| --- | --- |
| Centrality ranking | Rank the notes that are most connected |
| Clustering | Cluster notes by content similarity or tags |
| Temporal views | Sort by last-modified date (if tracked) |
| Prompt tuning | Use note summaries to improve LLM prompts |

# Code

In [None]:
# ============================================
# LOAD AND INSPECT THE VAULT INDEX
# ============================================

import json
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt

# Read the exported index into memory; `vault` maps each note key to its metadata dict
with open("vault_index.json", mode="r", encoding="utf-8") as index_file:
    vault = json.load(index_file)


Loaded 795 notes from vault_index.json
1-on-1_template: ['title', 'path', 'tags', 'aliases', 'outlinks', 'inlinks', 'summary']
ab_testing: ['title', 'path', 'tags', 'aliases', 'outlinks', 'inlinks', 'summary']
accessing_gen_ai_generated_content: ['title', 'path', 'tags', 'aliases', 'outlinks', 'inlinks', 'summary']


In [None]:
print(f"Loaded {len(vault)} notes from vault_index.json")
# Preview the first three entries: each key with the field names stored for it
for position, (key, fields) in enumerate(vault.items()):
    if position >= 3:
        break
    print(f"{key}: {list(fields.keys())}")


In [12]:
# Show a small slice of vault keys (positions 100-104) to see how notes are named
for key in list(vault)[100:105]:
    print(key)

conceptual_data_model
conceptual_model
concurrency
confidence_interval
confusion_matrix


In [None]:
# Look up one entry by its vault key to inspect the stored metadata
title1 = "conceptual_data_model"
vault[title1]

{'title': 'conceptual data model',
 'path': 'C:\\Users\\RhysL\\Desktop\\Data-Archive\\content\\standardised\\conceptual data model.md',
 'tags': [],
 'aliases': [],
 'outlinks': [],
 'inlinks': ['database_schema'],
 'summary': ''}

In [15]:
# Inspect a second entry, this one with tags, outlinks and a summary
title2 = "confusion_matrix"
vault[title2]

{'title': None,
 'path': 'C:\\Users\\RhysL\\Desktop\\Data-Archive\\content\\standardised\\Confusion Matrix.md',
 'tags': ['evaluation'],
 'aliases': None,
 'outlinks': ['Classification',
  'Pasted image 20240120215414.png',
  'Accuracy',
  'Precision',
  'Recall',
  'F1 Score',
  'Specificity',
  'Recall',
  'Pasted image 20240116205937.png|500',
  'Pasted image 20240116210541.png|500'],
 'inlinks': ['accuracy', 'evaluation_metrics', 'logistic_regression'],
 'summary': 'Description A Confusion Matrix is a table used to evaluate the performance of a [[Classification]] model. It provides a detailed breakdown of the model\'s predictions across different classes, showing the number of true positives, true negatives, false positives, and false negatives. Purpose The confusion matrix helps identify where the classifier is making errors, indicating where it is "confused" in its predictions. Structure ![[Pasted image 20240120215414.png]] Structure True Positives (TP): Correctly predicted posit

In [None]:


# ============================================
# BASIC TAG AND LINK STATS
# ============================================

# Report tag frequencies, then identify notes with no links at all (potential orphans)


# Count tag frequencies.
# `or []` covers notes whose "tags" value is missing or null: the index stores
# None for some unset fields (e.g. "aliases"), and iterating None would raise
# TypeError.
tag_counts = Counter(
    tag
    for note in vault.values()
    for tag in (note.get("tags") or [])
)
print("\nTop 10 Tags:")
for tag, count in tag_counts.most_common(10):
    print(f"{tag}: {count}")

# Find orphan notes (no inlinks or outlinks).
# Vault keys are normalised ("confusion_matrix") while "outlinks" hold raw link
# text ("F1 Score"), so checking a key against other notes' outlinks rarely
# matches. The precomputed "inlinks" list already uses vault keys, and reading
# it avoids rescanning every note for each title.
# `not note.get(...)` treats a missing, null or empty list the same way.
orphans = [
    title
    for title, note in vault.items()
    if not note.get("outlinks") and not note.get("inlinks")
]
print(f"\nNumber of orphan notes: {len(orphans)}")


# ============================================
# CONSTRUCT A DIRECTED LINK GRAPH
# ============================================

def _link_key(link):
    """Convert raw wikilink text into the vault's key format.

    Drops any ``|alias`` or ``#heading`` suffix, then lower-cases the target
    and swaps spaces for underscores.

    NOTE(review): the key format is inferred from entries such as
    "Confusion Matrix.md" -> "confusion_matrix"; confirm it against the script
    that builds vault_index.json.
    """
    target = link.split("|", 1)[0].split("#", 1)[0].strip()
    return target.lower().replace(" ", "_")


G = nx.DiGraph()

# Add nodes and edges
for title, note in vault.items():
    # `or []` guards against fields stored as null in the index
    G.add_node(title, tags=note.get("tags") or [])
    for outlink in note.get("outlinks") or []:
        # Raw outlink text ("F1 Score") never equals a vault key ("f1_score"),
        # so it has to be normalised before the membership test.
        target = _link_key(outlink)
        if target in vault:  # skips images and links to notes outside the index
            G.add_edge(title, target)
    # "inlinks" already hold vault keys, so they recover edges whose raw
    # outlink text did not normalise to a known key. DiGraph ignores duplicates.
    for source in note.get("inlinks") or []:
        if source in vault:
            G.add_edge(source, title)

print(f"\nGraph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


# ============================================
# VISUALIZE THE FULL GRAPH (small vaults only)
# ============================================

def plot_graph(graph, figsize=(12, 8), label_nodes=False):
    """Draw ``graph`` with a seeded spring layout and display the figure.

    Args:
        graph: NetworkX graph to draw.
        figsize: Figure size handed to ``plt.figure``.
        label_nodes: When true, node labels are drawn on top of the nodes.
    """
    plt.figure(figsize=figsize)
    # Fixed seed so repeated calls place the nodes identically
    layout = nx.spring_layout(graph, k=0.5, seed=42)
    draw_options = {"node_size": 30, "edge_color": "gray", "alpha": 0.6}
    nx.draw(graph, layout, **draw_options)
    if label_nodes:
        nx.draw_networkx_labels(graph, layout, font_size=8)
    plt.title("Obsidian Vault Graph")
    plt.axis("off")
    plt.show()

# Warning: avoid for large graphs
# plot_graph(G, label_nodes=False)


# ============================================
# EXTRACT A SUBGRAPH BY TITLE + DEPTH
# ============================================

def get_subgraph(center_title, depth=1):
    """Return the neighbourhood of ``center_title`` in the global graph ``G``.

    Args:
        center_title: Node key to centre the neighbourhood on.
        depth: Radius, in hops, passed to ``nx.ego_graph``.

    Returns:
        The ego graph around ``center_title``, or ``None`` (after printing a
        message) when the node is not in ``G``.
    """
    if center_title not in G:
        print(f"'{center_title}' not found in graph.")
        return None
    # ego_graph has no `directed` keyword, so passing it raises TypeError. Its
    # default (undirected=False) already follows edge direction on a DiGraph.
    sub = nx.ego_graph(G, center_title, radius=depth)
    print(f"Subgraph has {len(sub)} nodes")
    return sub

# Example: a 1-hop neighborhood
# Graph nodes are vault keys (lower-case, underscores), not display titles, so
# "Bayesian Uncertainty" would never be found.
# NOTE(review): confirm this key exists in your index.
subG = get_subgraph("bayesian_uncertainty", depth=1)
# get_subgraph returns None for an unknown key; plotting None would raise
if subG is not None:
    plot_graph(subG, label_nodes=True)


# ============================================
# PROMPT PREPARATION FOR TOP-N NOTES
# ============================================

def build_prompt(note_titles, vault):
    """Concatenate a short text section per note for use in an LLM prompt.

    Each section is a ``# <key>`` heading, a ``Tags:`` line, and up to 500
    characters of note text, followed by a blank line.

    Args:
        note_titles: Iterable of vault keys to include, in output order.
        vault: Mapping of vault key to note metadata dict.

    Returns:
        The combined prompt text; an empty string when ``note_titles`` is
        empty. Keys missing from ``vault`` still produce a heading with empty
        tags and body.
    """
    sections = []
    for t in note_titles:
        # `or` fallbacks cover both missing keys and fields stored as null
        note = vault.get(t) or {}
        tags = note.get("tags") or []
        # Index entries carry "summary", not "content"; "content" is still
        # preferred when present so richer indexes keep working.
        body = note.get("content") or note.get("summary") or ""
        sections.append(f"# {t}\nTags: {', '.join(tags)}\n{body[:500]}\n\n")
    return "".join(sections)

# Example usage
# subG is None when the example key is not in the graph; use no titles then
# instead of failing on `None.nodes`.
example_titles = list(subG.nodes)[:3] if subG is not None else []
prompt_text = build_prompt(example_titles, vault)
print(prompt_text)


# ============================================
# OPTIONAL: RANK NOTES BY CENTRALITY
# ============================================

# Score every note by degree centrality, then keep the ten highest scores
centrality = nx.degree_centrality(G)
top_nodes = sorted(centrality.items(), key=lambda item: item[1], reverse=True)[:10]
print("\nTop 10 central notes:")
for note_key, value in top_nodes:
    print(f"{note_key}: {value:.3f}")

#-----------------------
4. Search Notes by Concept or Similarity #ml #NLP
With embeddings added:

Load embedding index (e.g. FAISS)

Perform semantic search from question prompt

Return top-K relevant notes from the vault

Example (Python):
# Assuming you’ve built a FAISS index keyed by title
# NOTE(review): `model`, `faiss_index`, `titles` and `np` are not defined in
# this snippet. Presumably they are an embedding model, the prebuilt index, the
# note titles in index order, and numpy; confirm before running.
query_vec = model.encode("Uncertainty quantification")
# Wrap the single query vector in a list so the search receives a batch of one
D, I = faiss_index.search(np.array([query_vec]), k=5)
# I[0] is indexed as the result row for the first (only) query; presumably it
# holds positions into `titles` — verify against how the index was built
top_titles = [titles[i] for i in I[0]]
