# In [None]:
import os
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Configuration: filesystem locations used by the embedder.
# Input file produced by the chunker.
chunk_path = Path("chunks") / "all_chunks.json"
# Output directory that holds embedded_chunks.json.
output_path = Path("data")
# Output file the embeddings are written to.
embedded_file = output_path.joinpath("embedded_chunks.json")

def run_embedder():
    """
    Embed every text chunk and save the result as JSON.

    Reads the JSON file at ``chunk_path``, encodes each chunk's ``"text"``
    with the "all-MiniLM-L6-v2" SentenceTransformer model, stores the vector
    (as a plain list) under the chunk's ``"embedding"`` key, and writes all
    chunks to ``embedded_file``.

    A chunk whose embedding fails is still written, with ``"embedding"`` set
    to ``None``, so the output always has one entry per input chunk.

    Returns:
        None. If ``chunk_path`` does not exist, an error is printed and the
        function returns without writing anything.
    """
    if not chunk_path.exists():
        print(f"Error: Chunks file not found at {chunk_path}. Please run chunker.ipynb first.")
        return

    with open(chunk_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    print(f"Loaded {len(chunks)} chunks from {chunk_path}")

    # Create the output directory up front: without it the final open() fails
    # with FileNotFoundError only after all embeddings have been computed.
    output_path.mkdir(parents=True, exist_ok=True)

    # Initializing the SentenceTransformer model
    # "all-MiniLM-L6-v2" is a good balance of speed and performance.
    # Consider "all-mpnet-base-v2" for potentially better quality, but slower.
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    print("Embedding with SentenceTransformer...")

    embedded_chunks = []
    failed_count = 0
    for i, chunk in enumerate(chunks):
        try:
            # .tolist() makes the encoder's array output JSON serializable.
            embedding = embed_model.encode(chunk["text"]).tolist()
        except Exception as e:
            # Best-effort: one bad chunk must not abort the whole run.
            print(f"Error embedding chunk {chunk.get('chunk_id', i)}: {e}")
            embedding = None  # Indicate failure
            failed_count += 1
        else:
            # Progress is reported on successful chunks only.
            if (i + 1) % 100 == 0:
                print(f"  Processed {i + 1}/{len(chunks)} chunks...")

        chunk["embedding"] = embedding
        embedded_chunks.append(chunk)

    with open(embedded_file, "w", encoding="utf-8") as f:
        json.dump(embedded_chunks, f, indent=2)

    print(f" Saved embedded chunks to {embedded_file}")
    print(f"Total embedded chunks: {len(embedded_chunks)}")
    if failed_count:
        # The total above includes entries saved with a None embedding.
        print(f"Chunks that failed to embed: {failed_count}")