In [1]:
import json
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

# Short code of the book being indexed; it is embedded in every file name below.
BOOK_CODE = "NOTW"
# Input: JSON Lines file holding the text chunks to embed.
CHUNKS_PATH = Path(f"../data/processed/{BOOK_CODE}_chunks.jsonl")

# Outputs: the FAISS vector index and a JSON Lines metadata file, both
# placed under OUT_DIR.
OUT_DIR = Path("../data/index")
FAISS_PATH = OUT_DIR.joinpath(f"{BOOK_CODE}.faiss")
META_PATH = OUT_DIR.joinpath(f"{BOOK_CODE}_meta.jsonl")

# Name of the embedding model handed to SentenceTransformer.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def read_chunks(path: Path) -> list:
    """Load every record from a JSON Lines file.

    Each non-blank line of *path* is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of being passed to the JSON parser, which would reject them.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines; json.loads("") would raise.
                continue
            rows.append(json.loads(line))
    return rows

def main():
    """Embed all chunks of the configured book and persist a FAISS index.

    Steps:
      1. Read the chunk records from ``CHUNKS_PATH``.
      2. Encode each record's ``"text"`` with the ``MODEL_NAME`` sentence
         transformer, L2-normalizing the vectors.
      3. Add the vectors to an inner-product flat index and write it to
         ``FAISS_PATH``.
      4. Write one metadata JSON line per chunk to ``META_PATH``, in the same
         order in which the vectors were added to the index.

    Raises:
        ValueError: If the chunks file contains no records, since there is
            nothing to embed and the embedding dimension cannot be derived.
        KeyError: If a record lacks ``text``, ``chunk_id``, ``book`` or
            ``chapter``.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    rows = read_chunks(CHUNKS_PATH)
    texts = [r["text"] for r in rows]
    print(f"Loaded {len(texts)} chunks from {CHUNKS_PATH}")

    if not texts:
        # Fail early with a clear message instead of an opaque shape/index
        # error after the model has already been loaded.
        raise ValueError(f"No chunks found in {CHUNKS_PATH}; nothing to index")

    model = SentenceTransformer(MODEL_NAME)
    embeddings = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,  # cosine similarity via inner product
    )

    dim = embeddings.shape[1]
    print(f"Embeddings shape: {embeddings.shape} (dim={dim})")

    # Inner product over normalized vectors, matching the encode() setting.
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    faiss.write_index(index, str(FAISS_PATH))
    print(f"Wrote FAISS index to {FAISS_PATH}")

    # Metadata lines follow the order of `rows`, i.e. the order in which the
    # corresponding vectors were added to the index.
    with META_PATH.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps({
                "chunk_id": r["chunk_id"],
                "book": r["book"],
                "chapter": r["chapter"],
                "text": r["text"],
            }, ensure_ascii=False) + "\n")

    print(f"Wrote metadata to {META_PATH}")

# Build the index only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Loaded 550 chunks from ../data/processed/NOTW_chunks.jsonl


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Embeddings shape: (550, 384) (dim=384)
Wrote FAISS index to ../data/index/NOTW.faiss
Wrote metadata to ../data/index/NOTW_meta.jsonl
