# Build FAISS Index (Colab)

This notebook builds a FAISS dense retrieval index from the assert-review dataset using CodeBERT embeddings.

**Steps:**
1. Install dependencies
2. Mount Google Drive and clone (or update) the repository
3. Build the FAISS index
4. Verify the index

In [None]:
# Install dependencies
!pip install faiss-cpu transformers torch datasets pydantic --quiet
print('Dependencies installed.')

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import sys

# Clone or pull the repo if not already present
REPO_DIR = '/content/assert-review'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/your-org/assert-review.git {REPO_DIR}
else:
    !git -C {REPO_DIR} pull

# Add repo root to path so ml package is importable
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

print(f'Repo at: {REPO_DIR}')

In [None]:
# Build FAISS index from dataset
import os

# Optional: set W&B credentials for experiment tracking.
# Prefer prompting for the key over pasting it into the notebook, so it is
# never saved in the file or its outputs:
# import getpass
# os.environ['WANDB_API_KEY'] = getpass.getpass('W&B API key: ')
# os.environ['WANDB_PROJECT'] = 'assert-review'

from ml.models.build_index import build_index

# Where the finished index is persisted (on Drive, so it survives the Colab session).
DRIVE_ROOT = '/content/drive/MyDrive'
INDEX_PATH = f'{DRIVE_ROOT}/assert-review/hunk_index.faiss'

# Refuse to continue if Drive is not mounted: creating the folder below would
# otherwise silently write the index to the ephemeral Colab disk.
if not os.path.isdir(DRIVE_ROOT):
    raise RuntimeError(
        f'Google Drive is not mounted at {DRIVE_ROOT}; run the mount cell first.'
    )

# The target folder may not exist on a fresh Drive; create it up front so the
# (potentially long) build does not fail at the final save step.
os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)

# Build with optional max_records limit for quick testing
# Set max_records=None to index everything
index = build_index(
    max_records=None,
    batch_size=64,
    index_path=INDEX_PATH,
)

print(f'Index built with {index.size} vectors.')

In [None]:
# Verify the index with a test query
import numpy as np
from ml.models.index import PRIndex
from ml.models.embedder import CodeEmbedder

# Kept local to this cell so it can be run on its own in a fresh session
# (e.g. to check an index built earlier).
INDEX_PATH = '/content/drive/MyDrive/assert-review/hunk_index.faiss'
EMBED_DIM = 768  # presumably the CodeBERT embedding width — confirm it matches the embedder
TOP_K = 5

# Load the saved index
loaded_index = PRIndex(dim=EMBED_DIM)
loaded_index.load(INDEX_PATH)
print(f'Loaded index size: {loaded_index.size} vectors')

# Actually verify something: an empty index means the build or the load failed.
assert loaded_index.size > 0, f'Loaded index at {INDEX_PATH} is empty'

# Embed a test query and search
embedder = CodeEmbedder()
query_text = 'def test_assert_raises(): with pytest.raises(ValueError): func()'
query_vec = embedder.embed_single(query_text)

results = loaded_index.search(query_vec, k=TOP_K)
print(f'\nTop-{TOP_K} results for query: {query_text[:60]}...')
print('-' * 60)
for rank, hit in enumerate(results, start=1):
    print(f'{rank}. score={hit.score:.4f}  file={hit.filename}')
    print(f'   preview: {hit.hunk_preview[:80]}')
    print()