# IMDb 02 BGE-M3 Embeddings + FAISS

Objective:
- Encode `movieDoc` using `BAAI/bge-m3`.
- Build an exact cosine-similarity FAISS index (`IndexFlatIP` on L2-normalized vectors, wrapped in `IndexIDMap2` so hits are keyed by `row_id`).
- Save artifacts and update the build manifest.


In [1]:
# Mount Google Drive first: every input and artifact path configured below (OUT_DIR)
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install -U sentence-transformers faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.13.2


In [3]:
from __future__ import annotations

import json
import time
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

# Single Drive folder that holds both the inputs and every artifact produced by this notebook.
OUT_DIR = Path('/content/drive/MyDrive/cinematch/outputs/imdb')
# Inputs: the catalog supplies `row_id` + `movieDoc`; meta is joined on `row_id` for the sanity query.
CATALOG_PATH = OUT_DIR / 'imdb_movies_catalog.csv'
META_PATH = OUT_DIR / 'imdb_movies_meta.csv'
# Serialized FAISS index.
FAISS_PATH = OUT_DIR / 'imdb_movies_bge_m3_flatip.faiss'
# Raw float32 embedding matrix, memory-mapped with shape (n_rows, emb_dim).
EMB_MEMMAP_PATH = OUT_DIR / 'imdb_movies_bge_m3_embeddings.float32.mmap'
# JSON resume state for the encode loop (model id, dimension, next row index).
CHECKPOINT_PATH = OUT_DIR / 'imdb_movies_bge_m3_checkpoint.json'
# JSON build manifest; an existing file is loaded and its 'embedding' entry is replaced.
MANIFEST_PATH = OUT_DIR / 'imdb_movies_build_manifest.json'

# SentenceTransformer model identifier.
MODEL_ID = 'BAAI/bge-m3'
# Number of texts handed to each model.encode() call.
ENCODE_OUTER_BATCH = 32768
# batch_size passed to model.encode().
ENCODE_INNER_BATCH = 2048
# Number of rows passed to each index.add_with_ids() call.
INDEX_ADD_BATCH = 200_000

# Recovery + validation knobs
# True adds a reset reason, which deletes the memmap, checkpoint and FAISS index before encoding.
FORCE_REENCODE = False
# Default number of rows sampled by sample_norm_stats.
VALIDATION_SAMPLE_SIZE = 2000
# Sampled embeddings are rejected when their mean L2 norm falls below this value.
MIN_NORM_MEAN = 0.70
# Sampled embeddings are rejected when more than this fraction has a norm of exactly zero.
MAX_ZERO_NORM_FRAC = 0.05
# The checkpointed prefix is only validated on resume once at least this many rows were written.
VALIDATE_MIN_WRITTEN_ROWS = 4096

# Number of neighbours returned by the sanity query at the end of the notebook.
SANITY_TOP_K = 10

# Fail fast when the input CSVs are missing.
assert CATALOG_PATH.exists(), f'Missing catalog: {CATALOG_PATH}'
assert META_PATH.exists(), f'Missing meta: {META_PATH}'


## Load Catalog + Initialize Model


In [4]:
# Only the two catalog columns used for encoding are read; missing docs become empty strings.
catalog = (
    pd.read_csv(CATALOG_PATH, usecols=['row_id', 'movieDoc'], low_memory=False)
    .assign(movieDoc=lambda frame: frame['movieDoc'].fillna('').astype(str))
)
# Meta is loaded whole; it is joined on `row_id` by the sanity query later on.
meta = pd.read_csv(META_PATH, low_memory=False)

texts = catalog['movieDoc'].tolist()
row_ids = catalog['row_id'].to_numpy(dtype=np.int64)
n_rows = len(texts)

assert len(meta) == n_rows, 'Catalog and meta row count mismatch.'

# Prefer CUDA, then Apple MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'mps' if torch.backends.mps.is_available() else 'cpu'

model = SentenceTransformer(MODEL_ID, device=device)

# When the model does not report its dimension, measure it by encoding one probe sentence.
emb_dim = model.get_sentence_embedding_dimension()
if emb_dim is None:
    emb_dim = int(
        model.encode(
            ['embedding dimension probe'],
            convert_to_numpy=True,
            normalize_embeddings=True,
        ).shape[1]
    )

for label, value in (
    ('Rows:', n_rows),
    ('Device:', device),
    ('Model:', MODEL_ID),
    ('Embedding dimension:', emb_dim),
):
    print(label, value)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Rows: 737654
Device: cuda
Model: BAAI/bge-m3
Embedding dimension: 1024


## Encode With Checkpointed Resume


In [None]:
def sample_norm_stats(arr: np.memmap, row_count: int, sample_size: int = VALIDATION_SAMPLE_SIZE, seed: int = 7):
    """Summarize the L2 norms of a random sample of the first `row_count` rows of `arr`.

    Args:
        arr: 2-D array of embeddings (one row per vector); only rows `[0, row_count)` are sampled.
        row_count: number of leading rows that are eligible for sampling.
        sample_size: maximum number of rows to sample (without replacement).
        seed: seed for the sampling RNG, so repeated calls inspect the same rows.

    Returns:
        `(min_norm, mean_norm, max_norm, zero_frac)` as Python floats. `zero_frac` is the
        fraction of sampled rows whose norm is zero; rows with a non-finite norm (NaN/inf)
        are counted as zero. When nothing can be sampled, `(0.0, 0.0, 0.0, 1.0)` is
        returned so that callers' threshold checks treat the data as invalid.
    """
    take = min(int(sample_size), int(row_count))
    if row_count <= 0 or take <= 0:
        return 0.0, 0.0, 0.0, 1.0
    rng = np.random.default_rng(seed)
    idx = rng.choice(row_count, size=take, replace=False)
    sample = np.asarray(arr[idx], dtype='float32')
    norms = np.linalg.norm(sample, axis=1)
    # A NaN norm would turn the mean into NaN, and `NaN < threshold` is False, so a corrupt
    # sample would pass a "mean too low" check. Map non-finite norms to zero instead.
    norms = np.where(np.isfinite(norms), norms, 0.0)
    zero_frac = float((norms == 0).mean())
    return float(norms.min()), float(norms.mean()), float(norms.max()), zero_frac


def _drop_embedding_artifacts():
    """Delete the memmap, checkpoint and FAISS index so the three are always rebuilt together."""
    for artifact in (EMB_MEMMAP_PATH, CHECKPOINT_PATH, FAISS_PATH):
        artifact.unlink(missing_ok=True)


embed_started = time.time()
expected_bytes = n_rows * emb_dim * np.dtype('float32').itemsize

# A checkpoint is written after this many outer batches processed by the current run.
# Counting batches (rather than testing `end` for divisibility) keeps checkpoints coming
# even when ENCODE_OUTER_BATCH was changed since the checkpoint being resumed was written.
checkpoint_every_batches = 10

reset_reasons = []

# Load the resume state. An unreadable checkpoint (e.g. a write cut short by a disconnect)
# becomes a reset reason instead of crashing the cell.
checkpoint_state = {}
if CHECKPOINT_PATH.exists():
    try:
        checkpoint_state = json.loads(CHECKPOINT_PATH.read_text(encoding='utf-8'))
    except ValueError as exc:  # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
        reset_reasons.append(f'unreadable checkpoint: {exc}')
        checkpoint_state = {}
    else:
        if not isinstance(checkpoint_state, dict):
            reset_reasons.append('checkpoint is not a JSON object')
            checkpoint_state = {}

if FORCE_REENCODE:
    reset_reasons.append('FORCE_REENCODE=True')

if EMB_MEMMAP_PATH.exists():
    actual_bytes = EMB_MEMMAP_PATH.stat().st_size
    if actual_bytes != expected_bytes:
        reset_reasons.append(f'mmap size mismatch expected={expected_bytes} actual={actual_bytes}')
elif checkpoint_state:
    # Resuming would create a fresh all-zero memmap and skip the rows the checkpoint claims
    # are done, so a checkpoint without its memmap cannot be trusted.
    reset_reasons.append('checkpoint exists but embedding mmap is missing')

if checkpoint_state:
    ckpt_model = checkpoint_state.get('model_id')
    ckpt_dim = int(checkpoint_state.get('embedding_dim', -1))
    if ckpt_model and ckpt_model != MODEL_ID:
        reset_reasons.append(f'checkpoint model mismatch: {ckpt_model} != {MODEL_ID}')
    if ckpt_dim > 0 and ckpt_dim != emb_dim:
        reset_reasons.append(f'checkpoint dim mismatch: {ckpt_dim} != {emb_dim}')

need_reset = len(reset_reasons) > 0
if need_reset:
    print('Resetting embedding artifacts due to:')
    for reason in reset_reasons:
        print(' -', reason)
    _drop_embedding_artifacts()

# Resume position, clamped into [0, n_rows].
start_idx = 0
if not need_reset and checkpoint_state:
    start_idx = int(checkpoint_state.get('next_index', 0))
    start_idx = min(max(start_idx, 0), n_rows)

# 'r+' keeps previously written rows; 'w+' creates a new zero-filled file.
mmap_mode = 'r+' if EMB_MEMMAP_PATH.exists() else 'w+'
emb_memmap = np.memmap(EMB_MEMMAP_PATH, mode=mmap_mode, dtype='float32', shape=(n_rows, emb_dim))

# Spot-check the rows the checkpoint claims are done before trusting them.
if (start_idx >= VALIDATE_MIN_WRITTEN_ROWS) and (not need_reset):
    nmin, nmean, nmax, zfrac = sample_norm_stats(emb_memmap, start_idx)
    print(f'Checkpoint prefix norm stats @ {start_idx:,}: min={nmin:.4f}, mean={nmean:.4f}, max={nmax:.4f}, zero_frac={zfrac:.4f}')
    if (nmean < MIN_NORM_MEAN) or (zfrac > MAX_ZERO_NORM_FRAC):
        print('Detected corrupted/zero embeddings in checkpointed prefix. Restarting encode from scratch.')
        # Drop the reference to the old mapping before deleting and recreating its file.
        del emb_memmap
        _drop_embedding_artifacts()
        emb_memmap = np.memmap(EMB_MEMMAP_PATH, mode='w+', dtype='float32', shape=(n_rows, emb_dim))
        start_idx = 0

if start_idx >= n_rows:
    print('Checkpoint already at full length. Skipping encode loop.')

for batch_no, start in enumerate(range(start_idx, n_rows, ENCODE_OUTER_BATCH), start=1):
    end = min(start + ENCODE_OUTER_BATCH, n_rows)
    batch_texts = texts[start:end]

    vectors = model.encode(
        batch_texts,
        batch_size=ENCODE_INNER_BATCH,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype('float32')

    # Defensive re-normalization after the float32 cast; all-zero rows are left as zeros
    # instead of being divided by zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    vectors = vectors / norms

    emb_memmap[start:end] = vectors

    if (batch_no % checkpoint_every_batches == 0) or (end == n_rows):
        # Flush the vectors first so the checkpoint never points past data that is on disk.
        emb_memmap.flush()
        state = {
            'model_id': MODEL_ID,
            'embedding_dim': int(emb_dim),
            'next_index': int(end),
            'completed': bool(end == n_rows),
            'timestamp_utc': pd.Timestamp.now('UTC').isoformat(),
        }
        # Write to a sibling temp file and rename it over the checkpoint, so an interrupted
        # write cannot leave a half-written JSON file behind.
        checkpoint_tmp = CHECKPOINT_PATH.with_name(CHECKPOINT_PATH.name + '.tmp')
        checkpoint_tmp.write_text(json.dumps(state, indent=2), encoding='utf-8')
        checkpoint_tmp.replace(CHECKPOINT_PATH)
        print(f'Encoded {end:,}/{n_rows:,}')

emb_memmap.flush()
encode_seconds = time.time() - embed_started
print('Encoding complete in seconds:', round(encode_seconds, 2))


Encoded 327,680/737,654
Encoded 655,360/737,654
Encoded 737,654/737,654
Encoding complete in seconds: 3222.58


## Build FAISS Index + Update Manifest


In [None]:
# Validate embeddings before building FAISS to avoid all-zero/corrupt index builds.
nmin, nmean, nmax, zfrac = sample_norm_stats(emb_memmap, n_rows)
print(f'Embedding norm stats: min={nmin:.4f}, mean={nmean:.4f}, max={nmax:.4f}, zero_frac={zfrac:.4f}')
assert nmean >= MIN_NORM_MEAN, 'Embeddings look invalid: mean norm too low.'
assert zfrac <= MAX_ZERO_NORM_FRAC, 'Embeddings look invalid: too many zero vectors.'
# The ID map accepts duplicate ids without complaint, which would make search hits ambiguous.
assert np.unique(row_ids).size == n_rows, 'row_id values must be unique to be used as FAISS ids.'

faiss_started = time.time()

# Inner product over unit-norm vectors; the ID map makes search return row_id values.
index = faiss.IndexIDMap2(faiss.IndexFlatIP(emb_dim))

# Add in chunks so only INDEX_ADD_BATCH rows are copied out of the memmap at a time.
for start in range(0, n_rows, INDEX_ADD_BATCH):
    end = min(start + INDEX_ADD_BATCH, n_rows)
    vec_chunk = np.asarray(emb_memmap[start:end], dtype='float32')
    id_chunk = row_ids[start:end]
    index.add_with_ids(vec_chunk, id_chunk)

# Check the index before persisting it, so an incomplete index never replaces a good file.
assert int(index.ntotal) == n_rows, 'FAISS ntotal mismatch.'

faiss.write_index(index, str(FAISS_PATH))
faiss_seconds = time.time() - faiss_started

# Load the existing manifest (if any) and replace only its 'embedding' entry.
manifest = {}
if MANIFEST_PATH.exists():
    manifest = json.loads(MANIFEST_PATH.read_text(encoding='utf-8'))

manifest['embedding'] = {
    'timestamp_utc': pd.Timestamp.now('UTC').isoformat(),
    'model_id': MODEL_ID,
    'embedding_dim': int(emb_dim),
    'rows_encoded': int(n_rows),
    'device': device,
    'norm_check': {
        'sample_min': nmin,
        'sample_mean': nmean,
        'sample_max': nmax,
        'sample_zero_frac': zfrac,
    },
    'encoding': {
        'outer_batch': int(ENCODE_OUTER_BATCH),
        'inner_batch': int(ENCODE_INNER_BATCH),
        'duration_seconds': round(float(encode_seconds), 2),
        'memmap_path': str(EMB_MEMMAP_PATH),
        'checkpoint_path': str(CHECKPOINT_PATH),
    },
    'faiss': {
        'index_type': 'IndexIDMap2(IndexFlatIP)',
        'ntotal': int(index.ntotal),
        'dimension': int(index.d),
        'add_batch': int(INDEX_ADD_BATCH),
        'duration_seconds': round(float(faiss_seconds), 2),
        'faiss_path': str(FAISS_PATH),
    },
}

MANIFEST_PATH.write_text(json.dumps(manifest, indent=2), encoding='utf-8')

print('Saved FAISS index:', FAISS_PATH)
print('Updated manifest:', MANIFEST_PATH)
print('FAISS ntotal:', index.ntotal)

# Sanity query: encode one free-text query the same way as the catalog and look up the
# nearest rows, joined to meta for readable titles.
sanity_query = 'emotional family drama with redemption and sacrifice'
qvec = model.encode([sanity_query], convert_to_numpy=True, normalize_embeddings=True).astype('float32')

scores, ids = index.search(qvec, SANITY_TOP_K)
sanity_hits = pd.DataFrame({'row_id': ids[0], 'faiss_score': scores[0]})
sanity_hits = sanity_hits.merge(meta, on='row_id', how='left')
sanity_hits[['row_id', 'faiss_score', 'primaryTitle', 'startYear', 'lang_bucket']]


Embedding norm stats: min=1.0000, mean=1.0000, max=1.0000, zero_frac=0.0000
Saved FAISS index: /content/drive/MyDrive/cinematch/outputs/imdb/imdb_movies_bge_m3_flatip.faiss
Updated manifest: /content/drive/MyDrive/cinematch/outputs/imdb/imdb_movies_build_manifest.json
FAISS ntotal: 737654


Unnamed: 0,row_id,faiss_score,primaryTitle,startYear,lang_bucket
0,100965,0.631049,The Redemption,2000.0,en
1,301282,0.614824,Redemption,2011.0,en
2,315160,0.612327,Redemption for Easter,2021.0,en
3,497271,0.612005,Redemption,2013.0,ja
4,422293,0.611317,"Faith, Hope & Grace",,en
5,322071,0.607964,Cost of a Soul,2010.0,en
6,523751,0.607309,Redemption,,en
7,714851,0.606916,Redemption,2018.0,en
8,331192,0.606205,Reconciliation,2009.0,en
9,466831,0.602594,Family Therapy,2024.0,ja
