In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install sentence-transformers faiss-gpu-cu12 pandas numpy

In [None]:
import sys
print(sys.executable)

/blue/egn6933/nagabhairava.r/cinienv/bin/python


In [None]:
import torch
print(torch.__version__)

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("VRAM (GB):", torch.cuda.get_device_properties(0).total_memory / 1e9)

2.10.0+cu128
CUDA available: True
GPU: NVIDIA B200
VRAM (GB): 191.514411008


In [None]:
import os
import pandas as pd
import numpy as np

CATALOG_PATH = "/blue/egn6933/nagabhairava.r/tmdb_semantic_catalog_alllangs_with_new_movies.csv"
OUT_DIR = "/blue/egn6933/nagabhairava.r/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(CATALOG_PATH, low_memory=False)
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df = df.dropna(subset=["id"]).copy()
df["id"] = df["id"].astype(int)

df["movieDoc"] = df["movieDoc"].fillna("").astype(str)

mask = df["movieDoc"].str.contains("Plot:", na=False) & (df["movieDoc"].str.len() >= 80)
df_embed = df.loc[mask, ["id", "movieDoc"]].copy()

print("Total catalog:", len(df), "Embed rows:", len(df_embed))

Total catalog: 1367793 Embed rows: 1366255


In [None]:
print(df.columns.tolist())

['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'year', 'movieDoc']


In [None]:
import torch
from sentence_transformers import SentenceTransformer

MODEL_NAME = "Qwen/Qwen3-Embedding-4B"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Loading {MODEL_NAME} on {device}...")

model = SentenceTransformer(
    MODEL_NAME,
    device=device,
    trust_remote_code=True,
    model_kwargs={
        "dtype": torch.bfloat16
    }
)

emb_dim = model.get_sentence_embedding_dimension()

print("Device:", device)
print("Model:", MODEL_NAME)
print("Embedding dim:", emb_dim)

Loading Qwen/Qwen3-Embedding-4B on cuda...




Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

Device: cuda
Model: Qwen/Qwen3-Embedding-4B
Embedding dim: 2560


In [None]:
import os
import gc
import json
import torch
import numpy as np
import faiss
import time
from pathlib import Path

# Config
OUT_DIR = Path("/blue/egn6933/nagabhairava.r/outputs/tmdbfaissqwen")
OUT_DIR.mkdir(parents=True, exist_ok=True)

EMB_MEMMAP_PATH = OUT_DIR / "tmdb_embeddings.float32.mmap"
CHECKPOINT_PATH = OUT_DIR / "tmdb_checkpoint.json"
FAISS_PATH = OUT_DIR / "tmdb_bge_m3.faiss"

ENCODE_BATCH_SIZE = 512
SAVE_INTERVAL = 10000

# Prepare data
texts = df_embed["movieDoc"].tolist()
ids = df_embed["id"].to_numpy(dtype=np.int64)
n_rows = len(texts)
emb_dim = model.get_sentence_embedding_dimension()

print(f"Total rows to encode: {n_rows}")
print(f"Embedding dimension: {emb_dim}")
print(f"Output dir: {OUT_DIR}")

Total rows to encode: 1366255
Embedding dimension: 2560
Output dir: /blue/egn6933/nagabhairava.r/outputs/tmdbfaissqwen


In [None]:
# Memory Map & Checkpoint
if CHECKPOINT_PATH.exists():
    with open(CHECKPOINT_PATH, "r") as f:
        checkpoint = json.load(f)
    start_idx = checkpoint.get("next_idx", 0)
    print(f"Resuming from index {start_idx}...")
else:
    start_idx = 0
    if EMB_MEMMAP_PATH.exists():
        EMB_MEMMAP_PATH.unlink()

if not EMB_MEMMAP_PATH.exists():
    mm = np.memmap(EMB_MEMMAP_PATH, dtype="float32", mode="w+", shape=(n_rows, emb_dim))
    mm.flush()
    del mm

emb_memmap = np.memmap(EMB_MEMMAP_PATH, dtype="float32", mode="r+", shape=(n_rows, emb_dim))

# Chunked Encoding Loop
model.eval()

for i in range(start_idx, n_rows, ENCODE_BATCH_SIZE):
    end = min(i + ENCODE_BATCH_SIZE, n_rows)
    batch_texts = texts[i:end]

    # Encode
    with torch.no_grad():
        batch_emb = model.encode(
            batch_texts,
            batch_size=ENCODE_BATCH_SIZE,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True
        ).astype("float32")

    # Write to memmap
    emb_memmap[i:end] = batch_emb

    # Checkpoint
    if (i + ENCODE_BATCH_SIZE >= n_rows) or (end % SAVE_INTERVAL == 0):
        emb_memmap.flush()
        with open(CHECKPOINT_PATH, "w") as f:
            json.dump({"next_idx": end}, f)
        print(f"Saved checkpoint at row {end}/{n_rows}")

    del batch_emb
    # gc.collect()

print("Encoding complete.")
emb_memmap.flush()

Saved checkpoint at row 320000/1366255
Saved checkpoint at row 640000/1366255
Saved checkpoint at row 960000/1366255
Saved checkpoint at row 1280000/1366255
Saved checkpoint at row 1366255/1366255
Encoding complete.


In [None]:
import faiss
import numpy as np
import pandas as pd
from pathlib import Path

CSV_PATH   = "/content/drive/MyDrive/cinematch/outputs/tmdb_semantic_catalog_alllangs_with_new_movies.csv"
MMAP_PATH  = "/content/drive/MyDrive/cinematch/outputs/tmdb_embeddings.float32.mmap"
FAISS_PATH = "/content/drive/MyDrive/cinematch/outputs/tmdb_bge_m3.faiss"


df = pd.read_csv(CSV_PATH, low_memory=False)
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df = df.dropna(subset=["id"]).copy()
df["id"] = df["id"].astype(int)
df["movieDoc"] = df["movieDoc"].fillna("").astype(str)

mask = df["movieDoc"].str.contains("Plot:", na=False) & (df["movieDoc"].str.len() >= 80)
df_embed = df.loc[mask, ["id", "movieDoc"]].copy()

ids = df_embed["id"].to_numpy(dtype=np.int64)
n_rows = len(ids)
print(f"Filtered rows (matching embeddings): {n_rows}")


emb_dim = 2560  # Qwen3-Embedding-4B dimension
emb_memmap = np.memmap(MMAP_PATH, dtype="float32", mode="r", shape=(n_rows, emb_dim))
print(f"Memmap shape: {emb_memmap.shape}")

print("Building FAISS index...")
index = faiss.IndexIDMap2(faiss.IndexFlatIP(emb_dim))

CHUNK_SIZE = 50000
for i in range(0, n_rows, CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, n_rows)
    batch_vecs = np.array(emb_memmap[i:end])
    batch_ids = ids[i:end]
    index.add_with_ids(batch_vecs, batch_ids)
    print(f"Indexed rows {i:,} to {end:,}")

faiss.write_index(index, str(FAISS_PATH))
print(f"Saved: {index.ntotal:,} vectors to {FAISS_PATH}")

Filtered rows (matching embeddings): 1366255
Memmap shape: (1366255, 2560)
Building FAISS index...
Indexed rows 0 to 50,000
Indexed rows 50,000 to 100,000
Indexed rows 100,000 to 150,000
Indexed rows 150,000 to 200,000
Indexed rows 200,000 to 250,000
Indexed rows 250,000 to 300,000
Indexed rows 300,000 to 350,000
Indexed rows 350,000 to 400,000
Indexed rows 400,000 to 450,000
Indexed rows 450,000 to 500,000
Indexed rows 500,000 to 550,000
Indexed rows 550,000 to 600,000
Indexed rows 600,000 to 650,000
Indexed rows 650,000 to 700,000
Indexed rows 700,000 to 750,000
Indexed rows 750,000 to 800,000
Indexed rows 800,000 to 850,000
Indexed rows 850,000 to 900,000
Indexed rows 900,000 to 950,000
Indexed rows 950,000 to 1,000,000
Indexed rows 1,000,000 to 1,050,000
Indexed rows 1,050,000 to 1,100,000
Indexed rows 1,100,000 to 1,150,000
Indexed rows 1,150,000 to 1,200,000
Indexed rows 1,200,000 to 1,250,000
Indexed rows 1,250,000 to 1,300,000
Indexed rows 1,300,000 to 1,350,000
Indexed rows 1,

In [None]:
# Optional if needed:
# !pip -q install ipywidgets

import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

assert "model" in globals() and "index" in globals() and "df" in globals(), "Run previous cells first."

meta = df[["id", "title", "overview", "release_date"]].copy()
meta["id"] = pd.to_numeric(meta["id"], errors="coerce")
meta = meta.dropna(subset=["id"]).drop_duplicates("id")
meta["id"] = meta["id"].astype(np.int64)
meta = meta.set_index("id")

query = widgets.Text(description="Query:", placeholder="e.g. mind-bending sci-fi")
topk = widgets.IntSlider(description="TopK:", value=5, min=1, max=20)
btn = widgets.Button(description="Search", button_style="primary")
out = widgets.Output()

def run_search(_=None):
    with out:
        clear_output()
        q = query.value.strip()
        if not q:
            print("Enter a query.")
            return

        qvec = model.encode([q], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
        D, I = index.search(qvec, int(topk.value))

        rows = []
        for r, (mid, score) in enumerate(zip(I[0], D[0]), start=1):
            if mid == -1:
                continue
            m = meta.loc[mid] if mid in meta.index else None
            rows.append({
                "rank": r,
                "title": (m["title"] if m is not None else f"id={mid}"),
                "year": (str(m["release_date"])[:4] if m is not None and pd.notna(m["release_date"]) else ""),
                "score": float(score),
            })

        display(pd.DataFrame(rows))

btn.on_click(run_search)
query.on_submit(run_search)
display(widgets.VBox([query, topk, btn, out]))

In [None]:
# RERANKER


BAAI/bge-m3-unsupervised