In [None]:
! pip install -U sentence-transformers faiss-gpu-cu12 pandas numpy

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load catalog and filter to embed-worthy rows
import os
import pandas as pd
import numpy as np

CATALOG_PATH = "/content/drive/MyDrive/cinematch/tmdb_targetlangs_with_api_freshness.csv"
OUT_DIR = "/content/drive/MyDrive/cinematch/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(CATALOG_PATH, low_memory=False)
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df = df.dropna(subset=["id"]).copy()
df["id"] = df["id"].astype(int)

df["movieDoc"] = df["movieDoc"].fillna("").astype(str)

# Embed only meaningful docs
mask = df["movieDoc"].str.contains("Plot:", na=False) & (df["movieDoc"].str.len() >= 80)
df_embed = df.loc[mask, ["id", "movieDoc"]].copy()

print("Total catalog:", len(df), "Embed rows:", len(df_embed))

Total catalog: 845256 Embed rows: 845085


In [None]:
#Load LaBSE

import torch
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/LaBSE"  
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_NAME, device=device)

print("Device:", device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/LaBSE
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]



2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Device: cuda


In [None]:
# embedding
texts = df_embed["movieDoc"].tolist()
ids = df_embed["id"].to_numpy(dtype=np.int64)

BATCH_SIZE = 3072

emb = model.encode(
    texts,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

print("Embeddings:", emb.shape, emb.dtype)

Batches:   0%|          | 0/276 [00:00<?, ?it/s]

Embeddings: (845085, 768) float32


In [None]:
import faiss

d = emb.shape[1]

# Cosine via inner product on normalized vectors
index = faiss.IndexFlatIP(d)
index = faiss.IndexIDMap2(index)

index.add_with_ids(emb.astype("float32"), ids)
print("FAISS ntotal:", index.ntotal)

FAISS ntotal: 845085


In [None]:
FAISS_PATH = os.path.join(OUT_DIR, "tmdb_labse_targetlangs.faiss")
faiss.write_index(index, FAISS_PATH)
print("Saved FAISS index:", FAISS_PATH)

META_PATH = os.path.join(OUT_DIR, "tmdb_meta_targetlangs.csv")
df.loc[df["id"].isin(ids), ["id","title","original_language","release_date","popularity","vote_average","vote_count"]].to_csv(META_PATH, index=False)
print("Saved meta:", META_PATH)

Saved FAISS index: /content/drive/MyDrive/cinematch/outputs/tmdb_labse_targetlangs.faiss
Saved meta: /content/drive/MyDrive/cinematch/outputs/tmdb_meta_targetlangs.csv


In [None]:
index = faiss.read_index(FAISS_PATH)

query = "dark psychological revenge thriller with a big twist"
q = model.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")

k = 10
scores, neighbors = index.search(q, k)

print("Neighbors:", neighbors[0])
print("Scores:", scores[0])

Neighbors: [1146149  380407 1011681 1550752 1531940  338391  519337  440528 1625794
 1394489]
Scores: [0.51142704 0.47760358 0.47226372 0.47211823 0.46934763 0.46632293
 0.46550277 0.46443826 0.46111393 0.46069366]


In [2]:
import pandas as pd

meta = pd.read_csv("/content/drive/MyDrive/cinematch/outputs/tmdb_meta_targetlangs.csv")  # or your meta CSV
top_ids = [1146149, 380407, 1011681, 1550752, 1531940, 338391, 519337, 440528, 1625794, 1394489]

print(meta[meta["id"].isin(top_ids)][["id","title","original_language","release_date"]])

             id                            title original_language  \
147616   338391                   Shattered Mind                en   
169561   380407                 Grandpa's Psycho                en   
199695   440528                       Dual Mania                en   
238641   519337                 Tempus Tormentum                en   
479224  1011681         Tragedy of the Red House                en   
548411  1146149                        Psychosis                en   
697109  1394489                      Monkey Mask                en   
785146  1531940  Dark Cuts: A Thriller Anthology                en   
796485  1550752                        Nightmare                en   
840791  1625794                        Nightmare                en   

       release_date  
147616   1996-05-27  
169561   2015-06-26  
199695   2021-07-03  
238641   2018-04-17  
479224          NaN  
548411   2023-08-12  
697109   2024-07-31  
785146   2025-08-06  
796485   2025-12-31  
840791   

In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/cinematch/outputs/tmdb_meta_targetlangs.csv", low_memory=False)

need = ["Drishyam", "Andhadhun", "1: Nenokkadine", "Evaru", "Rakshasudu", "Gentleman", "Send Help"]
hits = df[df["title"].fillna("").str.contains("|".join(need), case=False, regex=True)][
    ["id","title","original_language","release_date","vote_count","popularity"]
].sort_values("vote_count", ascending=False)

hits.head(50)

Unnamed: 0,id,title,original_language,release_date,vote_count,popularity
1272,2623,An Officer and a Gentleman,en,1982-07-28,996,22.31
245997,534780,Andhadhun,hi,2018-10-05,429,13.058
155208,352173,Drishyam,hi,2015-07-30,349,11.832
3748,10411,The Distinguished Gentleman,en,1992-12-04,266,10.675
16273,33667,Gentleman's Agreement,en,1947-11-11,216,9.551
579082,1198994,Send Help,en,2026-01-22,214,83.0249
96777,244049,Drishyam,ml,2013-12-19,169,6.312
331712,706872,Drishyam 2,ml,2021-02-19,85,6.253
214459,469872,A Gentleman,hi,2017-08-25,70,4.297
98524,249772,1: Nenokkadine,te,2014-01-10,65,5.453
