# IMDb 03 Query + Soft Multilingual Debias

Objective:
- Retrieve dense candidates from FAISS.
- Apply soft language-aware reranking to reduce English/Hollywood bias.
- Compare language mix before vs after rerank.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.13.2


In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Optional

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer


def first_existing_path(candidates: list[Path]) -> Optional[Path]:
    for cand in candidates:
        if cand.exists():
            return cand
    return None


IMDB_OUT_DIR_CANDIDATES = [
    Path('/content/drive/MyDrive/cinematch/outputs/imdb'),
    Path('/Users/roop/Library/CloudStorage/OneDrive-UniversityofFlorida/Courses/Spring 26 EGN6933/Code/CineMatch/Data/outputs/imdb'),
    Path('Data/outputs/imdb'),
]

TMDB_OUT_DIR_CANDIDATES = [
    Path('/content/drive/MyDrive/cinematch/outputs'),
    Path('/Users/roop/Library/CloudStorage/OneDrive-UniversityofFlorida/Courses/Spring 26 EGN6933/Code/CineMatch/Data/outputs'),
    Path('Data/outputs'),
]

IMDB_OUT_DIR = first_existing_path(IMDB_OUT_DIR_CANDIDATES)
TMDB_OUT_DIR = first_existing_path(TMDB_OUT_DIR_CANDIDATES)

if IMDB_OUT_DIR is None:
    raise FileNotFoundError(
        'Could not locate IMDb output directory. Tried: ' + ', '.join(str(p) for p in IMDB_OUT_DIR_CANDIDATES)
    )

IMDB_FAISS_PATH = IMDB_OUT_DIR / 'imdb_movies_bge_m3_flatip.faiss'
IMDB_META_PATH = IMDB_OUT_DIR / 'imdb_movies_meta.csv'

TMDB_FAISS_PATH = (TMDB_OUT_DIR / 'tmdb_labse_targetlangs.faiss') if TMDB_OUT_DIR is not None else None
TMDB_META_PATH = (TMDB_OUT_DIR / 'tmdb_meta_targetlangs.csv') if TMDB_OUT_DIR is not None else None

IMDB_MODEL_ID = 'BAAI/bge-m3'
TMDB_MODEL_ID = 'sentence-transformers/LaBSE'

assert IMDB_FAISS_PATH.exists(), f'Missing IMDb FAISS index: {IMDB_FAISS_PATH}'
assert IMDB_META_PATH.exists(), f'Missing IMDb metadata: {IMDB_META_PATH}'

imdb_index = faiss.read_index(str(IMDB_FAISS_PATH))
imdb_meta = pd.read_csv(IMDB_META_PATH, low_memory=False).set_index('row_id')


if 'origin_lang_bucket' in imdb_meta.columns:
    LANG_BUCKET_COL = 'origin_lang_bucket'
elif 'lang_bucket' in imdb_meta.columns:
    LANG_BUCKET_COL = 'lang_bucket'
else:
    LANG_BUCKET_COL = 'lang_bucket'
    imdb_meta[LANG_BUCKET_COL] = 'unknown'

if 'tmdb_original_language' not in imdb_meta.columns:
    imdb_meta['tmdb_original_language'] = ''
    print('[WARNING] tmdb_original_language column not found in metadata.')
    print('          Re-run Notebook 5 (IMDB_01_Prep) to generate it.')
    print('          Falling back to origin_lang_bucket only.')

tmdb_available = bool(
    TMDB_FAISS_PATH is not None
    and TMDB_META_PATH is not None
    and TMDB_FAISS_PATH.exists()
    and TMDB_META_PATH.exists()
)

if tmdb_available:
    tmdb_index = faiss.read_index(str(TMDB_FAISS_PATH))
    tmdb_meta = pd.read_csv(TMDB_META_PATH, low_memory=False)
    tmdb_meta['id'] = pd.to_numeric(tmdb_meta['id'], errors='coerce').astype('Int64')
    tmdb_meta = tmdb_meta.dropna(subset=['id']).copy()
    tmdb_meta['id'] = tmdb_meta['id'].astype('int64')
    tmdb_meta = tmdb_meta.drop_duplicates(subset=['id']).set_index('id')
else:
    tmdb_index = None
    tmdb_meta = pd.DataFrame()

if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

imdb_model = SentenceTransformer(IMDB_MODEL_ID, device=device)
tmdb_model = SentenceTransformer(TMDB_MODEL_ID, device=device) if tmdb_available else None

print('IMDb out dir:', IMDB_OUT_DIR)
print('Loaded IMDb index ntotal:', imdb_index.ntotal)
print('Loaded IMDb meta rows:', len(imdb_meta))
print('IMDb language bucket column:', LANG_BUCKET_COL)

bucket_counts = imdb_meta[LANG_BUCKET_COL].value_counts()
print(f'\n--- {LANG_BUCKET_COL} distribution ---')
for bucket, count in bucket_counts.items():
    print(f'  {bucket}: {count}')

if 'tmdb_original_language' in imdb_meta.columns:
    tmdb_lang_counts = imdb_meta['tmdb_original_language'].value_counts()
    target_langs_in_tmdb = {lang: tmdb_lang_counts.get(lang, 0) for lang in ['te', 'ta', 'hi', 'ja', 'ko', 'en']}
    print(f'\n--- tmdb_original_language for target languages ---')
    for lang, count in target_langs_in_tmdb.items():
        print(f'  {lang}: {count}')

print('\nTMDB available:', tmdb_available)
if tmdb_available:
    print('TMDB out dir:', TMDB_OUT_DIR)
    print('Loaded TMDB index ntotal:', tmdb_index.ntotal)
    print('Loaded TMDB meta rows:', len(tmdb_meta))
print('Device:', device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]



Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/LaBSE
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

IMDb out dir: /content/drive/MyDrive/cinematch/outputs/imdb
Loaded IMDb index ntotal: 737654
Loaded IMDb meta rows: 737654
IMDb language bucket column: origin_lang_bucket

--- origin_lang_bucket distribution ---
  en: 405045
  other_non_en: 210600
  ja: 76366
  unknown: 24527
  ko: 8825
  hi: 6495
  ta: 3386
  te: 2410

--- tmdb_original_language for target languages ---
  te: 2410
  ta: 3386
  hi: 6495
  ja: 16737
  ko: 3991
  en: 160190

TMDB available: True
TMDB out dir: /content/drive/MyDrive/cinematch/outputs
Loaded TMDB index ntotal: 845085
Loaded TMDB meta rows: 845085
Device: cuda


In [None]:
N_CANDIDATES       = 5000
TMDB_N_CANDIDATES  = 2000
TOP_K              = 30
RELEVANCE_GATE     = 0.20

NON_ENGLISH_GLOBAL_BOOST = 0.02

LANG_BOOSTS = {
    'te': 0.03,
    'ta': 0.02,
    'hi': 0.02,
    'ja': 0.03,
    'ko': 0.03,
    'other_non_en': 0.00,
    'en': 0.00,
    'unknown': 0.00,
}

MIN_VOTES    = 500
MIN_RATING   = 5.0

MIN_VOTES_NON_EN  = 200
MIN_RATING_NON_EN = 4.0

QUALITY_WEIGHT = 0.20

TARGET_BUCKETS = ['te', 'hi', 'ta', 'ja', 'ko', 'other_non_en', 'en', 'unknown']

TMDB_LANG_TO_BUCKET = {
    'te': 'te', 'ta': 'ta', 'hi': 'hi',
    'ja': 'ja', 'ko': 'ko', 'en': 'en',
}

BENCHMARK_QUERIES = [
    'dark psychological revenge thriller with a major twist',
    'heartwarming family drama about sacrifice and hope',
    'stylized action film with revenge and moral conflict',
    'romantic story with social barriers and emotional ending',
]



def encode_query_imdb(query: str) -> np.ndarray:
    return imdb_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype('float32')


def encode_query_tmdb(query: str) -> np.ndarray:
    if tmdb_model is None:
        raise RuntimeError('TMDB model not available.')
    return tmdb_model.encode(
        [query], convert_to_numpy=True, normalize_embeddings=True
    ).astype('float32')


def resolve_lang_bucket(row: pd.Series) -> str:

    tmdb_lang = str(row.get('tmdb_original_language', '')).strip()
    if tmdb_lang:
        if tmdb_lang in ('te', 'ta', 'hi', 'ja', 'ko'):
            return tmdb_lang
        elif tmdb_lang == 'en':
            return 'en'
        else:
            return 'other_non_en'

    bucket = row.get(LANG_BUCKET_COL, 'unknown')
    return bucket if bucket else 'unknown'


def compute_quality_score(df: pd.DataFrame) -> pd.Series:
    """
    Bayesian-style quality score from numVotes and averageRating.
    Normalises both to [0, 1] then combines them.
    """
    votes_col  = next((c for c in ['numVotes', 'vote_count']  if c in df.columns), None)
    rating_col = next((c for c in ['averageRating', 'vote_average'] if c in df.columns), None)

    if votes_col is None or rating_col is None:
        return pd.Series(0.0, index=df.index)

    votes  = pd.to_numeric(df[votes_col],  errors='coerce').fillna(0).clip(lower=0)
    rating = pd.to_numeric(df[rating_col], errors='coerce').fillna(0).clip(0, 10)

    log_votes = np.log1p(votes)
    norm_votes  = log_votes  / (log_votes.max()  + 1e-9)
    norm_rating = rating / 10.0

    return (0.5 * norm_rating + 0.5 * norm_votes).astype('float64')


def apply_quality_filter(
    df: pd.DataFrame,
    min_votes: int = MIN_VOTES,
    min_rating: float = MIN_RATING,
) -> pd.DataFrame:
    """Remove films that fail minimum vote/rating thresholds."""
    votes_col  = next((c for c in ['numVotes', 'vote_count']  if c in df.columns), None)
    rating_col = next((c for c in ['averageRating', 'vote_average'] if c in df.columns), None)

    mask = pd.Series(True, index=df.index)
    if votes_col:
        mask &= pd.to_numeric(df[votes_col], errors='coerce').fillna(0) >= min_votes
    if rating_col:
        mask &= pd.to_numeric(df[rating_col], errors='coerce').fillna(0) >= min_rating

    filtered = df[mask]
    if len(filtered) == 0:
        print(f'[quality_filter] No results passed thresholds (votes>={min_votes}, rating>={min_rating}) — returning unfiltered top-50.')
        return df.head(50)
    return filtered


def base_dense_search_imdb(query: str, n_candidates: int = N_CANDIDATES) -> pd.DataFrame:
    qv = encode_query_imdb(query)
    scores, row_ids = imdb_index.search(qv, int(n_candidates))

    hits = pd.DataFrame({'row_id': row_ids[0], 'faiss_score': scores[0]})
    hits = hits[hits['row_id'] >= 0].copy()
    hits = hits.join(imdb_meta, on='row_id', how='left')

    hits['lang_bucket'] = hits.apply(resolve_lang_bucket, axis=1)
    return hits


def inject_target_lang_candidates(
    query: str,
    hits: pd.DataFrame,
    lang_bucket: str = 'te',
    n_inject: int = 200,
) -> pd.DataFrame:

    qv = encode_query_imdb(query)
    extra_n = min(imdb_index.ntotal, N_CANDIDATES * 10)
    scores, row_ids = imdb_index.search(qv, int(extra_n))

    extra = pd.DataFrame({'row_id': row_ids[0], 'faiss_score': scores[0]})
    extra = extra[extra['row_id'] >= 0].copy()
    extra = extra.join(imdb_meta, on='row_id', how='left')

    extra['lang_bucket'] = extra.apply(resolve_lang_bucket, axis=1)

    lang_hits = extra[extra['lang_bucket'] == lang_bucket].head(n_inject)

    n_found = len(lang_hits)
    if n_found == 0:
        print(f'[inject] WARNING: Found 0 candidates for lang_bucket={lang_bucket!r} '
              f'even after searching {extra_n} entries.')
        if 'lang_bucket' in extra.columns:
            bucket_counts = extra['lang_bucket'].value_counts().head(10)
            print(f'[inject] Top buckets in search pool:\n{bucket_counts}')
    else:
        print(f'[inject] Injected {n_found} {lang_bucket!r} candidates into pool.')

    combined = pd.concat([hits, lang_hits], ignore_index=True)
    combined = combined.drop_duplicates(subset='row_id')
    return combined


def apply_soft_debias(
    hits: pd.DataFrame,
    relevance_gate: float = RELEVANCE_GATE,
    lang_boosts: dict = LANG_BOOSTS,
    non_english_global_boost: float = NON_ENGLISH_GLOBAL_BOOST,
    lang_col: str = 'lang_bucket',
) -> pd.DataFrame:
    reranked = hits.copy()
    reranked['lang_boost'] = reranked[lang_col].map(lang_boosts).fillna(0.0).astype('float64')
    reranked['non_english_boost'] = np.where(
        reranked[lang_col].isin(['en', 'unknown']), 0.0, float(non_english_global_boost)
    ).astype('float64')

    reranked['faiss_score'] = reranked['faiss_score'].astype('float64')

    reranked['quality_score'] = compute_quality_score(reranked)
    reranked['final_score'] = (
        (1.0 - QUALITY_WEIGHT) * reranked['faiss_score']
        + QUALITY_WEIGHT * reranked['quality_score']
    )

    gate_mask = reranked['faiss_score'] >= float(relevance_gate)
    reranked.loc[gate_mask, 'final_score'] = (
        reranked.loc[gate_mask, 'final_score']
        + reranked.loc[gate_mask, 'lang_boost']
        + reranked.loc[gate_mask, 'non_english_boost']
    ).astype('float64')

    reranked = reranked.sort_values(['final_score', 'faiss_score'], ascending=False)
    return reranked


def search_imdb_movies(
    query: str,
    top_k: int = TOP_K,
    n_candidates: int = N_CANDIDATES,
    rerank: bool = True,
    inject_langs: list[str] | None = ['te'],
) -> pd.DataFrame:
    hits = base_dense_search_imdb(query, n_candidates=n_candidates)

    if inject_langs:
        for lang in inject_langs:
            hits = inject_target_lang_candidates(query, hits, lang_bucket=lang)

    is_non_en = hits['lang_bucket'].isin(['te', 'ta', 'hi', 'ja', 'ko', 'other_non_en'])
    en_pool = hits[~is_non_en]
    non_en_pool = hits[is_non_en]

    en_filtered = apply_quality_filter(en_pool, min_votes=MIN_VOTES, min_rating=MIN_RATING)
    non_en_filtered = apply_quality_filter(non_en_pool, min_votes=MIN_VOTES_NON_EN, min_rating=MIN_RATING_NON_EN)

    hits = pd.concat([en_filtered, non_en_filtered], ignore_index=True)

    if rerank:
        hits = apply_soft_debias(hits)
    else:
        hits['lang_boost']        = 0.0
        hits['non_english_boost'] = 0.0
        hits['quality_score']     = compute_quality_score(hits)
        hits['final_score']       = hits['faiss_score']

    cols = [
        'row_id', 'tconst', 'primaryTitle', 'startYear', 'genres',
        'lang_bucket', 'faiss_score', 'quality_score',
        'lang_boost', 'non_english_boost', 'final_score',
    ]
    available = [c for c in cols if c in hits.columns]
    return hits.head(int(top_k))[available]


def search_tmdb_movies(
    query: str,
    top_k: int = TOP_K,
    n_candidates: int = TMDB_N_CANDIDATES,
    rerank: bool = True,
) -> pd.DataFrame:
    if not tmdb_available:
        return pd.DataFrame(
            columns=['id', 'title', 'original_language', 'release_date',
                     'vote_average', 'vote_count', 'popularity', 'tmdb_score']
        )

    qv = encode_query_tmdb(query)
    scores, ids = tmdb_index.search(qv, int(n_candidates))

    hits = pd.DataFrame({'id': ids[0], 'tmdb_score': scores[0]})
    hits = hits[hits['id'] >= 0].copy()
    hits = hits.join(tmdb_meta, on='id', how='left')
    hits['tmdb_score'] = hits['tmdb_score'].astype('float64')

    hits['lang_bucket'] = hits['original_language'].map(TMDB_LANG_TO_BUCKET).fillna('other_non_en')
    hits.loc[hits['original_language'] == 'en', 'lang_bucket'] = 'en'

    # Quality filter with relaxed thresholds for non-English
    is_non_en = hits['lang_bucket'].isin(['te', 'ta', 'hi', 'ja', 'ko', 'other_non_en'])
    en_pool = hits[~is_non_en]
    non_en_pool = hits[is_non_en]
    en_filtered = apply_quality_filter(en_pool, min_votes=MIN_VOTES, min_rating=MIN_RATING)
    non_en_filtered = apply_quality_filter(non_en_pool, min_votes=MIN_VOTES_NON_EN, min_rating=MIN_RATING_NON_EN)
    hits = pd.concat([en_filtered, non_en_filtered], ignore_index=True)

    if rerank:
        hits = hits.rename(columns={'tmdb_score': 'faiss_score'})
        hits['quality_score'] = compute_quality_score(hits)
        hits = apply_soft_debias(hits, lang_col='lang_bucket')
        hits = hits.rename(columns={
            'faiss_score': 'tmdb_score',
            'final_score': 'tmdb_final_score'
        })
        hits = hits.sort_values('tmdb_final_score', ascending=False)
    else:
        hits['quality_score'] = compute_quality_score(hits)
        hits = hits.sort_values('tmdb_score', ascending=False)

    cols = ['id', 'title', 'original_language', 'lang_bucket', 'release_date',
            'vote_average', 'vote_count', 'popularity', 'tmdb_score', 'quality_score']
    if rerank:
        cols += ['lang_boost', 'non_english_boost', 'tmdb_final_score']
    available = [c for c in cols if c in hits.columns]
    return hits.head(int(top_k))[available]


def recommend_from_both(
    query: str,
    imdb_top_k: int = 10,
    tmdb_top_k: int = 10,
    rerank_imdb: bool = True,
    rerank_tmdb: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    imdb_hits = search_imdb_movies(query, top_k=imdb_top_k, rerank=rerank_imdb)
    tmdb_hits = search_tmdb_movies(query, top_k=tmdb_top_k, rerank=rerank_tmdb)
    return imdb_hits, tmdb_hits


def search_movies(query, top_k=TOP_K, n_candidates=N_CANDIDATES, rerank=True):
    return search_imdb_movies(query=query, top_k=top_k, n_candidates=n_candidates, rerank=rerank)


def language_mix(df: pd.DataFrame, top_n: int = 20) -> dict:
    top = df.head(top_n)
    if 'lang_bucket' in top.columns:
        lang_series = top['lang_bucket'].fillna('unknown')
    elif 'original_language' in top.columns:
        lang_series = top['original_language'].map(TMDB_LANG_TO_BUCKET).fillna('other_non_en')
        lang_series = lang_series.where(top['original_language'] != 'en', 'en')
    else:
        lang_series = pd.Series(['unknown'] * len(top))

    counts = lang_series.value_counts().to_dict()
    return {bucket: int(counts.get(bucket, 0)) for bucket in TARGET_BUCKETS}


## Example Search


In [None]:
example_query = 'Suggest stylish action thrillers featuring undercover agents infiltrating crime syndicates, layered plot twists, and a central object driving the conflict'
search_movies(example_query, top_k=10, rerank=True)


[inject] Injected 200 'te' candidates into pool.


Unnamed: 0,row_id,tconst,primaryTitle,startYear,genres,lang_bucket,faiss_score,quality_score,lang_boost,non_english_boost,final_score
3,306377,tt1375666,Inception,2010.0,"Action,Adventure,Sci-Fi",en,0.558754,0.94,0.0,0.0,0.635004
680,445243,tt2625030,New World,2013.0,"Action,Crime,Drama",ko,0.546177,0.720721,0.03,0.02,0.631086
2,194547,tt0407887,The Departed,2006.0,"Crime,Drama,Thriller",en,0.558999,0.905031,0.0,0.0,0.628205
696,629620,tt4987556,Thani Oruvan,2015.0,"Action,Crime,Drama",ta,0.534065,0.759821,0.02,0.02,0.619217
747,186151,tt0376127,Anniyan,2005.0,"Action,Crime,Drama",ta,0.516836,0.762576,0.02,0.02,0.605984
860,320116,tt1436045,13 Assassins,2010.0,"Action,Adventure,Drama",ja,0.505171,0.751539,0.03,0.02,0.604445
699,576744,tt3779028,Inside Men,2015.0,"Action,Crime,Drama",ko,0.532973,0.634056,0.03,0.02,0.603189
726,470154,tt28362963,Checkmate,2023.0,"Action,Comedy",hi,0.520117,0.71586,0.02,0.02,0.599266
677,288259,tt12929738,London Confidential,2020.0,"Crime,Drama,Thriller",hi,0.553,0.575395,0.02,0.02,0.597479
695,677725,tt6836936,Saaho,2019.0,"Action,Thriller",te,0.534346,0.598219,0.03,0.02,0.59712


## IMDb + TMDB Side-by-Side Recommendations


In [28]:
from IPython.display import Markdown, display

comparison_query = 'super hero  '
imdb_hits, tmdb_hits = recommend_from_both(comparison_query, imdb_top_k=10, tmdb_top_k=10, rerank_imdb=True)

display(Markdown('### IMDb (BGE-M3 + debias rerank)'))
display(imdb_hits)

display(Markdown('### TMDB (LaBSE baseline)'))
if tmdb_available:
    display(tmdb_hits)
else:
    print('TMDB files not found. Expected:', TMDB_FAISS_PATH, 'and', TMDB_META_PATH)


[inject] Injected 200 'te' candidates into pool.


### IMDb (BGE-M3 + debias rerank)

Unnamed: 0,row_id,tconst,primaryTitle,startYear,genres,lang_bucket,faiss_score,quality_score,lang_boost,non_english_boost,final_score
851,660273,tt6129302,Bhavesh Joshi Superhero,2018.0,"Action,Drama",hi,0.557484,0.692058,0.02,0.02,0.624399
856,325398,tt14614892,Dragon Ball Super: Super Hero,2022.0,"Action,Adventure,Animation",ja,0.545774,0.683276,0.03,0.02,0.623275
843,706475,tt8202076,Super Hero,2018.0,"Action,Adventure,Thriller",other_non_en,0.606827,0.57784,0.0,0.02,0.621029
854,346094,tt15600222,An Action Hero,2022.0,"Action,Comedy,Crime",hi,0.547444,0.698351,0.02,0.02,0.617625
845,242113,tt10709232,Hero,2019.0,"Action,Drama,Thriller",ta,0.572147,0.581269,0.02,0.02,0.613972
3,655605,tt5950044,Superman,2025.0,"Action,Adventure,Sci-Fi",en,0.570028,0.785157,0.0,0.0,0.613054
12,419078,tt2245084,Big Hero 6,2014.0,"Action,Adventure,Animation",en,0.553487,0.831715,0.0,0.0,0.609133
8,56563,tt0078346,Superman,1978.0,"Action,Adventure,Sci-Fi",en,0.561073,0.77908,0.0,0.0,0.604674
31,219200,tt0848228,The Avengers,2012.0,"Action,Sci-Fi",en,0.535088,0.876399,0.0,0.0,0.60335
862,242668,tt10737918,Shin Ultraman,2022.0,"Action,Adventure,Drama",ja,0.541948,0.598537,0.03,0.02,0.603266


### TMDB (LaBSE baseline)

Unnamed: 0,id,title,original_language,lang_bucket,release_date,vote_average,vote_count,popularity,tmdb_score,quality_score,lang_boost,non_english_boost,tmdb_final_score
54,610150,Dragon Ball Super: Super Hero,ja,ja,2022-06-11,7.917,2552,85.407,0.372151,0.790923,0.03,0.02,0.505906
3,177572,Big Hero 6,en,en,2014-10-24,7.738,14732,98.41,0.403943,0.870245,0.0,0.0,0.497204
5,557,Spider-Man,en,en,2002-05-01,7.279,17653,63.478,0.387282,0.856403,0.0,0.0,0.481106
55,34433,Dragon Ball Z: Broly – The Legendary Super Saiyan,ja,ja,1993-03-06,7.23,798,54.544,0.36325,0.698072,0.03,0.02,0.480214
4,209112,Batman v Superman: Dawn of Justice,en,en,2016-03-23,5.952,17081,78.589,0.394275,0.788395,0.0,0.0,0.473099
11,634649,Spider-Man: No Way Home,en,en,2021-12-15,7.99,18299,186.065,0.364756,0.893763,0.0,0.0,0.470558
57,15766,"Ghidorah, the Three-Headed Monster",ja,ja,1964-12-20,7.1,204,16.145,0.352805,0.623065,0.03,0.02,0.456857
53,32740,Krrish,hi,hi,2006-06-23,6.378,209,12.601,0.373903,0.588179,0.02,0.02,0.456758
16,1771,Captain America: The First Avenger,en,en,2011-07-22,6.995,20269,43.665,0.358516,0.849162,0.0,0.0,0.456645
1,11918,Superhero Movie,en,en,2008-03-28,5.231,1643,19.445,0.407425,0.634458,0.0,0.0,0.452832
