In [None]:
import os, re, time, json
import pandas as pd
import numpy as np
from datetime import datetime, timezone, date
import requests
from dotenv import load_dotenv
load_dotenv()

MERGED_PATH = "../Data/outputs/movielens_tmdb_merged.csv"
OUT_DIR = "../Data/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Target languages
TARGET_LANGS = ["en","te","ja","ko","hi","ta","ml","kn"]
# all other non target are removed from the catalog, so we can focus on the target langs and not have to worry about filtering them out later
SYNC_START_DATE = "2023-01-01"

# TMDB API base
TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_BEARER_TOKEN = os.getenv("TMDB_BEARER_TOKEN")

assert TMDB_BEARER_TOKEN is not None, "Set TMDB_BEARER_TOKEN environment variable first."

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_BEARER_TOKEN}"
}

ml_merged = pd.read_csv(MERGED_PATH, low_memory=False)
print("Loaded merged MovieLens-TMDB file:", ml_merged.shape)
ml_merged.head(2)

Loaded merged MovieLens-TMDB file: (87585, 22)


Unnamed: 0,movieId,imdbId,tmdbId,imdb_tt,id,imdb_id,title,original_language,release_date,adult,...,genres,keywords,popularity,vote_average,vote_count,clean_title,year,genres_list,tags_agg,movieDoc_full
0,1,114709,862.0,tt0114709,862.0,tt0114709,Toy Story,en,1995-10-30,False,...,"Animation, Adventure, Family, Comedy","rescue, friendship, mission, martial arts, jea...",78.404,7.971,17152.0,Toy Story,1995.0,"['Adventure', 'Animation', 'Children', 'Comedy...","['Pixar', 'animation', 'Disney', 'funny', 'Tom...",Title: Toy Story\nYear: 1995\nOriginal languag...
1,2,113497,8844.0,tt0113497,8844.0,tt0113497,Jumanji,en,1995-12-15,False,...,"Adventure, Fantasy, Family","giant insect, board game, disappearance, jungl...",13.444,7.239,9833.0,Jumanji,1995.0,"['Adventure', 'Children', 'Fantasy']","['Robin Williams', 'fantasy', 'time travel', '...",Title: Jumanji\nYear: 1995\nOriginal language:...


In [2]:
# Helper fn for limit handling

def tmdb_get(url, params=None, max_retries=5, sleep_base=1.0):
    for attempt in range(max_retries):
        r = requests.get(url, headers=headers, params=params, timeout=30)

        # Success
        if r.status_code == 200:
            return r.json()

        # Rate limit
        if r.status_code == 429:
            retry_after = r.headers.get("Retry-After")
            wait = float(retry_after) if retry_after else (sleep_base * (2 ** attempt))
            print(f"[429] Rate limited. Sleeping {wait:.1f}s...")
            time.sleep(wait)
            continue

        # Transient errors
        if r.status_code in (500, 502, 503, 504):
            wait = sleep_base * (2 ** attempt)
            print(f"[{r.status_code}] Server error. Sleeping {wait:.1f}s...")
            time.sleep(wait)
            continue

        # Other errors then raise
        raise RuntimeError(f"TMDB request failed: {r.status_code} {r.text}")

    raise RuntimeError("Max retries exceeded")

In [3]:
# Discover endpoint fetcher (2023+ and per-language)

def discover_movies(lang_code, start_date="2023-01-01", end_date=None, page=1, include_adult=True):
    """
    Calls /discover/movie for a given original language and date window.
    """
    if end_date is None:
        end_date = date.today().isoformat()

    url = f"{TMDB_BASE}/discover/movie"
    params = {
        "include_adult": str(include_adult).lower(),
        "include_video": "false",
        "sort_by": "primary_release_date.desc",
        "page": page,
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": lang_code,
        "language": "en-US"  # response language; doesn't affect original_language filter
    }
    return tmdb_get(url, params=params)

In [4]:
def fetch_discover_all(lang_code, start_date, end_date=None, include_adult=True, max_pages=200):
    """
    Fetch discover pages for one language, capped by max_pages to control API load.
    Returns list of dicts (movie results).
    """
    all_results = []
    page = 1
    while page <= max_pages:
        data = discover_movies(lang_code, start_date=start_date, end_date=end_date, page=page, include_adult=include_adult)
        results = data.get("results", [])
        if not results:
            break

        all_results.extend(results)

        total_pages = data.get("total_pages", page)
        if page >= total_pages:
            break

        page += 1

    return all_results

In [7]:
def fetch_movie_details(tmdb_id, include_adult=True):
    url = f"{TMDB_BASE}/movie/{tmdb_id}"
    params = {
        "language": "en-US",
        "include_adult": str(include_adult).lower(),
        "append_to_response": "keywords"
    }
    return tmdb_get(url, params=params)

In [8]:
existing_tmdb_ids = set(ml_merged["id"].dropna().astype(int).unique())
print("Existing TMDB ids from MovieLens merge:", len(existing_tmdb_ids))

all_discover = []
for lang in TARGET_LANGS:
    print(f"Fetching discover for lang={lang} from {SYNC_START_DATE}...")
    all_discover.extend(fetch_discover_all(lang, start_date=SYNC_START_DATE, include_adult=True, max_pages=200))

discover_df = pd.DataFrame(all_discover).drop_duplicates(subset=["id"])
print("Discover unique movies:", len(discover_df))

# Only keep those NOT already in MovieLens merged
# This focuses updates on new catalog items (most likely 2024+ etc.)
new_ids = [int(x) for x in discover_df["id"].dropna().unique() if int(x) not in existing_tmdb_ids]
print("New TMDB ids not in MovieLens merge:", len(new_ids))

CAP_DETAILS = 50000
new_ids = new_ids[:CAP_DETAILS]
print("Fetching details for first", len(new_ids), "new ids...")

details_list = []
for i, tid in enumerate(new_ids, 1):
    if i % 200 == 0:
        print(f"  fetched {i}/{len(new_ids)}")
    details_list.append(fetch_movie_details(tid, include_adult=True))

details_df = pd.DataFrame(details_list)
print("Details rows:", details_df.shape)
details_df.head(2)

Existing TMDB ids from MovieLens merge: 86980
Fetching discover for lang=en from 2023-01-01...
Fetching discover for lang=te from 2023-01-01...
Fetching discover for lang=ja from 2023-01-01...
Fetching discover for lang=ko from 2023-01-01...
Fetching discover for lang=hi from 2023-01-01...
Fetching discover for lang=ta from 2023-01-01...
Fetching discover for lang=ml from 2023-01-01...
Fetching discover for lang=kn from 2023-01-01...
Discover unique movies: 14137
New TMDB ids not in MovieLens merge: 14071
Fetching details for first 14071 new ids...
  fetched 200/14071
  fetched 400/14071
  fetched 600/14071
  fetched 800/14071
  fetched 1000/14071
  fetched 1200/14071
  fetched 1400/14071
  fetched 1600/14071
  fetched 1800/14071
  fetched 2000/14071
  fetched 2200/14071
  fetched 2400/14071
  fetched 2600/14071
  fetched 2800/14071
  fetched 3000/14071
  fetched 3200/14071
  fetched 3400/14071
  fetched 3600/14071
  fetched 3800/14071
  fetched 4000/14071
  fetched 4200/14071
  fetche

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
0,False,/r1KfaLz9gSxBbVQWG5dPCw8dJmo.jpg,,0,"[{'id': 27, 'name': 'Horror'}]",,1634002,,[US],en,...,0,1,[],Released,,The Bough Broke,False,0.0,0,"{'keywords': [{'id': 201769, 'name': 'observat..."
1,False,,,0,"[{'id': 99, 'name': 'Documentary'}]",,1631604,,[TH],en,...,0,16,[],Released,,Dear Mae,False,0.0,0,{'keywords': []}


In [9]:
import pandas as pd
import numpy as np
import os

def extract_keyword_names(keywords_obj):
    """
    Safely extracts keyword names from TMDB 'keywords' response.
    Handles both dict (detail view) and list formats.
    """
    if isinstance(keywords_obj, dict):
        # TMDB detail response with append_to_response="keywords" looks like:
        # {"keywords": [{"id": 1, "name": "foo"}]}
        items = keywords_obj.get("keywords", []) or keywords_obj.get("results", [])
    elif isinstance(keywords_obj, list):
        items = keywords_obj
    else:
        items = []
        
    return [k.get("name", "").strip() for k in items if isinstance(k, dict) and k.get("name")]

def build_moviedoc_from_details(row):
    # Basic Text Fields
    title = str(row.get("title", "") or "").strip()
    overview = str(row.get("overview", "") or "").strip()
    tagline = str(row.get("tagline", "") or "").strip()
    lang = str(row.get("original_language", "") or "").strip()

    # Date / Year
    release_date = str(row.get("release_date", "") or "").strip()
    year = ""
    if release_date and len(release_date) >= 4 and release_date[:4].isdigit():
        year = release_date[:4]

    # Numeric Fields
    vote_average = row.get("vote_average")
    vote_count = row.get("vote_count")
    popularity = row.get("popularity")

    # List Fields
    # Genres
    genres = row.get("genres", [])
    genres_list = [g.get("name") for g in genres if isinstance(g, dict) and g.get("name")]
    
    # Spoken Languages
    spoken = row.get("spoken_languages", [])
    spoken_list = [s.get("english_name") or s.get("name") for s in spoken if isinstance(s, dict)]
    
    # Keywords
    keywords_list = extract_keyword_names(row.get("keywords", {}))

    # Build the Doc Lines
    lines = [
        f"Title: {title}",
        f"Year: {year}" if year else None,
        f"Original language: {lang}" if lang else None,
        f"Spoken languages: {', '.join(spoken_list[:5])}" if spoken_list else None,
        f"Genres: {', '.join(genres_list[:6])}" if genres_list else None,
        f"Vote average: {float(vote_average):.2f}" if pd.notna(vote_average) else None,
        f"Vote count: {int(vote_count)}" if pd.notna(vote_count) else None,
        f"Popularity: {float(popularity):.2f}" if pd.notna(popularity) else None,
        f"Keywords: {', '.join(keywords_list[:12])}" if keywords_list else None,
        f"Tagline: {tagline}" if tagline else None,
        f"Plot: {overview}" if overview else None,
    ]
    
    # Filter out None values and join
    return "\n".join([x for x in lines if x])

# Normalize and select key columns
api_updates = details_df.copy()

# Ensure ID is numeric for merging later
api_updates["id"] = pd.to_numeric(api_updates["id"], errors="coerce")
api_updates["adult"] = api_updates["adult"].fillna(False)

print("Building movieDocs...")
api_updates["movieDoc"] = api_updates.apply(build_moviedoc_from_details, axis=1)

keep = [
    "id", "imdb_id", "title", "original_title", "original_language", "release_date",
    "vote_average", "vote_count", "popularity", "runtime", "status", "adult", 
    "overview", "tagline", "movieDoc"
]

# Ensure all 'keep' columns exist, fill missing with NaN
for c in keep:
    if c not in api_updates.columns:
        api_updates[c] = np.nan

# Final cleanup
api_updates = api_updates[keep].drop_duplicates(subset=["id"])

# Save
delta_path = os.path.join(OUT_DIR, f"tmdb_api_updates_{SYNC_START_DATE}_to_today.csv")
api_updates.to_csv(delta_path, index=False)
print("Saved delta:", delta_path, "rows:", len(api_updates))

Building movieDocs...
Saved delta: Data/outputs/tmdb_api_updates_2023-01-01_to_today.csv rows: 14071


In [None]:
# Merge Kaggle TMDB all-languages catalog with API delta

import os

base_sem_path = os.path.join(OUT_DIR, "tmdb_semantic_catalog_alllangs.csv")
tmdb_base = pd.read_csv(base_sem_path, low_memory=False)

# Load delta updates (API)
delta_path = "../Data/outputs/tmdb_api_updates_2023-01-01_to_today.csv"
tmdb_delta = pd.read_csv(delta_path, low_memory=False)

# Normalize id types for merge logic
tmdb_base["id"] = pd.to_numeric(tmdb_base["id"], errors="coerce")
tmdb_delta["id"] = pd.to_numeric(tmdb_delta["id"], errors="coerce")

# 1) Full all-languages merge: delta overwrites base where id matches
combined_all = pd.concat([tmdb_base, tmdb_delta], ignore_index=True)
combined_all = combined_all.sort_values(by=["id"]).drop_duplicates(subset=["id"], keep="last")

combined_all_path = os.path.join(OUT_DIR, "tmdb_semantic_catalog_alllangs_with_new_movies.csv")
combined_all.to_csv(combined_all_path, index=False)

# 2) Target-language subset for embedding/index workflows
combined_target = combined_all[combined_all["original_language"].isin(TARGET_LANGS)].copy()
combined_target_path = os.path.join(OUT_DIR, "tmdb_semantic_catalog_all_and_target_lang_new_movies.csv")
combined_target.to_csv(combined_target_path, index=False)

# Keep backward-compatible variable name used in later cells
combined = combined_target

print("Saved full all-languages catalog:", combined_all_path)
print("All-languages rows:", len(combined_all))
print("Saved target-language catalog:", combined_target_path)
print("Target-language rows:", len(combined_target))
print(combined_target["original_language"].value_counts())

Saved full all-languages catalog: Data/outputs/tmdb_semantic_catalog_alllangs_with_new_movies.csv
All-languages rows: 1367793
Saved target-language catalog: Data/outputs/tmdb_semantic_catalog_all_and_target_lang_new_movies.csv
Target-language rows: 845572
original_language
en    745199
ja     61485
ko     15257
hi      8935
ta      5276
ml      4467
te      3224
kn      1729
Name: count, dtype: int64


In [None]:
# Interactive search for movie titles in the TMDB dataset

import os, re, time, json
import pandas as pd
import numpy as np
from datetime import datetime, timezone, date
import requests 
import ipywidgets as widgets
from IPython.display import display, clear_output

combined_path = "../Data/outputs/tmdb_semantic_catalog_alllangs_with_new_movies.csv"

df_tmdb_c = pd.read_csv(combined_path, low_memory=False)
text_input = widgets.Text(description='Movie Title:', placeholder='Enter movie title here')
button = widgets.Button(description='Search')
output = widgets.Output()

def on_button_click(b):
    with output:
        clear_output()
        title = text_input.value.strip()
        if not title:
            print("Please enter a title to search.")
            return
        
        print(f"Searching for: '{title}'")
        
        # Exact match (case insensitive)
        exact_matches = df_tmdb_c[df_tmdb_c['title'].str.lower() == title.lower()]
        
        if not exact_matches.empty:
            print(f"\nFound {len(exact_matches)} exact match(es):")
            print(exact_matches[['id', 'title', 'release_date', 'overview']].head())
        else:
            print(f"\nNo exact match for '{title}'")
        
        # Partial match (contains the word, case insensitive)
        partial_matches = df_tmdb_c[df_tmdb_c['title'].str.lower().str.contains(title.lower(), na=False)]
        
        if not partial_matches.empty:
            print(f"\nFound {len(partial_matches)} title(s) containing '{title}':")
            print(partial_matches[['id', 'title', 'release_date']].head(10))  # Show first 10
        else:
            print(f"\nNo titles containing '{title}'")

button.on_click(on_button_click)

display(text_input, button, output)

Text(value='', description='Movie Title:', placeholder='Enter movie title here')

Button(description='Search', style=ButtonStyle())

Output()

In [7]:
# columns for reference
print("TMDB combined columns:", combined.columns.tolist())

TMDB combined columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'year', 'movieDoc']


In [8]:
# movieDoc sampels for original language "te" (Telugu) released in 2026 with status "Released"
print("\nSample movieDocs for original language 'te' (Telugu) released in 2026 with status 'Released':")
sample_te = combined[(combined["original_language"] == "te") & (combined["release_date"].str.startswith("2026")) & (combined["status"] == "Released")]["movieDoc"].dropna()
for i, doc in enumerate(sample_te, 10):
    print(f"\n--- Telugu MovieDoc Sample {i} ---\n{doc}\n")


Sample movieDocs for original language 'te' (Telugu) released in 2026 with status 'Released':

--- Telugu MovieDoc Sample 10 ---
Title: Nilakanta
Year: 2026
Original language: te
Spoken languages: Telugu
Vote average: 0.00
Vote count: 0
Popularity: 1.99
Tagline: The Essence of Karma
Plot: Saraswathipuram is a Village,where Education is Well Valued.nilakanta Work as a Tailor in That Village.nilakanta is Wise and Good From Childhood, He Believes in Karma but Unfortunately Karma Curses Him, No Matter How Big an Opponent He is,he Fights Bravely for His Life.who is That Rival ?


--- Telugu MovieDoc Sample 11 ---
Title: Funky
Year: 2026
Original language: te
Spoken languages: Telugu
Genres: Comedy, Romance, Drama
Vote average: 7.00
Vote count: 1
Popularity: 4.70
Keywords: love comedy
Plot: A debut director who falls in love with his producer's daughter


--- Telugu MovieDoc Sample 12 ---
Title: Euphoria
Year: 2026
Original language: te
Spoken languages: Telugu
Genres: Drama
Vote average: 0