In [None]:
import os, re, time, json
import pandas as pd
import numpy as np
from datetime import datetime, timezone, date
import requests
from dotenv import load_dotenv
load_dotenv()

MERGED_PATH = "Data/outputs/movielens_tmdb_merged.csv"
OUT_DIR = "Data/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Target languages
TARGET_LANGS = ["en","te","ja","ko","hi","ta","ml","kn"]

SYNC_START_DATE = "2023-01-01"

# TMDB API base
TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_BEARER_TOKEN = os.getenv("TMDB_BEARER_TOKEN")

assert TMDB_BEARER_TOKEN is not None, "Set TMDB_BEARER_TOKEN environment variable first."

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_BEARER_TOKEN}"
}

ml_merged = pd.read_csv(MERGED_PATH, low_memory=False)
print("Loaded merged MovieLens-TMDB file:", ml_merged.shape)
ml_merged.head(2)

Loaded merged MovieLens-TMDB file: (87585, 20)


Unnamed: 0,movieId,imdbId,tmdbId,imdb_tt,id,imdb_id,title,original_language,release_date,adult,movieDoc,overview,genres,keywords,spoken_languages,clean_title,year,genres_list,tags_agg,movieDoc_full
0,1,114709,862.0,tt0114709,862.0,tt0114709,Toy Story,en,1995-10-30,False,Title: Toy Story\nYear: 1995\nOriginal languag...,"Led by Woody, Andy's toys live happily in his ...","Animation, Adventure, Family, Comedy","rescue, friendship, mission, martial arts, jea...",English,Toy Story,1995.0,"['Adventure', 'Animation', 'Children', 'Comedy...","['Pixar', 'animation', 'Disney', 'funny', 'Tom...",Title: Toy Story\nYear: 1995\nOriginal languag...
1,2,113497,8844.0,tt0113497,8844.0,tt0113497,Jumanji,en,1995-12-15,False,Title: Jumanji\nYear: 1995\nOriginal language:...,When siblings Judy and Peter discover an encha...,"Adventure, Fantasy, Family","giant insect, board game, disappearance, jungl...","English, French",Jumanji,1995.0,"['Adventure', 'Children', 'Fantasy']","['Robin Williams', 'fantasy', 'time travel', '...",Title: Jumanji\nYear: 1995\nOriginal language:...


In [None]:
# Helper fn for limit handling

def tmdb_get(url, params=None, max_retries=5, sleep_base=1.0):
    for attempt in range(max_retries):
        r = requests.get(url, headers=headers, params=params, timeout=30)

        # Success
        if r.status_code == 200:
            return r.json()

        # Rate limit
        if r.status_code == 429:
            retry_after = r.headers.get("Retry-After")
            wait = float(retry_after) if retry_after else (sleep_base * (2 ** attempt))
            print(f"[429] Rate limited. Sleeping {wait:.1f}s...")
            time.sleep(wait)
            continue

        # Transient errors
        if r.status_code in (500, 502, 503, 504):
            wait = sleep_base * (2 ** attempt)
            print(f"[{r.status_code}] Server error. Sleeping {wait:.1f}s...")
            time.sleep(wait)
            continue

        # Other errors then raise
        raise RuntimeError(f"TMDB request failed: {r.status_code} {r.text}")

    raise RuntimeError("Max retries exceeded")

In [6]:
# Discover endpoint fetcher (2023+ and per-language)

def discover_movies(lang_code, start_date="2023-01-01", end_date=None, page=1, include_adult=True):
    """
    Calls /discover/movie for a given original language and date window.
    """
    if end_date is None:
        end_date = date.today().isoformat()

    url = f"{TMDB_BASE}/discover/movie"
    params = {
        "include_adult": str(include_adult).lower(),
        "include_video": "false",
        "sort_by": "primary_release_date.desc",
        "page": page,
        "primary_release_date.gte": start_date,
        "primary_release_date.lte": end_date,
        "with_original_language": lang_code,
        "language": "en-US"  # response language; doesn't affect original_language filter
    }
    return tmdb_get(url, params=params)

In [None]:
def fetch_discover_all(lang_code, start_date, end_date=None, include_adult=True, max_pages=200):
    """
    Fetch discover pages for one language, capped by max_pages to control API load.
    Returns list of dicts (movie results).
    """
    all_results = []
    page = 1
    while page <= max_pages:
        data = discover_movies(lang_code, start_date=start_date, end_date=end_date, page=page, include_adult=include_adult)
        results = data.get("results", [])
        if not results:
            break

        all_results.extend(results)

        total_pages = data.get("total_pages", page)
        if page >= total_pages:
            break

        page += 1

    return all_results

In [None]:
#Fetch detailed fields for new IDs
def fetch_movie_details(tmdb_id, include_adult=True):
    url = f"{TMDB_BASE}/movie/{tmdb_id}"
    params = {
        "language": "en-US",
        "include_adult": str(include_adult).lower()
    }
    return tmdb_get(url, params=params)

In [None]:
existing_tmdb_ids = set(ml_merged["id"].dropna().astype(int).unique())
print("Existing TMDB ids from MovieLens merge:", len(existing_tmdb_ids))

all_discover = []
for lang in TARGET_LANGS:
    print(f"Fetching discover for lang={lang} from {SYNC_START_DATE}...")
    all_discover.extend(fetch_discover_all(lang, start_date=SYNC_START_DATE, include_adult=True, max_pages=200))

discover_df = pd.DataFrame(all_discover).drop_duplicates(subset=["id"])
print("Discover unique movies:", len(discover_df))

# Only keep those NOT already in MovieLens merged
# This focuses updates on new catalog items (most likely 2024+ etc.)
new_ids = [int(x) for x in discover_df["id"].dropna().unique() if int(x) not in existing_tmdb_ids]
print("New TMDB ids not in MovieLens merge:", len(new_ids))

# For latency reasons, cap details fetch per run (we can increase later)
CAP_DETAILS = 5000
new_ids = new_ids[:CAP_DETAILS]
print("Fetching details for first", len(new_ids), "new ids...")

details_list = []
for i, tid in enumerate(new_ids, 1):
    if i % 200 == 0:
        print(f"  fetched {i}/{len(new_ids)}")
    details_list.append(fetch_movie_details(tid, include_adult=True))

details_df = pd.DataFrame(details_list)
print("Details rows:", details_df.shape)
details_df.head(2)

Existing TMDB ids from MovieLens merge: 86980
Fetching discover for lang=en from 2023-01-01...
Fetching discover for lang=te from 2023-01-01...
Fetching discover for lang=ja from 2023-01-01...
[500] Server error. Sleeping 1.0s...
[500] Server error. Sleeping 2.0s...
[500] Server error. Sleeping 4.0s...
[500] Server error. Sleeping 8.0s...
Fetching discover for lang=ko from 2023-01-01...
Fetching discover for lang=hi from 2023-01-01...
Fetching discover for lang=ta from 2023-01-01...
Fetching discover for lang=ml from 2023-01-01...
Fetching discover for lang=kn from 2023-01-01...
Discover unique movies: 14108
New TMDB ids not in MovieLens merge: 14042
Fetching details for first 5000 new ids...
  fetched 200/5000
  fetched 400/5000
  fetched 600/5000
  fetched 800/5000
  fetched 1000/5000
  fetched 1200/5000
  fetched 1400/5000
  fetched 1600/5000
  fetched 1800/5000
  fetched 2000/5000
  fetched 2200/5000
  fetched 2400/5000
  fetched 2600/5000
  fetched 2800/5000
  fetched 3000/5000
  

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,,,0,[],,1633068,,[US],en,...,,0,12,[],Released,bbno$ 2025 AU-NZ Tour Recap,bbno$ Australia Tour Documentary,False,0.0,0
1,False,,,0,[],,1632976,,[US],en,...,,0,1,[],Released,,A Bird Sits Atop the Spire,False,0.0,0


In [None]:
#Create movieDoc for API-fetched movies and save delta

def build_moviedoc_from_details(row):
    title = str(row.get("title", "") or "").strip()
    release_date = str(row.get("release_date", "") or "").strip()
    year = ""
    if release_date and len(release_date) >= 4 and release_date[:4].isdigit():
        year = release_date[:4]

    lang = str(row.get("original_language", "") or "").strip()
    overview = str(row.get("overview", "") or "").strip()
    tagline = str(row.get("tagline", "") or "").strip()

    # genres is list of dicts in TMDB details response
    genres = row.get("genres", [])
    genres_list = [g.get("name") for g in genres if isinstance(g, dict) and g.get("name")]

    spoken = row.get("spoken_languages", [])
    spoken_list = [s.get("english_name") or s.get("name") for s in spoken if isinstance(s, dict)]

    # keywords are not included in /movie/{id} by default (separate endpoint).
    # We'll keep empty here for simplicity and speed.
    keywords_list = []

    lines = [
        f"Title: {title}",
        f"Year: {year}" if year else None,
        f"Original language: {lang}" if lang else None,
        f"Spoken languages: {', '.join(spoken_list[:5])}" if spoken_list else None,
        f"Genres: {', '.join(genres_list[:6])}" if genres_list else None,
        f"Tagline: {tagline}" if tagline else None,
        f"Plot: {overview}" if overview else None,
    ]
    return "\n".join([x for x in lines if x])

# Normalize and select key columns
api_updates = details_df.copy()
api_updates["id"] = pd.to_numeric(api_updates["id"], errors="coerce")
api_updates["adult"] = api_updates["adult"].fillna(False)

api_updates["movieDoc"] = api_updates.apply(build_moviedoc_from_details, axis=1)

keep = [
    "id","imdb_id","title","original_title","original_language","release_date",
    "vote_average","vote_count","popularity","runtime","status","adult","overview","tagline",
    "movieDoc"
]
for c in keep:
    if c not in api_updates.columns:
        api_updates[c] = np.nan

api_updates = api_updates[keep].drop_duplicates(subset=["id"])

delta_path = os.path.join(OUT_DIR, f"tmdb_api_updates_{SYNC_START_DATE}_to_today.csv")
api_updates.to_csv(delta_path, index=False)
print("Saved delta:", delta_path, "rows:", len(api_updates))

Saved delta: Data/outputs/tmdb_api_updates_2023-01-01_to_today.csv rows: 5000


In [None]:
#Merge Kaggle TMDB target-language catalog and API delta

base_sem_path = os.path.join(OUT_DIR, "tmdb_semantic_catalog_alllangs.csv")
tmdb_base = pd.read_csv(base_sem_path, low_memory=False)

# Keep only target languages for embedding index
tmdb_base = tmdb_base[tmdb_base["original_language"].isin(TARGET_LANGS)].copy()

# Load delta updates (API)
tmdb_delta = pd.read_csv(delta_path, low_memory=False)

# Keep only target languages in delta
tmdb_delta = tmdb_delta[tmdb_delta["original_language"].isin(TARGET_LANGS)].copy()

# Combine: delta overwrite base where id matches (newer fields)
tmdb_base["id"] = pd.to_numeric(tmdb_base["id"], errors="coerce")
tmdb_delta["id"] = pd.to_numeric(tmdb_delta["id"], errors="coerce")

combined = pd.concat([tmdb_base, tmdb_delta], ignore_index=True)
combined = combined.sort_values(by=["id"]).drop_duplicates(subset=["id"], keep="last")

combined_path = os.path.join(OUT_DIR, "tmdb_targetlangs_with_api_freshness.csv")
combined.to_csv(combined_path, index=False)

print("Saved combined catalog:", combined_path)
print("Combined rows:", len(combined))
print(combined["original_language"].value_counts())

Saved combined catalog: Data/outputs/tmdb_targetlangs_with_api_freshness.csv
Combined rows: 845256
original_language
en    745708
ja     61173
ko     15154
hi      8783
ta      5149
ml      4382
te      3219
kn      1688
Name: count, dtype: int64


In [None]:
# Interactive search for movie titles in the TMDB dataset
import ipywidgets as widgets
from IPython.display import display, clear_output
df_tmdb_c = combined.copy()
text_input = widgets.Text(description='Movie Title:', placeholder='Enter movie title here')
button = widgets.Button(description='Search')
output = widgets.Output()

def on_button_click(b):
    with output:
        clear_output()
        title = text_input.value.strip()
        if not title:
            print("Please enter a title to search.")
            return
        
        print(f"Searching for: '{title}'")
        
        # Exact match (case insensitive)
        exact_matches = df_tmdb_c[df_tmdb_c['title'].str.lower() == title.lower()]
        
        if not exact_matches.empty:
            print(f"\nFound {len(exact_matches)} exact match(es):")
            print(exact_matches[['id', 'title', 'release_date', 'overview']].head())
        else:
            print(f"\nNo exact match for '{title}'")
        
        # Partial match (contains the word, case insensitive)
        partial_matches = df_tmdb_c[df_tmdb_c['title'].str.lower().str.contains(title.lower(), na=False)]
        
        if not partial_matches.empty:
            print(f"\nFound {len(partial_matches)} title(s) containing '{title}':")
            print(partial_matches[['id', 'title', 'release_date']].head(10))  # Show first 10
        else:
            print(f"\nNo titles containing '{title}'")

button.on_click(on_button_click)

display(text_input, button, output)



Text(value='', description='Movie Title:', placeholder='Enter movie title here')

Button(description='Search', style=ButtonStyle())

Output()

In [None]:
# columns for reference
print("TMDB combined columns:", combined.columns.tolist())

TMDB combined columns: ['id', 'imdb_id', 'title', 'original_title', 'original_language', 'release_date', 'year', 'vote_average', 'vote_count', 'popularity', 'runtime', 'status', 'adult', 'overview', 'tagline', 'genres', 'keywords', 'spoken_languages', 'movieDoc']


In [None]:
# movieDoc sampels for original language "te" (Telugu) released in 2026 with status "Released"
print("\nSample movieDocs for original language 'te' (Telugu) released in 2026 with status 'Released':")
sample_te = combined[(combined["original_language"] == "te") & (combined["release_date"].str.startswith("2026")) & (combined["status"] == "Released")]["movieDoc"].dropna()
for i, doc in enumerate(sample_te, 10):
    print(f"\n--- Telugu MovieDoc Sample {i} ---\n{doc}\n")


Sample movieDocs for original language 'te' (Telugu) released in 2026 with status 'Released':

--- Telugu MovieDoc Sample 10 ---
Title: The Rajasaab
Year: 2026
Original language: te
Spoken languages: Telugu
Genres: Comedy, Horror, Fantasy
Plot: Seeking his lost grandfather, a young man enters a haunted mansion and awakens a family curse powerful enough to upend both his life and reality.


--- Telugu MovieDoc Sample 11 ---
Title: Cheekatilo
Year: 2026
Original language: te
Spoken languages: Telugu
Genres: Thriller
Plot: When crime anchor Sandhya’s best friend is found dead under suspicious circumstances, she embarks on a dangerous investigation that collides with a dark past. As secrets unravel, Sandhya must face her trauma and rise as a fearless voice for the silenced.


--- Telugu MovieDoc Sample 12 ---
Title: Naari Naari Naduma Murari
Year: 2026
Original language: te
Spoken languages: Telugu
Genres: Romance, Drama, Comedy
Plot: A young man's life turns upside down when he finds him