In [1]:
import pandas as pd
import numpy as np
import ast 
import pandas as pd
import os
import shutil
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests

In [3]:
from collections import Counter

def count_genres(df, genre_col='genres'):
    genre_counter = Counter()
    
    for genre_string in df[genre_col]:
        if isinstance(genre_string, str):
            genres = genre_string.split('|')
        else:
            genres = genre_string  # Already a list
        genre_counter.update(genres)
    
    return genre_counter


In [None]:

TMDB_API_KEY = 'bf4cf184995f9d9c0fe2120c2f2113d2'  # Get it from http://www.omdbapi.com/apikey.aspx

def fetch_plot_from_tmdb(imdb_id):
    try:
        # Step 1: Get TMDB movie ID
        find_url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={TMDB_API_KEY}&external_source=imdb_id"
        find_resp = requests.get(find_url)
        if find_resp.status_code != 200:
            return ""
        movie_results = find_resp.json().get('movie_results', [])
        if not movie_results:
            return ""
        tmdb_id = movie_results[0]['id']

        # Step 2: Get movie details (plot)
        movie_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={TMDB_API_KEY}&language=en-US"
        movie_resp = requests.get(movie_url)
        if movie_resp.status_code != 200:
            return ""
        overview = movie_resp.json().get("overview", "")
        return overview.strip()
    except Exception as e:
        print(f"Error fetching {imdb_id}: {e}")
        return ""
# --- Configuration ---
EXCLUSIVE_GENRES = {'Action', 'Comedy', 'Romance', 'Adventure', 'Horror', 'Thriller', 'Crime', 'Sci-Fi', 'Drama'}
MAX_GENRE_COUNT = 3500
INPUT_CSV = 'tmdb_cleaned.csv'
POSTER_DIR = 'C:/Users/satis/Downloads/datasets/510_project_dataset/tmdb'
NEW_POSTER_DIR = 'C:/Users/satis/Downloads/datasets/510_project_dataset/final_posters'
os.makedirs(NEW_POSTER_DIR, exist_ok=True)

# --- IMDbPY Setup ---
ia = IMDb()

# --- Load Dataset ---
df = pd.read_csv(INPUT_CSV)
df['genres'] = df['genres'].apply(ast.literal_eval)

# --- Helpers ---
genre_counter = defaultdict(int)
existing_imdb_ids = set()
final_rows = []

# --- Step 1: Pre-filter rows (no plot fetching yet) ---
candidates = []
for _, row in df.iterrows():
    imdb_id = row['imdb_id']
    movie_id = row['movie_id']
    genres = set(row['genres'])

    if imdb_id in existing_imdb_ids:
        continue
    if not genres.issubset(EXCLUSIVE_GENRES):
        continue
    if any(genre_counter[g] >= MAX_GENRE_COUNT for g in genres):
        continue

    poster_path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    if not os.path.isfile(poster_path):
        continue

    candidates.append((imdb_id, movie_id, genres))
print("after filtering: ", len(candidates))
# --- Step 2: Fetch plots in parallel ---
def fetch_valid_plot(args):
    imdb_id, movie_id, genres = args
    plot = fetch_plot_from_tmdb(imdb_id)
    if plot:
        return imdb_id, movie_id, genres, plot
    return None

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fetch_valid_plot, item) for item in candidates]
    for future in as_completed(futures):
        result = future.result()
        if result:
            imdb_id, movie_id, genres, plot = result

            # Check genre limits again to be safe due to parallelism
            if any(genre_counter[g] >= MAX_GENRE_COUNT for g in genres):
                continue

            # Copy poster
            src = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
            dst = os.path.join(NEW_POSTER_DIR, f"{imdb_id}.jpg")
            shutil.copyfile(src, dst)

            for g in genres:
                genre_counter[g] += 1

            final_rows.append({
                'imdb_id': imdb_id,
                'plot': plot,
                'genres': '|'.join(genres)
            })
            existing_imdb_ids.add(imdb_id)

# --- Step 3: Save final dataset ---
final_df = pd.DataFrame(final_rows)
final_df.to_csv('filtered_dataset.csv', index=False)



after filtering:  7000
✅ Done. Saved 7000 datapoints to filtered_dataset.csv.


In [65]:

df2 = pd.read_csv("kaggle_cleaned.csv", encoding='ISO-8859-1')
df2['genres'] = df2['genres'].apply(ast.literal_eval)

additional_rows = []
new_candidates = []

for _, row in df2.iterrows():
    imdb_id = row['imdbId']
    movie_id = row['posterId']
    genres = set(row['genres'])

    if imdb_id in existing_imdb_ids:
        continue
    if not genres.issubset(EXCLUSIVE_GENRES):
        continue

    # Check if at least one genre still needs more datapoints
    if not any(genre_counter[g] < MAX_GENRE_COUNT for g in genres):
        continue

    poster_path = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    if not os.path.isfile(poster_path):
        continue

    new_candidates.append((imdb_id, movie_id, genres))

print(f"Found {len(new_candidates)} candidates from df2.")

def fetch_valid_plot_tmdb_df2(args):
    imdb_id, movie_id, genres = args
    plot = fetch_plot_from_tmdb(imdb_id)
    if plot:
        return imdb_id, movie_id, genres, plot
    return None

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fetch_valid_plot_tmdb_df2, item) for item in new_candidates]
    for future in as_completed(futures):
        result = future.result()
        if result:
            imdb_id, movie_id, genres, plot = result

            if imdb_id in existing_imdb_ids:
                continue

            # Skip if it will exceed ALL genres (we need at least one still under MAX)
            if not any(genre_counter[g] < MAX_GENRE_COUNT for g in genres):
                continue

            # Copy poster
            src = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
            dst = os.path.join(NEW_POSTER_DIR, f"{imdb_id}.jpg")
            shutil.copyfile(src, dst)

            # Add to dataset
            final_rows.append({
                'imdb_id': imdb_id,
                'plot': plot,
                'genres': '|'.join(genres)
            })
            existing_imdb_ids.add(imdb_id)

            # Update only genres that are still under the limit
            for g in genres:
                if genre_counter[g] < MAX_GENRE_COUNT:
                    genre_counter[g] += 1

            # Stop early if all genres are now complete
            if all(genre_counter[g] >= MAX_GENRE_COUNT for g in EXCLUSIVE_GENRES):
                break




Found 393 candidates from df2.


In [67]:
temp = genre_counter

In [68]:
# --- Step 5: Load and process df3 which needs to fetch posters ---
df3 = pd.read_csv('imdb_cleaned.csv')
df3['genres'] = df3['genres'].apply(lambda x:x.split(","))
df3_candidates = []
for _, row in df3.iterrows():
    imdb_id = row['imdbId']
    # movie_id = row['movie_id']
    genres = set(row['genres'])

    if imdb_id in existing_imdb_ids:
        continue
    if not genres.issubset(EXCLUSIVE_GENRES):
        continue
    if not any(genre_counter[g] < MAX_GENRE_COUNT for g in genres):
        continue
    for g in genres:
        if genre_counter[g] < MAX_GENRE_COUNT:
            genre_counter[g] += 1
    dst_path = os.path.join(NEW_POSTER_DIR, f"{imdb_id}.jpg")
    df3_candidates.append((imdb_id, None if os.path.isfile(dst_path) else "DOWNLOAD", genres))


print(f"Found {len(df3_candidates)} candidates from df3 (poster will be downloaded).")
print(genre_counter)
def fetch_plot_and_poster_tmdb(args):
    imdb_id, poster_flag, genres = args
    try:
        # Step 1: Get TMDB movie ID
        find_url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={TMDB_API_KEY}&external_source=imdb_id"
        find_resp = requests.get(find_url)
        if find_resp.status_code != 200:
            return None
        movie_results = find_resp.json().get('movie_results', [])
        if not movie_results:
            return None
        movie_data = movie_results[0]
        tmdb_id = movie_data['id']
        poster_path = movie_data.get('poster_path', '')
        if not poster_path:
            return None

        # Step 2: Get plot
        movie_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={TMDB_API_KEY}&language=en-US"
        movie_resp = requests.get(movie_url)
        if movie_resp.status_code != 200:
            return None
        plot = movie_resp.json().get("overview", "").strip()
        if not plot:
            return None

        # Step 3: Download poster only if not already downloaded
        dst = os.path.join(NEW_POSTER_DIR, f"{imdb_id}.jpg")
        if poster_flag == "DOWNLOAD":
            poster_url = f"https://image.tmdb.org/t/p/w500{poster_path}"
            poster_response = requests.get(poster_url, stream=True)
            if poster_response.status_code == 200:
                with open(dst, 'wb') as out_file:
                    shutil.copyfileobj(poster_response.raw, out_file)
            else:
                return None

        return imdb_id, "tmdb_" + str(tmdb_id), genres, plot

    except Exception as e:
        print(f"Error fetching {imdb_id} from df3: {e}")
        return None

from tqdm import tqdm

# Wrap futures in tqdm progress bar
for item in tqdm(df3_candidates, desc="Processing Movies", unit="movie"):
    result = fetch_plot_and_poster_tmdb(item)
    # print(result)
    if result:
        imdb_id, movie_id, genres, plot = result

        final_rows.append({
            'imdb_id': imdb_id,
            'plot': plot,
            'genres': '|'.join(genres)
        })
    existing_imdb_ids.add(imdb_id)



Found 15754 candidates from df3 (poster will be downloaded).
defaultdict(<class 'int'>, {'Comedy': 3500, 'Action': 3500, 'Thriller': 3500, 'Crime': 3500, 'Adventure': 3500, 'Drama': 3500, 'Sci-Fi': 3500, 'Horror': 3500, 'Romance': 3500})


Processing Movies: 100%|██████████| 15754/15754 [30:52<00:00,  8.50movie/s]


In [69]:
final_df = pd.DataFrame(final_rows)
final_df.to_csv('filtered_dataset_final.csv', index=False)
print(f"✅ Final dataset saved with {len(final_df)} datapoints.")


✅ Final dataset saved with 13770 datapoints.


In [78]:
cdf= pd.read_csv("filtered_dataset.csv" , encoding = 'ISO-8859-1')
cdf['genres']= cdf['genres'].apply(lambda x: str(x).split("|"))
genre_counter = count_genres(cdf, 'genres')

In [79]:
genre_counter

Counter({'Drama': 3475,
         'Comedy': 2382,
         'Thriller': 2164,
         'Action': 1857,
         'Romance': 1450,
         'Crime': 1325,
         'Horror': 1094,
         'Sci-Fi': 816,
         'Adventure': 731,
         'nan': 15})

In [73]:
len(final_rows)

13770

In [36]:
MAX_GENRE_COUNT = 3500

In [62]:
len(cdf)

13972

In [75]:
cdf.shape

(13770, 3)

In [76]:
print(temp)

defaultdict(<class 'int'>, {'Comedy': 3500, 'Action': 3500, 'Thriller': 3500, 'Crime': 3500, 'Adventure': 3500, 'Drama': 3500, 'Sci-Fi': 3500, 'Horror': 3500, 'Romance': 3500})


In [77]:
temp

defaultdict(int,
            {'Comedy': 3500,
             'Action': 3500,
             'Thriller': 3500,
             'Crime': 3500,
             'Adventure': 3500,
             'Drama': 3500,
             'Sci-Fi': 3500,
             'Horror': 3500,
             'Romance': 3500})

In [80]:
df = pd.read_csv("imdb_cleaned.csv", encoding = "'ISO-8859-1'")
df['genres']= df['genres'].apply(lambda x: x.split("|"))

In [84]:
df = df[~df['genres'].apply(lambda x: 'Drama' in x)]


In [83]:
type(df['genres'][0])

list

In [91]:
len(df[df['genres'].apply(lambda x: "Sci-Fi" in x)])

35420