In [None]:
import csv
import os
import time
from collections import defaultdict

import requests

# --- TMDB API configuration ---------------------------------------------------
# Prefer a key supplied through the TMDB_API_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the fallback literal is the key that was previously hard-coded
# here; rotate it and delete the fallback once the environment variable is in use.
API_KEY = os.environ.get('TMDB_API_KEY') or 'a7974b054b97d6234399064cadb2f05f'
BASE_URL = 'https://api.themoviedb.org/3'
DISCOVER_ENDPOINT = f'{BASE_URL}/discover/movie'
GENRE_ENDPOINT = f'{BASE_URL}/genre/movie/list'
# Prefix prepended to each movie's poster_path to build the poster URL.
IMAGE_BASE_URL = 'https://image.tmdb.org/t/p/w500'

# --- Balancing parameters -----------------------------------------------------
# Genres below MIN_PER_GENRE are filled first while crawling, and genres that
# never reach it are left out of the written CSV.
MIN_PER_GENRE = 1000
# A genre stops receiving new tags once it has this many.
MAX_PER_GENRE = 2000
# Release-date windows (inclusive start, inclusive end) crawled one after another.
DATE_CHUNKS = [('2000-01-01', '2010-12-31'), ('2011-01-01', '2020-12-31'), ('2021-01-01', '2024-12-31')]
# Upper bound on result pages requested per date window.
MAX_PAGES = 500

def get_genre_mapping():
    """Fetch the TMDB movie genre list as an ``{genre_id: genre_name}`` dict.

    Returns:
        dict: Mapping of genre id to genre name. An empty dict is returned
        when the request fails, times out, answers with a non-200 status, or
        the body is not valid JSON, so callers can treat "no mapping" as a
        single falsy case.
    """
    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(GENRE_ENDPOINT, params={'api_key': API_KEY}, timeout=10)
    except requests.RequestException as exc:
        # Only the exception type is shown: the message can embed the request
        # URL, which carries the API key as a query parameter.
        print(f"Genre list request failed: {type(exc).__name__}")
        return {}

    if response.status_code != 200:
        print(f"Genre list request returned HTTP {response.status_code}")
        return {}

    try:
        genres = response.json().get('genres', [])
    except ValueError:
        print("Genre list response was not valid JSON")
        return {}

    return {genre['id']: genre['name'] for genre in genres}

def _select_genre_ids(genre_ids, genre_tag_counts):
    """Choose which of a movie's genres it should be tagged with.

    Only ids present in ``genre_tag_counts`` are considered, ordered from the
    least- to the most-collected genre. Genres still below ``MIN_PER_GENRE``
    take priority; when there are none, every genre still below
    ``MAX_PER_GENRE`` is used. An empty list means the movie cannot help any
    genre and should be skipped.
    """
    known = sorted(
        (gid for gid in genre_ids if gid in genre_tag_counts),
        key=genre_tag_counts.__getitem__,
    )
    under_min = [gid for gid in known if genre_tag_counts[gid] < MIN_PER_GENRE]
    if under_min:
        return under_min
    return [gid for gid in known if genre_tag_counts[gid] < MAX_PER_GENRE]


def fetch_balanced_dataset(output_path='movies_dataset.csv'):
    """Crawl TMDB's discover endpoint and write a genre-balanced movie CSV.

    Movies are requested page by page for each window in ``DATE_CHUNKS``.
    A movie is kept only if it has an id, title, overview, at least one known
    genre and a poster, and has not been seen before. Each kept movie is
    tagged with the subset of its genres that still need examples (see
    ``_select_genre_ids``). Genres that end below ``MIN_PER_GENRE`` are
    dropped from the output, together with movies left without any genre.

    Args:
        output_path: Destination CSV file. Columns are ``id``, ``title``,
            ``overview``, ``genres`` (comma-separated names) and
            ``poster_url``.

    Returns:
        None. The result is the CSV file plus a per-genre summary on stdout.

    NOTE(review): later cells read 'strictly_balanced_movies.csv', which is
    not this function's default output name - confirm where that file is
    produced.
    """
    genre_mapping = get_genre_mapping()
    if not genre_mapping:
        print("Could not load the TMDB genre list; no dataset was written.")
        return

    genre_tag_counts = {gid: 0 for gid in genre_mapping}
    seen_ids = set()
    collected_movies = []

    def quotas_filled():
        # Once every genre is at its cap no further movie can be selected,
        # so any additional request would be wasted.
        return all(count >= MAX_PER_GENRE for count in genre_tag_counts.values())

    for start_date, end_date in DATE_CHUNKS:
        if quotas_filled():
            break

        for page in range(1, MAX_PAGES + 1):
            params = {
                'api_key': API_KEY,
                'vote_count.gte': 50,
                # NOTE(review): ascending popularity returns the least popular
                # titles first - confirm this ordering is intended.
                'sort_by': 'popularity.asc',
                'page': page,
                'primary_release_date.gte': start_date,
                'primary_release_date.lte': end_date
            }
            try:
                response = requests.get(DISCOVER_ENDPOINT, params=params, timeout=10)
                response.raise_for_status()
                movies = response.json().get('results', [])
            except (requests.RequestException, ValueError) as exc:
                # Give up on this date window, but say so. Only the exception
                # type is printed because the message can include the request
                # URL and with it the API key.
                print(f"Stopping {start_date}..{end_date} at page {page}: {type(exc).__name__}")
                break

            if not movies:
                break

            for m in movies:
                movie_id = m.get('id')
                title = (m.get('title') or '').strip()
                overview = (m.get('overview') or '').strip()
                genre_ids = m.get('genre_ids', [])
                poster_path = m.get('poster_path')

                if not (movie_id and title and overview and genre_ids and poster_path):
                    continue
                if movie_id in seen_ids:
                    continue

                selected_gids = _select_genre_ids(genre_ids, genre_tag_counts)
                if not selected_gids:
                    continue

                collected_movies.append({
                    'id': movie_id,
                    'title': title,
                    'overview': overview,
                    'genre_ids': selected_gids,
                    'poster_url': f"{IMAGE_BASE_URL}{poster_path}"
                })
                seen_ids.add(movie_id)

                for gid in selected_gids:
                    genre_tag_counts[gid] += 1

            if quotas_filled():
                break

            # Pause between pages to stay gentle on the API.
            time.sleep(0.25)

    # Only genres that reached the minimum are kept in the written dataset.
    final_genres = {gid for gid, count in genre_tag_counts.items() if count >= MIN_PER_GENRE}

    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['id', 'title', 'overview', 'genres', 'poster_url'])

        for movie in collected_movies:
            filtered_genres = [genre_mapping[gid] for gid in movie['genre_ids'] if gid in final_genres]
            if not filtered_genres:
                continue
            writer.writerow([movie['id'], movie['title'], movie['overview'], ', '.join(filtered_genres), movie['poster_url']])

    # Single summary of the genres that made it into the file.
    for gid in sorted(final_genres):
        print(f"{genre_mapping[gid]}: {genre_tag_counts[gid]}")

# Start the crawl only when this code is executed directly, not when imported.
if __name__ == "__main__":
    fetch_balanced_dataset()


In [None]:
import pandas as pd

# Read the movie CSV into a DataFrame and print its first rows.
# NOTE(review): the crawl cell writes 'movies_dataset.csv', not this file -
# confirm where 'strictly_balanced_movies.csv' comes from.
df = pd.read_csv('strictly_balanced_movies.csv')
first_rows = df.head()
print(first_rows)


In [None]:
# Number of null values in each column.
null_counts = df.isnull().sum()
print(null_counts)

# A row counts as incomplete when any field is null or an empty string.
has_null = df.isnull().any(axis=1)
has_blank = (df == '').any(axis=1)
missing_rows = df[has_null | has_blank]
print("rows with missing data:")
print(missing_rows)


In [None]:
print(f"Total movies: {len(df)}")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that added a stray "None" line).
df.info()


In [None]:
from collections import Counter

# Split each comma-separated genre string into a list of trimmed names,
# leaving out rows that have no genre value.
all_genres = df['genres'].dropna().apply(
    lambda cell: [part.strip() for part in cell.split(',')]
)

# Tally how often every genre name occurs across all rows.
genre_counts = Counter()
for names in all_genres:
    genre_counts.update(names)

print(f"unique genres: {len(genre_counts)}")
print("genre distribution:")
for name, total in genre_counts.most_common():
    print(f"{name}: {total}")


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-genre tallies held in genre_counts.
genre_names = list(genre_counts.keys())
genre_totals = list(genre_counts.values())

plt.figure(figsize=(10, 6))
plt.bar(genre_names, genre_totals)
plt.title('Genre Distribution')
plt.ylabel('Number of Movies')
# Slant the genre labels so long names do not run into each other.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import requests
import os
from tqdm import tqdm

# Only URLs starting with this prefix are downloaded.
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"
# Directory that receives one "<movie id>.jpg" file per poster.
POSTER_DIR = "movie_posters"

df = pd.read_csv("strictly_balanced_movies.csv")

os.makedirs(POSTER_DIR, exist_ok=True)

# Ids whose poster could not be fetched or saved, kept for later inspection.
failed_ids = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="📥 Downloading Posters"):
    poster_url = row['poster_url']
    movie_id = row['id']

    # Skip missing values and anything that is not a TMDB poster URL. The
    # isinstance check also guards against non-string cells, which would
    # otherwise raise on .startswith().
    if not (isinstance(poster_url, str) and poster_url.startswith(IMAGE_BASE_URL)):
        continue

    target = os.path.join(POSTER_DIR, f"{movie_id}.jpg")
    # Posters fetched by an earlier run are reused, so re-running is cheap.
    if os.path.exists(target) and os.path.getsize(target) > 0:
        continue

    try:
        response = requests.get(poster_url, timeout=10)
        if response.status_code != 200:
            print(f"fail: movie {movie_id}: HTTP {response.status_code} for {poster_url}")
            failed_ids.append(movie_id)
            continue

        # Write to a temporary name first so an interrupted run never leaves a
        # truncated file under the final name.
        partial = target + ".part"
        with open(partial, "wb") as f:
            f.write(response.content)
        os.replace(partial, target)
    except (requests.RequestException, OSError) as exc:
        print(f"error: movie {movie_id}: {type(exc).__name__}: {exc}")
        failed_ids.append(movie_id)

if failed_ids:
    print(f"{len(failed_ids)} poster(s) could not be downloaded")
