In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# --- Step 1: collect the title and detail-page URL of every film on the
# --- IMDb "top_100" search listing.

# CSS selector for a single result card. It doubles as the "page has rendered"
# signal for the explicit wait and as the selector used to enumerate cards.
RESULT_CARD_SELECTOR = "li.ipc-metadata-list-summary-item"
# Values sent as the `start` query parameter, one request per entry.
PAGE_STARTS = [1, 51]
# Maximum number of seconds to wait for the first result card to appear.
PAGE_LOAD_TIMEOUT_S = 20

# The same browser session is reused by the detail-page cell further down,
# which calls driver.quit() once it is done.
driver = webdriver.Chrome()

movies = []
seen_urls = set()

try:
    for start in PAGE_STARTS:
        url = (
            "https://www.imdb.com/search/title/"
            "?groups=top_100&sort=user_rating,desc"
            f"&start={start}"
        )

        driver.get(url)

        # Block until at least one result card is present in the DOM;
        # raises a timeout error if none shows up in time.
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, RESULT_CARD_SELECTOR)
            )
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")

        for card in soup.select(RESULT_CARD_SELECTOR):
            title = card.select_one("h3.ipc-title__text")
            link = card.select_one("a.ipc-title-link-wrapper")

            # Cards without a usable link cannot be visited later, so skip them.
            href = link.get("href") if link is not None else None
            if not href:
                continue

            # Drop the tracking query string so the URL is canonical.
            movie_url = "https://www.imdb.com" + href.split("?")[0]

            # Skip URLs already collected so that an overlapping or repeated
            # results page cannot produce duplicate rows downstream.
            if movie_url in seen_urls:
                continue
            seen_urls.add(movie_url)

            movies.append({
                "title": title.get_text(strip=True) if title else None,
                "movie_url": movie_url
            })
except BaseException:
    # Do not leave an orphaned browser process behind when the listing scrape
    # fails or is interrupted; the original error is re-raised unchanged.
    driver.quit()
    raise

print(f"Movies found: {len(movies)}")


Movies found: 100


In [None]:
# --- Step 2: visit each film's detail page and extract one summary row per
# --- film (movie_rows) plus one row per billed actor (cast_rows).

# Fixed pause, in seconds, after each navigation before the page source is read.
# NOTE(review): presumably this gives the page time to render; an explicit
# wait on a known element would be more reliable than a fixed sleep.
PAGE_SETTLE_SECONDS = 2
# Only the first N actor links found on a page are recorded.
MAX_CAST_PER_MOVIE = 10


def _imdb_id_from_url(movie_url):
    """Return the IMDb title id (the ``tt<digits>`` path segment) of a URL.

    Looks for the id by shape rather than by position so an extra leading
    path segment does not shift the result. If no such segment exists, falls
    back to the fifth "/"-separated piece, which is what a URL of the form
    ``https://www.imdb.com/title/<id>/`` yields.
    """
    segments = movie_url.split("/")
    for segment in segments:
        if segment.startswith("tt") and segment[2:].isdigit():
            return segment
    return segments[4]


def _texts(soup, selector):
    """Return the stripped text of every element matching ``selector``."""
    return [tag.get_text(strip=True) for tag in soup.select(selector)]


movie_rows = []
cast_rows = []

try:
    for movie in movies:
        driver.get(movie["movie_url"])
        time.sleep(PAGE_SETTLE_SECONDS)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        imdb_id = _imdb_id_from_url(movie["movie_url"])

        # Rating: first matching element, or None when the page has none.
        rating_tag = soup.select_one("span.ipc-rating-star--rating")
        rating = rating_tag.get_text(strip=True) if rating_tag else None

        # Genres: chip labels whose link points under /interest/.
        genres = _texts(soup, 'a[href^="/interest/"] span.ipc-chip__text')

        # Languages
        languages = _texts(soup, 'li[data-testid="title-details-languages"] a')

        # Country of origin
        country = _texts(soup, 'li[data-testid="title-details-origin"] a')

        # Multi-valued fields are flattened to comma-separated strings.
        movie_rows.append({
            "imdb_id": imdb_id,
            "title": movie["title"],
            "rating": rating,
            "genres": ", ".join(genres),
            "languages": ", ".join(languages),
            "country_origin": ", ".join(country),
            "movie_url": movie["movie_url"]
        })

        # Top cast: billing_order is the 1-based position of the actor link
        # among the first MAX_CAST_PER_MOVIE matches on the page.
        cast_tags = soup.select(
            'a[data-testid="title-cast-item__actor"]'
        )[:MAX_CAST_PER_MOVIE]

        for billing_order, actor in enumerate(cast_tags, start=1):
            cast_rows.append({
                "imdb_id": imdb_id,
                "actor_name": actor.get_text(strip=True),
                "billing_order": billing_order
            })
finally:
    # Always close the browser, including when a page fails part-way through,
    # so no browser process is left running. Rows gathered so far are kept.
    driver.quit()



In [22]:
# --- Step 3: turn the scraped records into DataFrames and write them to CSV.

# Column order of each output file. Passing these explicitly means the CSVs
# still get a header row when the corresponding record list is empty.
MOVIE_COLUMNS = [
    "imdb_id",
    "title",
    "rating",
    "genres",
    "languages",
    "country_origin",
    "movie_url",
]
CAST_COLUMNS = ["imdb_id", "actor_name", "billing_order"]

movies_df = pd.DataFrame(movie_rows, columns=MOVIE_COLUMNS)
cast_df = pd.DataFrame(cast_rows, columns=CAST_COLUMNS)

# index=False: the positional row index carries no information, so omit it.
movies_df.to_csv("movies.csv", index=False)
cast_df.to_csv("cast.csv", index=False)

print("Saved movies.csv and cast.csv")


Saved movies.csv and cast.csv
