# Scraping IMDB Top 250 Movies

### Initial Scraping From IMDB Steps below: 
### URL: https://www.imdb.com/chart/top/

In [3]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By # used to import different ways to access data in the XML or HTML file
from selenium.webdriver.chrome.service import Service # no longer need to download a driver file, use service
from webdriver_manager.chrome import ChromeDriverManager # used to manage the Chrome driver to emulate a Chrome web browser

In [4]:
# Initialize Selenium WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# List to store scraped data
Top_250_movies_scraped_raw = []

url = f"https://www.imdb.com/chart/top/?ref_=nv_mv_250&sort=user_rating%2Cdesc"
driver.get(url)
driver.maximize_window()

In [5]:
## MWC Working - title, year, rating, runtime, and url

top_250_scraped=[]
movies_scraped = driver.find_elements(By.CLASS_NAME, 'ipc-metadata-list-summary-item')

for movie in movies_scraped:
    # Scrape the movie title
    movie_title = movie.find_element(By.CLASS_NAME, 'ipc-title__text').text.strip()
    
    # Scrape the year
    try:
        year_element = movie.find_element(By.XPATH, ".//span[contains(@class, 'cli-title-metadata-item')][1]")
        movie_year = year_element.text.strip()
    except Exception:
        movie_year = "N/A"  # Handle cases where the year is not found

    # Scrape the runtime
    try:
        metadata_elements = movie.find_elements(By.XPATH, ".//span[contains(@class, 'cli-title-metadata-item')]")
        if len(metadata_elements) > 1:  # Assuming runtime is the second span
            movie_runtime = metadata_elements[1].text.strip()
        else:
            movie_runtime = "N/A"
    except Exception:
        movie_runtime = "N/A"  # Handle cases where the runtime is not found

    # Scrape the rating
    try:
        rating_element = movie.find_element(By.XPATH, ".//span[contains(@class, 'ipc-rating-star--rating')]")
        movie_rating = rating_element.text.strip()
    except Exception:
        movie_rating = "N/A"  # Handle cases where the rating is not found

    # Scrape the URL
    try:
        link_element = movie.find_element(By.XPATH, ".//a[@class='ipc-title-link-wrapper']")
        movie_url = link_element.get_attribute('href')
    except Exception:
        movie_url = "N/A"  # Handle cases where the link is not found

    print(f"Title: {movie_title}, Year: {movie_year}, Runtime: {movie_runtime}, Rating: {movie_rating}, URL: {movie_url}")


    top_250_scraped.append({
            "title": movie_title,
            "year": movie_year,
            "runtime": movie_runtime,
            "rating": movie_rating,
            "url": movie_url
        })

print(top_250_scraped)

# convert lists to a pandas datadataframe
print("building the dataframe")
top_250_df = pd.DataFrame(top_250_scraped)

# # perist data in a CSV file
print("saving the CSV")
top_250_df.to_csv("top_250_raw.csv", header=True, index=False, sep=",", encoding='utf-8')




Title: 1. The Shawshank Redemption, Year: 1994, Runtime: 2h 22m, Rating: 9.3, URL: https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1
Title: 2. The Godfather, Year: 1972, Runtime: 2h 55m, Rating: 9.2, URL: https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2
Title: 3. The Dark Knight, Year: 2008, Runtime: 2h 32m, Rating: 9.0, URL: https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3
Title: 6. The Lord of the Rings: The Return of the King, Year: 2003, Runtime: 3h 21m, Rating: 9.0, URL: https://www.imdb.com/title/tt0167260/?ref_=chttp_t_4
Title: 5. 12 Angry Men, Year: 1957, Runtime: 1h 36m, Rating: 9.0, URL: https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5
Title: 7. Schindler's List, Year: 1993, Runtime: 3h 15m, Rating: 9.0, URL: https://www.imdb.com/title/tt0108052/?ref_=chttp_t_6
Title: 4. The Godfather Part II, Year: 1974, Runtime: 3h 22m, Rating: 9.0, URL: https://www.imdb.com/title/tt0071562/?ref_=chttp_t_7
Title: 9. The Lord of the Rings: The Fellowship of the Ring, Year: 200

In [6]:


# List to store results
top_250_details = []

# Loop through the URLs in the DataFrame
for index, url in top_250_df['url'].dropna().items():
    try:
        # Visit the URL
        print(f"Visiting URL: {url}")
        driver.get(url)
        time.sleep(8)  # Allow time for the page to load (adjust as needed)

        # Scrape the popularity score
        try:
            popularity_element = driver.find_element(By.XPATH, "//div[@data-testid='hero-rating-bar__popularity__score']")
            popularity_score = popularity_element.text.strip()
        except Exception:
            popularity_score = "N/A"  # Handle cases where the score is not found

        print(f"Popularity Score: {popularity_score}")

        
        # Scrape the genres
        try:
            genres_element = driver.find_elements(By.XPATH, "//div[@data-testid='genres']//span[@class='ipc-chip__text']")
            genres = [genre.text for genre in genres_element]  # Extract text from all genre elements
        except Exception:
            genres = []  # Handle cases where genres are not found

        print(f"Genres: {', '.join(genres)}")

        
        # Scrape the release date
        try:
            release_date_element = driver.find_element(By.XPATH, "//li[@data-testid='title-details-releasedate']//a[@class='ipc-metadata-list-item__list-content-item']")
            release_date = release_date_element.text.strip()
        except Exception:
            release_date = "N/A"  # Handle cases where the release date is not found

        print(f"Release Date: {release_date}")

       
        # Append results to the list
        top_250_details.append({
            'url': url, 
            'Popularity Score': popularity_score, 
            'Genres': genres, 
            'Release Date': release_date
        })

    except Exception as e:
        print(f"Error visiting {url}: {e}")

    print(f"top 250 movie detail records: {len(top_250_details)}")

# convert lists to a pandas datadataframe
print("building the dataframe")
top_250_details_df = pd.DataFrame(top_250_details)

# # perist data in a CSV file
print("saving the CSV")
top_250_details_df.to_csv("top_250_details_raw.csv", header=True, index=False, sep=",", encoding='utf-8')


Visiting URL: https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1
Popularity Score: 80
Genres: 
Release Date: N/A
top 250 movie detail records: 1
Visiting URL: https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2
Popularity Score: 106
Genres: 
Release Date: N/A
top 250 movie detail records: 2
Visiting URL: https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3
Popularity Score: 146
Genres: 
Release Date: N/A
top 250 movie detail records: 3
Visiting URL: https://www.imdb.com/title/tt0167260/?ref_=chttp_t_4
Popularity Score: 221
Genres: 
Release Date: N/A
top 250 movie detail records: 4
Visiting URL: https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5
Popularity Score: 248
Genres: 
Release Date: N/A
top 250 movie detail records: 5
Visiting URL: https://www.imdb.com/title/tt0108052/?ref_=chttp_t_6
Popularity Score: 263
Genres: 
Release Date: N/A
top 250 movie detail records: 6
Visiting URL: https://www.imdb.com/title/tt0071562/?ref_=chttp_t_7
Popularity Score: 368
Genres: 
Release Date: N

In [29]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Load existing CSV
top_250_df = pd.read_csv("top_250_raw.csv")

# Initialize Selenium WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

# Ensure the columns exist
if "popularity_score" not in top_250_df.columns:
    top_250_df["popularity_score"] = "N/A"
if "genres" not in top_250_df.columns:
    top_250_df["genres"] = "N/A"
if "release_date" not in top_250_df.columns:
    top_250_df["release_date"] = "N/A"

for idx, row in top_250_df.iterrows():
    url = row.get("url", None)
    if not url or pd.isna(url):
        continue
    
    print(f"Visiting URL: {url}")
    driver.get(url)
    time.sleep(8)  # Adjust as needed to ensure the page is fully loaded

    # Scrape the popularity score
    try:
        popularity_element = driver.find_element(By.XPATH, "//div[@data-testid='hero-rating-bar__popularity__score']")
        popularity_score = popularity_element.text.strip()
    except Exception:
        popularity_score = "N/A"

    print(f"Popularity Score: {popularity_score}")

    # Scrape the genres
    try:
        genres_element = driver.find_element(By.XPATH, "//div[@data-testid='interests']")
        # Use the same logic as before: replace newlines with commas
        genres = genres_element.text.replace("\n", ", ")
    except Exception:
        genres = "N/A"

    print(f"Genres: {genres}")

    # Scrape the release date
    try:
        release_date_element = driver.find_element(By.XPATH, "//li[@data-testid='title-details-releasedate']")
        release_date_text = release_date_element.text
        # Remove "Release date" part by splitting on newline
        release_date = release_date_text.split("\n", 1)[1]
        if "(" in release_date:
            release_date = release_date.rsplit(" ", 2)[0]
    except Exception:
        release_date = "N/A"

    print(f"Release Date: {release_date}")

    # Update the DataFrame
    top_250_df.at[idx, "popularity_score"] = popularity_score
    top_250_df.at[idx, "genres"] = genres
    top_250_df.at[idx, "release_date"] = release_date

# Save updated DataFrame to CSV
top_250_df.to_csv("top_250_raw.csv", index=False, encoding='utf-8')
print("Updated CSV saved successfully.")

driver.quit()


Visiting URL: https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1
Popularity Score: 80
Genres: Epic, Period Drama, Prison Drama, Drama
Release Date: October 14, 1994
Visiting URL: https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2


  top_250_df.at[idx, "popularity_score"] = popularity_score
  top_250_df.at[idx, "genres"] = genres
  top_250_df.at[idx, "release_date"] = release_date


Popularity Score: 106
Genres: Epic, Gangster, Tragedy, Crime, Drama
Release Date: March 24, 1972
Visiting URL: https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3
Popularity Score: 146
Genres: Action Epic, Epic, Superhero, Action, Crime, Drama, Thriller
Release Date: July 18, 2008
Visiting URL: https://www.imdb.com/title/tt0167260/?ref_=chttp_t_4
Popularity Score: 221
Genres: Adventure Epic, Epic, Fantasy Epic, Mountain Adventure, Quest, Sword & Sorcery, Tragedy, Adventure, Drama, Fantasy
Release Date: December 17, 2003
Visiting URL: https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5
Popularity Score: 248
Genres: Legal Drama, Psychological Drama, Crime, Drama
Release Date: April 10, 1957
Visiting URL: https://www.imdb.com/title/tt0108052/?ref_=chttp_t_6
Popularity Score: 263
Genres: Docudrama, Epic, Historical Epic, Period Drama, Prison Drama, Biography, Drama, History
Release Date: February 4, 1994
Visiting URL: https://www.imdb.com/title/tt0071562/?ref_=chttp_t_7
Popularity Score: