In [12]:
import requests
from bs4 import BeautifulSoup
import csv 
import bs4  
import pandas as pd 
import time  
from urllib.parse import urljoin  


In [13]:

# Selenium pieces used by this cell to drive Chrome.
# `time` and `BeautifulSoup` are already imported in the notebook's import cell.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# URL of the IMDb top 250 movies page
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

# Seconds to sleep after navigation before the page source is captured
TIMEOUT = 5

# Chrome options: start maximized and switch off notifications, popup
# blocking, infobars and extensions.
options = Options()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
# NOTE(review): value 2 presumably means "block" for the notifications content
# setting -- confirm against the Chrome preference documentation.
options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2})

# Report which page is about to be fetched
print(f"Retrieving web page URL '{url}'")

# Launch Chrome and grab the rendered HTML. The try/finally guarantees the
# browser is shut down even when navigation or the capture raises, so a failed
# run does not leave a Chrome process behind.
driver = webdriver.Chrome(options=options)
try:
    driver.get(url)

    # Fixed delay to give the page time to load
    time.sleep(TIMEOUT)

    # HTML source of the loaded page
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML; the browser is no longer needed at this point
soup = BeautifulSoup(html, 'html.parser')

# Report completion of the retrieval step
print(f"Done")


Retrieving web page URL 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
Done


In [14]:

# Define a function to extract movie titles from the IMDb page
def get_movie_titles(soup):
    """Return the title text of every movie entry found in ``soup``.

    Args:
        soup: Parsed chart page. Only ``find_all`` on the page and ``find`` /
            ``.text`` on the returned tags are used.

    Returns:
        list[str]: One string per matched ``li`` tag, in the order returned by
        ``find_all``. An entry without an ``h3`` yields "Not Provided" instead
        of being dropped, so the result stays index-aligned with other lists
        built from the same ``li`` tags.
    """
    # NOTE(review): 'sc-59b6048d-0' and 'jemTre' look like build-generated
    # class names -- confirm they still match the live page.
    li_tags = soup.find_all('li', {'class': 'ipc-metadata-list-summary-item sc-59b6048d-0 jemTre cli-parent'})

    movie_titles = []
    for li in li_tags:
        # Look the heading up once per entry and keep a placeholder when it is
        # missing, rather than silently shortening the list.
        h3 = li.find('h3')
        movie_titles.append(h3.text.strip() if h3 is not None else "Not Provided")

    return movie_titles


In [15]:

# Define a function to extract movie release years, durations, and ratings
def get_year_duration_rating(soup):
    """Extract the release year, duration and rating of every movie entry.

    The three values are read positionally from the metadata ``span`` tags of
    each entry: first span -> year, second -> duration, third -> rating.

    Args:
        soup: Parsed chart page. Only ``find_all`` on the page and on the
            returned tags, plus ``.text`` on the spans, are used.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(release_years, durations,
        ratings)``, each with one string per matched ``li`` tag. Any value
        whose span is missing is reported as "Not Provided"; this includes the
        year when an entry has no metadata spans at all.
    """
    missing = "Not Provided"

    # NOTE(review): 'sc-59b6048d-0' / 'jemTre' (and 'sc-4dcdad14-8' / 'cvucyi'
    # below) look like build-generated class names -- confirm they still match.
    li_tags_with_info = soup.find_all('li', {'class': 'ipc-metadata-list-summary-item sc-59b6048d-0 jemTre cli-parent'})

    release_years = []
    durations = []
    ratings = []

    for li in li_tags_with_info:
        span_tags = li.find_all('span', {'class': 'sc-4dcdad14-8 cvucyi cli-title-metadata-item'})

        # Take at most the first three spans and pad with the placeholder, so
        # an entry with zero spans no longer raises IndexError on the year.
        values = [span.text.strip() for span in span_tags[:3]]
        values.extend([missing] * (3 - len(values)))
        release_year, duration, rating = values

        release_years.append(release_year)
        durations.append(duration)
        ratings.append(rating)

    return release_years, durations, ratings


In [16]:

# Define a function to extract IMDb ratings
def get_movie_stars(soup):
    """Collect the IMDb star rating shown for every movie entry.

    Args:
        soup: Parsed chart page. Only ``find_all`` on the page and ``find`` /
            ``.text`` on the returned tags are used.

    Returns:
        list[str]: One string per matched ``li`` tag. The value is the part of
        the rating element's text before the first non-breaking space, or
        "N/A" when the entry has no rating element.
    """
    star_class = 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating'
    entries = soup.find_all('li', {'class': 'ipc-metadata-list-summary-item sc-59b6048d-0 jemTre cli-parent'})

    # Lazily look up the rating element of each entry, in order
    star_elements = (entry.find('span', class_=star_class) for entry in entries)

    # Keep the text before the '\xa0' separator; fall back to "N/A"
    return [
        star.text.split('\xa0')[0] if star else "N/A"
        for star in star_elements
    ]


In [17]:

# Define a function to extract movie URLs
def get_movie_urls(soup):
    """Return the absolute IMDb URL of every movie entry found in ``soup``.

    Args:
        soup: Parsed chart page. Only ``find_all`` on the page and ``find`` /
            ``.attrs`` on the returned tags are used.

    Returns:
        list[str]: One string per matched ``li`` tag, in the order returned by
        ``find_all``. Each value is the entry's ``href`` joined onto
        "https://www.imdb.com". An entry without a title link, or whose link
        has no ``href``, yields "Not Provided" instead of being skipped, so
        the result stays index-aligned with other lists built from the same
        ``li`` tags.
    """
    # NOTE(review): 'sc-59b6048d-0' and 'jemTre' look like build-generated
    # class names -- confirm they still match the live page.
    li_tags_with_info = soup.find_all('li', {'class': 'ipc-metadata-list-summary-item sc-59b6048d-0 jemTre cli-parent'})

    movie_urls = []

    for li in li_tags_with_info:
        a_tag = li.find('a', {'class': 'ipc-title-link-wrapper'})
        relative_link = a_tag.attrs.get('href') if a_tag is not None else None

        if relative_link is None:
            # Keep a placeholder so a missing link cannot shift later rows
            movie_urls.append("Not Provided")
            continue

        # Resolve the (relative) link against the IMDb base URL
        movie_urls.append(urljoin("https://www.imdb.com", relative_link))

    return movie_urls


In [18]:

# Extract each per-movie column from the parsed chart page
movie_titles = get_movie_titles(soup)
release_years, durations, ratings = get_year_duration_rating(soup)
imdb_ratings = get_movie_stars(soup)
movie_urls = get_movie_urls(soup)

# The lists are combined position-by-position into one table afterwards, so
# they must all have the same length. Fail here with the per-column lengths
# rather than later with a less specific error.
_column_lengths = {
    'movie_titles': len(movie_titles),
    'release_years': len(release_years),
    'durations': len(durations),
    'ratings': len(ratings),
    'imdb_ratings': len(imdb_ratings),
    'movie_urls': len(movie_urls),
}
if len(set(_column_lengths.values())) > 1:
    raise ValueError(f"Scraped columns have different lengths: {_column_lengths}")


In [19]:

# Name of the CSV file the table is written to
csv_file = 'movie_data.csv'

# Assemble the table: column headers paired, in order, with the scraped lists
df = pd.DataFrame(dict(zip(
    [
        'Movie Title',
        'Movie Release Year',
        'IMDb Rating',
        'Movie Duration',
        'Movie Rating (MPA)',
        'Movie URL IMDb',
    ],
    [
        movie_titles,
        release_years,
        imdb_ratings,
        durations,
        ratings,
        movie_urls,
    ],
)))

# Write the table to disk without the index column
df.to_csv(csv_file, index=False)

# Confirm that the file was written
print(f'CSV file "{csv_file}" created successfully.')


CSV file "movie_data.csv" created successfully.
