# Web Scraping Practice

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# The URL of the IMDb Top 250 page
url = "https://www.imdb.com/chart/top/"

# Seconds to wait after each scroll so lazily loaded content has time to render
SCROLL_PAUSE_SECONDS = 2
# Upper bound on scroll passes, so a page whose height never settles cannot hang the cell
MAX_SCROLL_ROUNDS = 30

# Set up the Selenium WebDriver
options = webdriver.ChromeOptions()
# Add the language preference to the browser options
options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
# options.add_argument('--headless') # Uncomment this to run the browser in the background
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Everything that talks to the browser runs inside try/finally so the Chrome
# process is always shut down, even when navigation or scrolling raises.
try:
    print("Opening the page...")
    driver.get(url)

    # Scroll down to the bottom of the page multiple times to load all the movies
    print("Scrolling to load all 250 movies...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(MAX_SCROLL_ROUNDS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SECONDS)  # Wait for the new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page height stopped growing: nothing more is being loaded
            break
        last_height = new_height
    else:
        # The loop ran out of rounds without the height settling
        print(f"Warning: page height was still changing after {MAX_SCROLL_ROUNDS} scrolls; the list may be incomplete.")

    # Get the full page source after all content has loaded
    full_html_source = driver.page_source
finally:
    # Close the browser
    driver.quit()

print("Scraping complete, browser closed.")

Opening the page...
Scrolling to load all 250 movies...
Scraping complete, browser closed.


In [3]:
# Build a navigable BeautifulSoup tree from the HTML captured by Selenium,
# using Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=full_html_source, features="html.parser")

In [4]:
# Locate the <ul> that holds the chart entries. find() returns None when no
# element matches, so check explicitly instead of failing with an opaque
# AttributeError on the chained call.
chart_list = soup.find('ul', class_='ipc-metadata-list--dividers-between')
if chart_list is None:
    raise ValueError(
        "Could not find the movie list (<ul class='ipc-metadata-list--dividers-between'>) "
        "in the page source; the page may not have loaded or its layout may have changed."
    )

# Every <li> below that list is treated as one movie entry
film_list = chart_list.find_all('li')
film_list

[<li class="ipc-metadata-list-summary-item"><div class="ipc-metadata-list-summary-item__c"><div class="ipc-metadata-list-summary-item__tc"><span aria-disabled="false" class="ipc-metadata-list-summary-item__t ipc-btn--not-interactable"></span><div class="sc-ec40e84d-1 dwYbao cli-parent li-compact"><div class="sc-ec40e84d-0 dTHKNo"><div class="sc-d0224b4e-0 jfogmY cli-poster-container"><div class="ipc-poster ipc-poster--base ipc-poster--media-radius ipc-poster--wl-true ipc-poster--dynamic-width ipc-sub-grid-item ipc-sub-grid-item--span-2" role="group"><div class="ipc-media ipc-media--poster-27x40 ipc-image-media-ratio--poster-27x40 ipc-media--media-radius ipc-media--base ipc-media--poster-s ipc-poster__poster-image ipc-media__img" style="width:100%"><img alt="Tim Robbins in The Shawshank Redemption (1994)" class="ipc-image" loading="lazy" sizes="50vw, (min-width: 480px) 34vw, (min-width: 600px) 26vw, (min-width: 1024px) 16vw, (min-width: 1280px) 16vw" src="https://m.media-amazon.com/imag

In [5]:
def _text_or_none(parent, tag_name, css_class):
    """Return the text of the first `tag_name` descendant of `parent` carrying
    `css_class`, or None when no such element exists."""
    node = parent.find(tag_name, class_=css_class)
    return node.text if node is not None else None


def _strip_rank(heading):
    """Drop a leading "<number>. " rank prefix from a chart heading.

    Headings without a numeric prefix are returned unchanged, so a title that
    merely contains ". " (and has no rank) is not truncated.
    """
    rank, separator, title = heading.partition(". ")
    if separator and rank.strip().isdigit():
        return title
    return heading


# Create empty lists to store movie data
names = []
years = []
ratings = []

# Loop through each movie item and extract its data.
# A missing element yields None for that field instead of aborting the whole loop.
for film in film_list:
    heading = _text_or_none(film, "h3", "ipc-title__text")
    name = _strip_rank(heading) if heading is not None else None
    # First metadata item of the entry; kept as the raw text (a string)
    year = _text_or_none(film, "span", "cli-title-metadata-item")
    # Raw text of the rating element; kept unparsed
    rating = _text_or_none(film, "span", "ipc-rating-star")

    # Append the data to the respective lists
    names.append(name)
    years.append(year)
    ratings.append(rating)

# Create a dictionary from the lists
movie_data = {"Movie_Name": names, "Production_Year": years, "IMDB_Rating": ratings}

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(movie_data)

# Set the index to start from 1 so it matches the chart position
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

# Print the DataFrame
print(df)

                                       Movie_Name Production_Year IMDB_Rating
1                        The Shawshank Redemption            1994  9.3 (3.1M)
2                                   The Godfather            1972  9.2 (2.1M)
3                                 The Dark Knight            2008  9.1 (3.1M)
4                           The Godfather Part II            1974  9.0 (1.4M)
5                                    12 Angry Men            1957  9.0 (941K)
..                                            ...             ...         ...
246                                 Groundhog Day            1993  8.0 (722K)
247                                      The Help            2011  8.1 (517K)
248  Gekijô-ban Kimetsu no Yaiba Mugen Ressha-hen            2020   8.2 (87K)
249                                      Drishyam            2015  8.2 (103K)
250                            Gangs of Wasseypur            2012  8.2 (109K)

[250 rows x 3 columns]


In [6]:
# Show the last ten rows of the chart
df.iloc[-10:]

Unnamed: 0,Movie_Name,Production_Year,IMDB_Rating
241,Ah-ga-ssi,2016,8.1 (192K)
242,La battaglia di Algeri,1966,8.1 (72K)
243,The Grapes of Wrath,1940,8.1 (106K)
244,To Be or Not to Be,1942,8.1 (48K)
245,Into the Wild,2007,8.0 (683K)
246,Groundhog Day,1993,8.0 (722K)
247,The Help,2011,8.1 (517K)
248,Gekijô-ban Kimetsu no Yaiba Mugen Ressha-hen,2020,8.2 (87K)
249,Drishyam,2015,8.2 (103K)
250,Gangs of Wasseypur,2012,8.2 (109K)


In [7]:
# Order the movies from oldest to newest, listing rows with a missing year first.
# Row-wise, ascending sorting is the pandas default, so only the non-default
# option is spelled out.
df.sort_values("Production_Year", na_position="first")

Unnamed: 0,Movie_Name,Production_Year,IMDB_Rating
138,The Kid,1921,8.2 (143K)
204,Sherlock Jr.,1924,8.1 (63K)
202,The Gold Rush,1925,8.1 (125K)
208,The General,1926,8.1 (104K)
119,Metropolis,1927,8.3 (196K)
...,...,...,...
44,Spider-Man: Across the Spider-Verse,2023,8.5 (472K)
120,Oppenheimer,2023,8.3 (921K)
176,The Wild Robot,2024,8.2 (191K)
217,Maharaja,2024,8.4 (72K)
