In [67]:
# Install the scraping/analysis dependencies into the *kernel's* environment.
# The explicit %pip magic does not rely on IPython automagic being enabled.
%pip install requests beautifulsoup4 pandas




In [69]:
import html
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [73]:
url = "https://www.imdb.com/chart/top/"

# Plain request with no custom headers (the recorded run of this cell got a
# 403). The timeout bounds the wait so a stalled connection cannot hang the
# kernel forever; requests has no default timeout.
response = requests.get(url, timeout=10)

# Check status
print("Status Code:", response.status_code)

Status Code: 403


In [75]:
url = "https://www.imdb.com/chart/top/"

# Send a browser-like User-Agent; the header-less request in the previous cell
# was answered with 403.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Bound the wait: requests has no default timeout and would otherwise block
# indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)

print("Status Code:", response.status_code)

# Stop here on a 4xx/5xx answer instead of letting later cells parse an
# error page as if it were the chart.
response.raise_for_status()

Status Code: 200


In [77]:
# Build the parse tree from the response body using the "html.parser" backend.
page_html = response.text
soup = BeautifulSoup(page_html, "html.parser")

# Sanity check: the <title> text shows which page was actually received.
page_title = soup.title.text
print(page_title)

IMDb Top 250 movies


In [79]:
# Table-layout selectors: one title cell and one rating element per movie.
# (In the recorded run neither selector matched anything, so `data` is empty.)
movies = soup.select("td.titleColumn")
ratings = soup.select("td.imdbRating strong")

# Pair every title cell with its rating element and flatten each pair into
# [title, year, rating]; the year has its surrounding parentheses stripped.
data = [
    [title_cell.a.text, title_cell.span.text.strip("()"), rating_cell.text]
    for title_cell, rating_cell in zip(movies, ratings)
]

print("Total movies found:", len(data))
print(data[:5])  # show first 5 movies

Total movies found: 0
[]


In [81]:
# Inspect the beginning of the markup to see what the server really sent.
markup_preview = soup.prettify()[:2000]  # first 2000 characters of HTML
print(markup_preview)

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1759218397900);
        }
    })
  </script>
  <title>
   IMDb Top 250 movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="0cadf7898134e79b" name="google-site-verification"/>
  <meta content="C1DACEF2769068C0B0D2687C9

In [95]:
# Locate the JSON-LD <script>, which carries the chart as structured data.
json_ld_tag = soup.find("script", type="application/ld+json")
if json_ld_tag is None or not json_ld_tag.string:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(
        "No JSON-LD <script> block found in the page; the layout may have changed."
    )
json_data = json_ld_tag.string

# Load it into Python
data = json.loads(json_data)

# Extract movies
movies = data["itemListElement"]

movies_list = []
for item in movies:
    movie = item["item"]
    # Titles arrive HTML-escaped in the JSON-LD (e.g. "Schindler&apos;s List"),
    # so decode the entities before storing them.
    title = html.unescape(movie["name"])
    # Use a distinct name so the notebook-level `url` (the chart page) is not
    # overwritten by the last movie's link.
    movie_url = movie["url"]
    image = movie.get("image")

    # Extract rating if available; `or {}` also covers an explicit null value.
    rating = (movie.get("aggregateRating") or {}).get("ratingValue")

    movies_list.append([title, movie_url, image, rating])

# Convert to DataFrame
df = pd.DataFrame(movies_list, columns=["Title", "URL", "Image", "Rating"])
df.head(10)

Unnamed: 0,Title,URL,Image,Rating
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,https://m.media-amazon.com/images/M/MV5BMDAyY2...,9.3
1,The Godfather,https://www.imdb.com/title/tt0068646/,https://m.media-amazon.com/images/M/MV5BZTg0Zj...,9.2
2,The Dark Knight,https://www.imdb.com/title/tt0468569/,https://m.media-amazon.com/images/M/MV5BMTMxNT...,9.1
3,The Godfather Part II,https://www.imdb.com/title/tt0071562/,https://m.media-amazon.com/images/M/MV5BNDNkZD...,9.0
4,12 Angry Men,https://www.imdb.com/title/tt0050083/,https://m.media-amazon.com/images/M/MV5BYjE4Nz...,9.0
5,The Lord of the Rings: The Return of the King,https://www.imdb.com/title/tt0167260/,https://m.media-amazon.com/images/M/MV5BMTZkMj...,9.0
6,Schindler&apos;s List,https://www.imdb.com/title/tt0108052/,https://m.media-amazon.com/images/M/MV5BNjM1ZD...,9.0
7,The Lord of the Rings: The Fellowship of the Ring,https://www.imdb.com/title/tt0120737/,https://m.media-amazon.com/images/M/MV5BNzIxMD...,8.9
8,Pulp Fiction,https://www.imdb.com/title/tt0110912/,https://m.media-amazon.com/images/M/MV5BYTViYT...,8.8
9,"Il buono, il brutto, il cattivo",https://www.imdb.com/title/tt0060196/,https://m.media-amazon.com/images/M/MV5BMWM5Zj...,8.8


In [97]:
def make_clickable(val):
    """Render a URL cell as an HTML "Link" anchor that opens in a new tab.

    Args:
        val: The link target; converted with ``str()`` before use.

    Returns:
        str: An ``<a>`` element whose ``href`` holds the HTML-escaped value.
    """
    # The value originates from scraped page data, so escape it before placing
    # it inside a quoted attribute; a stray quote must not break out of href.
    href = html.escape(str(val), quote=True)
    return f'<a href="{href}" target="_blank">Link</a>'

def make_image(val):
    """Render an image-URL cell as an 80px-wide HTML ``<img>`` element.

    Args:
        val: The image source URL, or a missing value (``None`` / NaN).

    Returns:
        str: An ``<img>`` element with the HTML-escaped source, or an empty
        string when the value is missing.
    """
    # The image field is optional upstream; `val != val` is true only for NaN.
    # Render nothing rather than a broken <img src="None">.
    if val is None or val != val:
        return ""
    # Escape the scraped value so it cannot break out of the quoted attribute.
    src = html.escape(str(val), quote=True)
    return f'<img src="{src}" width="80">'

# Style the first ten rows: URLs become links, image URLs become inline
# images, and ratings are shown with one decimal place.
# na_rep makes missing values (e.g. a movie without a rating) display as "N/A"
# instead of being handed to the per-column formatters.
styled_df = df.head(10).style.format(
    {
        "URL": make_clickable,
        "Image": make_image,
        "Rating": "{:.1f}",
    },
    na_rep="N/A",
)
styled_df

Unnamed: 0,Title,URL,Image,Rating
0,The Shawshank Redemption,Link,,9.3
1,The Godfather,Link,,9.2
2,The Dark Knight,Link,,9.1
3,The Godfather Part II,Link,,9.0
4,12 Angry Men,Link,,9.0
5,The Lord of the Rings: The Return of the King,Link,,9.0
6,Schindler's List,Link,,9.0
7,The Lord of the Rings: The Fellowship of the Ring,Link,,8.9
8,Pulp Fiction,Link,,8.8
9,"Il buono, il brutto, il cattivo",Link,,8.8
