In [1]:
import requests #importing all the libraries (for http request/parsing html / sorting & manipulating dataframe
from bs4 import BeautifulSoup
import pandas as pd
import json

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
} #headers to mimic browser request
url ="https://www.imdb.com/chart/top/" #url for imdb top 25 movie chart
response = requests.get(url, headers=headers)# Sending an HTTP GET request with the headers
response.status_code
if response.status_code == 200: # setting status code 200
    print ("successfully fetched the page")
else:
    print (f"Error retrieving, status code: {response.status_code}")

successfully fetched the page


In [3]:
soup = BeautifulSoup(response.content, 'html.parser')#parsing the html content using BeautifulSoup
soup
print(soup.prettify()[:2000]) 

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1749849836820);
        }
    })
  </script>
  <title>
   IMDb Top 250 movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="0cadf7898134e79b" name="google-site-verification"/>
  <meta content="C1DACEF2769068C0B0D2687C9

In [4]:
json_data=soup.find("script",type="application/ld+json")#parsing json data
if json_data:
    data = json.loads(json_data.string)

In [5]:
titles =[] #creating empty list to hold extreacted details
url =[]
descriptions =[]
best_ratings =[]
worst_ratings =[]
ratings =[]
genres =[]
durations =[]

In [6]:
if "itemListElement" in data:#for key existence
    for item in data["itemListElement"]:  #for looping
        movie=item["item"] #to access individual data
        titles.append(movie["name"]) 
        url.append(movie["url"])
        descriptions.append(movie["description"])
        best_ratings.append(movie["aggregateRating"]["bestRating"])
        worst_ratings.append(movie["aggregateRating"]["worstRating"])
        ratings.append(float(movie["aggregateRating"]["ratingValue"]))
        genres.append(movie["genre"])
        durations.append(movie["duration"])

In [9]:
df = pd.DataFrame({ # creating dataframe
    "Title": titles,
    "URL": url,
    "Description": descriptions,
    "Best rating": best_ratings,
    "Worst rating": worst_ratings,
    "Rating": ratings,
    "Genre": genres,
    "Duration": durations
})
df.head(50) # printing first50 rows

Unnamed: 0,Title,URL,Description,Best rating,Worst rating,Rating,Genre,Duration
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,A banker convicted of uxoricide forms a friend...,10,1,9.3,Drama,PT2H22M
1,The Godfather,https://www.imdb.com/title/tt0068646/,The aging patriarch of an organized crime dyna...,10,1,9.2,"Crime, Drama",PT2H55M
2,The Dark Knight,https://www.imdb.com/title/tt0468569/,When a menace known as the Joker wreaks havoc ...,10,1,9.0,"Action, Crime, Drama",PT2H32M
3,The Godfather Part II,https://www.imdb.com/title/tt0071562/,The early life and career of Vito Corleone in ...,10,1,9.0,"Crime, Drama",PT3H22M
4,12 Angry Men,https://www.imdb.com/title/tt0050083/,The jury in a New York City murder trial is fr...,10,1,9.0,"Crime, Drama",PT1H36M
5,The Lord of the Rings: The Return of the King,https://www.imdb.com/title/tt0167260/,Gandalf and Aragorn lead the World of Men agai...,10,1,9.0,"Adventure, Drama, Fantasy",PT3H21M
6,Schindler&apos;s List,https://www.imdb.com/title/tt0108052/,"In German-occupied Poland during World War II,...",10,1,9.0,"Biography, Drama, History",PT3H15M
7,Pulp Fiction,https://www.imdb.com/title/tt0110912/,"The lives of two mob hitmen, a boxer, a gangst...",10,1,8.8,"Crime, Drama",PT2H34M
8,The Lord of the Rings: The Fellowship of the Ring,https://www.imdb.com/title/tt0120737/,A meek Hobbit from the Shire and eight compani...,10,1,8.9,"Adventure, Drama, Fantasy",PT2H58M
9,"Il buono, il brutto, il cattivo",https://www.imdb.com/title/tt0060196/,A bounty-hunting scam joins two men in an unea...,10,1,8.8,"Adventure, Drama, Western",PT2H58M


In [10]:
df.to_csv("IMDb_top_250.csv", index = False) #saving the file into csv format
print ("Data saved to 'IMDb_top_250.csv'")

Data saved to 'IMDb_top_250.csv'
