In [1]:
# Import the libraries needed to scrape the web page and to store the
# collected data in an Excel file

import requests # to access the website/webpage
from bs4 import BeautifulSoup # to parse the HTML source code of the webpage
import openpyxl # to save the collected raw data into an excel file
import pandas as pd # for data analysis

In [2]:
# Build a new workbook and take its default (active) worksheet as the
# destination for the scraped rows.
excel = openpyxl.Workbook()
sheet = excel.active

# Give the worksheet a descriptive name.
sheet.title = 'IMDB Top 250 Movies'

# The first row of the sheet holds the column headings.
sheet.append(['Movie Rank', 'Movie Name', 'Year of Release', 'IMDB Rating'])

In [3]:
# Scrape the IMDB Top 250 chart and append one row per movie to `sheet`.
# Any failure is reported (not raised) so that the workbook is still written
# below with whatever rows were collected up to that point.
try:
    # Without a timeout the request can block forever if the server never
    # responds; 30 seconds is a generous upper bound for a single page.
    source = requests.get('https://www.imdb.com/chart/top/', timeout=30)

    # Turn an HTTP error status into an exception instead of parsing an
    # error page.
    source.raise_for_status()

    # Parse the HTML text of the response.
    parsed_html_code = BeautifulSoup(source.text, 'html.parser')

    # Each 'tr' inside the "lister-list" table body holds the data about one
    # movie.
    movies = parsed_html_code.find('tbody', class_="lister-list").find_all('tr')

    for movie in movies:
        # Rank, name and year are all read from the same cell: look it up once.
        title_cell = movie.find('td', class_="titleColumn")

        # The text before the first '.' in the cell is taken as the rank.
        rank = int(title_cell.get_text(strip=True).split('.')[0])

        name = title_cell.find('a').text

        # The year is wrapped in parentheses; remove them before converting.
        year = int(title_cell.find('span', class_="secondaryInfo").text.strip("()"))

        rating = float(movie.find('td', class_="ratingColumn imdbRating").find('strong').text)

        # Numeric values are written as numbers so the workbook does not end
        # up with numbers stored as text.
        sheet.append([rank, name, year, rating])

except Exception as e:
    # Include the exception type: a bare message such as
    # "'NoneType' object has no attribute ..." is hard to interpret alone.
    print(f"Scraping failed ({type(e).__name__}): {e}")


excel.save('IMDB Top 250 Movies.xlsx')

In [4]:
# Read the saved workbook into a DataFrame for analysis.
top_250_imdb_movies = pd.read_excel(io='IMDB Top 250 Movies.xlsx')

In [5]:
# Display the loaded table as this cell's output.
top_250_imdb_movies

Unnamed: 0,Movie Rank,Movie Name,Year of Release,IMDB Rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.2
2,3,The Dark Knight,2008,9.0
3,4,The Godfather Part II,1974,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Aladdin,1992,8.0
246,247,Jai Bhim,2021,8.0
247,248,Gandhi,1982,8.0
248,249,The Help,2011,8.0


In [17]:
# Show the dtype pandas assigned to each column of the loaded table.
top_250_imdb_movies.dtypes

Movie Rank           int64
Movie Name          object
Year of Release      int64
IMDB Rating        float64
dtype: object