### Import necessary libraries

In [1]:
import pandas as pd # Import the Pandas library and alias it as 'pd'
import requests # Import the 'requests' library for making HTTP requests

from bs4 import BeautifulSoup # Import the BeautifulSoup class from the 'bs4' (Beautiful Soup 4) library

In [2]:
# Define a User-Agent string in the 'headers' dictionary
# This User-Agent string simulates a web browser (Google Chrome on Windows)
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}

In [3]:
# Define a URL pointing to the IMDb Top 250 Movies chart
url = '''https://www.imdb.com/chart/top/'''

In [4]:
# Send an HTTP GET request to the specified URL using the 'requests' library
# Include the 'headers' dictionary to mimic a web browser user agent 
response = requests.get(url,headers=headers)

# 'response' now contains the HTTP response received from the website
response

<Response [200]>

In [5]:
# Create a BeautifulSoup object by parsing the HTML content of the HTTP response
soup = BeautifulSoup(response.content,"html.parser")

In [6]:
# Create empty lists to store movie names, release years, duration, ratings, and the number of reviews
movie_name = []        # To store movie names
release_year = []      # To store release years
duration_of_movie = [] # To store movie durations
rating = []            # To store movie ratings
num_of_likes = []    # To store the number of likes for each movie

In [7]:
# Use BeautifulSoup's find_all method to locate all elements with the specified class attribute
scraped_movie = soup.find_all("div",class_="sc-6fa21551-0 hyRLjF cli-children")

# 'scraped_movie' now contains a list of elements that match the criteria
scraped_movie[:2]

[<div class="sc-6fa21551-0 hyRLjF cli-children"><div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-6fa21551-9 dKJKsK cli-title"><a class="ipc-title-link-wrapper" href="/title/tt0111161/?ref_=chttp_t_1" tabindex="0"><h3 class="ipc-title__text">1. The Shawshank Redemption</h3></a></div><div class="sc-6fa21551-7 jLjTzn cli-title-metadata"><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">1994</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">2h 22m</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">A</span></div><span class="sc-6fa21551-1 GNFYN"><div class="sc-e3e7b191-0 iKUUVe sc-6fa21551-2 kOfhdG cli-ratings-container" data-testid="ratingGroup--container"><span aria-label="IMDb rating: 9.3" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24

In [8]:
# Iterate through the 'scraped_movie' list and extract the movie names from each element
for movie in scraped_movie:
    # Extract the movie name from the <h3> tag and append it to the 'movie_name' list
    movie_name.append(movie.h3.text)

In [9]:
movie_name

['1. The Shawshank Redemption',
 '2. The Godfather',
 '3. The Dark Knight',
 '4. The Godfather: Part II',
 '5. 12 Angry Men',
 "6. Schindler's List",
 '7. The Lord of the Rings: The Return of the King',
 '8. Pulp Fiction',
 '9. The Lord of the Rings: The Fellowship of the Ring',
 '10. Il Buono, Il Brutto, Il Cattivo',
 '11. Forrest Gump',
 '12. Fight Club',
 '13. The Lord of the Rings: The Two Towers',
 '14. Inception',
 '15. Star Wars: Episode V - The Empire Strikes Back',
 '16. The Matrix',
 '17. GoodFellas',
 "18. One Flew Over the Cuckoo's Nest",
 '19. Se7en',
 '20. Spider-man: Across the Spider-verse',
 "21. It's a Wonderful Life",
 '22. Shichinin No Samurai',
 '23. Interstellar',
 '24. The Silence of the Lambs',
 '25. Saving Private Ryan',
 '26. City of God',
 '27. Life Is Beautiful',
 '28. The Green Mile',
 '29. Star Wars: Episode IV - A New Hope',
 '30. Terminator 2: Judgment Day',
 '31. Back to the Future',
 '32. Spirited Away',
 '33. The Pianist',
 '34. Psycho',
 '35. Parasit

In [10]:
# Calculate the number of movie names in the 'movie_name' list
len(movie_name)

250

In [11]:
# Use BeautifulSoup's find_all method to locate all elements with the specified class attribute
scraped_year_duration = soup.find_all("div",class_="sc-6fa21551-7 jLjTzn cli-title-metadata",)
# 'scraped_year_duration' now contains a list of elements that match the criteria
scraped_year_duration[:2]

[<div class="sc-6fa21551-7 jLjTzn cli-title-metadata"><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">1994</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">2h 22m</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">A</span></div>,
 <div class="sc-6fa21551-7 jLjTzn cli-title-metadata"><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">1972</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">2h 55m</span><span class="sc-6fa21551-8 bnyjtW cli-title-metadata-item">A</span></div>]

In [12]:
# Iterate through the 'scraped_year_duration' list and extract release year and duration for each movie
for year in scraped_year_duration:
    
    # Extract the release year (the first <span> element) and append it to the 'release_year' list
    release_year.append(year.find_all("span")[0].text.strip())
    # Extract the duration (the second <span> element) and append it to the 'duration_of_movie' list
    duration_of_movie.append(year.find_all("span")[1].text.strip())

In [13]:
# Calculate the number of movie durations in the 'duration_of_movie' list
len(duration_of_movie)

250

In [14]:
# Calculate the number of release years in the 'release_year' list
len(release_year)

250

In [15]:
# Use BeautifulSoup's find_all method to locate all elements with the specified class attribute
scraped_rating_reviews = soup.find_all("span","ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating")

# 'scraped_rating_reviews' now contains a list of elements that match the criteria
scraped_rating_reviews[:2]

[<span aria-label="IMDb rating: 9.3" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M12 20.1l5.82 3.682c1.066.675 2.37-.322 2.09-1.584l-1.543-6.926 5.146-4.667c.94-.85.435-2.465-.799-2.567l-6.773-.602L13.29.89a1.38 1.38 0 0 0-2.581 0l-2.65 6.53-6.774.602C.052 8.126-.453 9.74.486 10.59l5.147 4.666-1.542 6.926c-.28 1.262 1.023 2.26 2.09 1.585L12 20.099z"></path></svg>9.3<span class="ipc-rating-star--voteCount"> (<!-- -->2.8M<!-- -->)</span></span>,
 <span aria-label="IMDb rating: 9.2" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M12 20.1l5.82 3.682c1.066.675 2

In [16]:
# Iterate through the 'scraped_rating_reviews' list and extract the ratings for each movie
for rate in scraped_rating_reviews:
    # Extract the rating text and append it to the 'rating' list
    rating.append(rate.text)

In [17]:
# Calculate the number of movie ratings in the 'rating' list
len(rating)

250

In [18]:
# Create a Pandas DataFrame using the lists containing movie data
df = pd.DataFrame({"movie_name":movie_name,
             "release_year":release_year,
             "duration":duration_of_movie,
             "Rating":rating})

### Data Preprocessing

In [19]:
df.head()

Unnamed: 0,movie_name,release_year,duration,Rating
0,1. The Shawshank Redemption,1994,2h 22m,9.3 (2.8M)
1,2. The Godfather,1972,2h 55m,9.2 (2M)
2,3. The Dark Knight,2008,2h 32m,9.0 (2.8M)
3,4. The Godfather: Part II,1974,3h 22m,9.0 (1.3M)
4,5. 12 Angry Men,1957,1h 36m,9.0 (835K)


In [20]:
# Split the "movie_name" column by periods and keep the second part (index 1)
# Then, remove any leading or trailing whitespace
df["movie_name"].str.split(".").str[1].str.strip()

0      The Shawshank Redemption
1                 The Godfather
2               The Dark Knight
3        The Godfather: Part II
4                  12 Angry Men
                 ...           
245               The 400 Blows
246                     Persona
247               Life of Brian
248                     Aladdin
249                    Drishyam
Name: movie_name, Length: 250, dtype: object

In [21]:
# Assigned it to dataframe
df["movie_name"] = df["movie_name"].str.split(".").str[1].str.strip()

In [22]:
df.head()

Unnamed: 0,movie_name,release_year,duration,Rating
0,The Shawshank Redemption,1994,2h 22m,9.3 (2.8M)
1,The Godfather,1972,2h 55m,9.2 (2M)
2,The Dark Knight,2008,2h 32m,9.0 (2.8M)
3,The Godfather: Part II,1974,3h 22m,9.0 (1.3M)
4,12 Angry Men,1957,1h 36m,9.0 (835K)


In [23]:
# Split the "Rating" column by "(" and keep the part before the opening parenthesis (index 0)
# Then, remove any leading or trailing whitespace, and update the "Rating" column
df["Rating"].str.split("(").str[0].str.strip()

0      9.3
1      9.2
2      9.0
3      9.0
4      9.0
      ... 
245    8.1
246    8.1
247    8.0
248    8.0
249    8.2
Name: Rating, Length: 250, dtype: object

In [24]:
# Assigned it to dataframe
df["rating"] = df["Rating"].str.split("(").str[0].str.strip()

In [25]:
# Split the "Rating" column by "(" and keep the part after the opening parenthesis (index 1)
# Then, replace any ")" characters with an empty string, and strip any leading or trailing whitespace
df["Rating"].str.split("(").str[1].str.replace(")","").str.strip()

0      2.8M
1        2M
2      2.8M
3      1.3M
4      835K
       ... 
245    125K
246    127K
247    415K
248    450K
249     91K
Name: Rating, Length: 250, dtype: object

In [26]:
# Assigned it to dataframe
df["num_of_likes"] = df["Rating"].str.split("(").str[1].str.replace(")","").str.strip()

In [27]:
df.head()

Unnamed: 0,movie_name,release_year,duration,Rating,rating,num_of_likes
0,The Shawshank Redemption,1994,2h 22m,9.3 (2.8M),9.3,2.8M
1,The Godfather,1972,2h 55m,9.2 (2M),9.2,2M
2,The Dark Knight,2008,2h 32m,9.0 (2.8M),9.0,2.8M
3,The Godfather: Part II,1974,3h 22m,9.0 (1.3M),9.0,1.3M
4,12 Angry Men,1957,1h 36m,9.0 (835K),9.0,835K


In [28]:
# Remove the "Rating" column from the DataFrame 'df'
df.drop("Rating",axis=1,inplace=True)

In [29]:
df.head()

Unnamed: 0,movie_name,release_year,duration,rating,num_of_likes
0,The Shawshank Redemption,1994,2h 22m,9.3,2.8M
1,The Godfather,1972,2h 55m,9.2,2M
2,The Dark Knight,2008,2h 32m,9.0,2.8M
3,The Godfather: Part II,1974,3h 22m,9.0,1.3M
4,12 Angry Men,1957,1h 36m,9.0,835K


In [30]:
# Save the DataFrame 'df' as a CSV file named "movies.csv" without including the index
df.to_csv("movies.csv",index=False)

In [31]:
# Save the DataFrame 'df' as an Excel file named "movies.xlsx" without including the index
df.to_excel("movies.xlsx",index=False)