In [24]:
#required libraries
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [25]:
#URL for the IMDb Oscar Winning Movies
url = 'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc'
response = requests.get(url)

# Save HTML to file
with open("imdb_oscar_winning_films.html", mode='wb') as file:
    file.write(response.content)

In [26]:
#using the beautifulsoup library for the web-scraping
soup = BeautifulSoup(response.content, 'lxml')

In [27]:
#data of interest
movie_audience_rating = []
movie_name = []
movie_link_imdb = []
movie_meta_rating = []
number_of_votes = []

#get the main page div tag content
divTag = soup.find_all("div", {"class":"lister-item mode-advanced"})

#loop over to extract all required information
for rating in divTag:
    rate = rating.find('div',class_='inline-block ratings-imdb-rating').find('strong').contents[:]
    name = rating.find('div',class_='lister-item-content').find('a').contents[:]
    link = 'https://www.imdb.com/' + rating.find('div',class_='lister-item-content').find('a').get('href')
    meta_rate = rating.find('div',class_='inline-block ratings-metascore')
    votes = rating.find('p',class_="sort-num_votes-visible").contents[3].contents[0].strip().replace(',','')
    
    """
    few of the movies from early days don't have critic meta score 
    and soup will return None for this movies. To avoid the failure 
    of the find function we confirm the rating is available if it's 
    not available then we set it to the zero
    """
    
    if meta_rate != None:
        meta_rate =  meta_rate.find('span').contents[:]
        movie_meta_rating.append(meta_rate[0].replace(' ',''))
    else:
        movie_meta_rating.append(str(0))
        
    movie_audience_rating.append(rate[0])
    movie_name.append(name[0])
    movie_link_imdb.append(link)
    number_of_votes.append(votes)

In [28]:
#create a list of all data of interest and convert it to the data frame
movie_df = list(zip(movie_name, movie_audience_rating, number_of_votes, movie_meta_rating, movie_link_imdb))

movie_df = pd.DataFrame(movie_df)
movie_df.columns = ['movie_name', 'audience_rating', 'number_of_votes', 'movie_meta_rating', 'movie_link_imdb']

In [29]:
#another 2 data-set of interest
all_movie_genres = []
number_of_critic_review = []

for url in movie_df['movie_link_imdb']:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    divTag = soup.find_all("li", {"class":"ipc-inline-list__item ipc-chip__text"})
    critic_review = soup.find_all("a", {"class":"ipc-link ipc-link--baseAlt ipc-link--touch-target sc-124be030-2 eshTwQ isReview"})
    movie_genres = []
    for rating in divTag:
        movie_genres.append(rating.contents[0])
    all_movie_genres.append(movie_genres)
    
    critic_score =  critic_review[1].find('span', class_='score').contents[0]
    
    """
    If rating critic review is more than 1000 than IMDb uses K short form
    to convert this K to integer we utilizes this loop
    """
    
    if critic_score != None:
        if critic_score[-1] == 'K':
            critic_score = critic_score[:-1]+'000'
        number_of_critic_review.append(critic_score)
    else:
        movie_meta_rating.append(str(0))

In [30]:
#add these data-set to movie_df dataframe

movie_df.insert(5, "movie_genres", all_movie_genres[:], True)
movie_df.insert(4, "number_critic_review", number_of_critic_review[:], True)

In [31]:
#change the datatype of each column as required

convert_dict = {'movie_name': str,
                'audience_rating': float,
                'number_of_votes': int,
                'movie_meta_rating': int,
                'number_critic_review': int,
               }
  
movie_df = movie_df.astype(convert_dict)
print(movie_df.dtypes)

movie_name               object
audience_rating         float64
number_of_votes           int32
movie_meta_rating         int32
number_critic_review      int32
movie_link_imdb          object
movie_genres             object
dtype: object


In [32]:
#Final DataFrame
movie_df

Unnamed: 0,movie_name,audience_rating,number_of_votes,movie_meta_rating,number_critic_review,movie_link_imdb,movie_genres
0,CODA,8.0,112996,74,246,https://www.imdb.com//title/tt10366460/,"[Comedy, Drama, Music]"
1,Nomadland,7.3,152313,93,392,https://www.imdb.com//title/tt9770150/,[Drama]
2,Parasite,8.5,750618,96,602,https://www.imdb.com//title/tt6751668/,"[Comedy, Drama, Thriller]"
3,Green Book,8.2,467592,69,422,https://www.imdb.com//title/tt6966692/,"[Biography, Comedy, Drama]"
4,Shape of Water: Das Flüstern des Wassers,7.3,410678,87,1000,https://www.imdb.com//title/tt5580390/,"[Drama, Fantasy, Romance]"
...,...,...,...,...,...,...,...
90,Pioniere des wilden Westens,5.9,6183,70,41,https://www.imdb.com//title/tt0021746/,"[Drama, Western]"
91,Im Westen nichts Neues,8.1,60986,91,97,https://www.imdb.com//title/tt0020629/,"[Drama, War]"
92,The Broadway Melody,5.6,7222,0,40,https://www.imdb.com//title/tt0019729/,"[Drama, Musical, Romance]"
93,Flügel aus Stahl,7.6,12852,0,95,https://www.imdb.com//title/tt0018578/,"[Drama, Romance, War]"
