In [1]:
# importing required libraries
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# URL of the IMDb list page that the following cells download and parse
link: str = "https://www.imdb.com/list/ls055592025/"

In [3]:
# Download the list page. The timeout keeps the cell from hanging forever when
# the server never answers (requests waits indefinitely unless one is given).
output = requests.get(link, timeout=30)
output.raise_for_status() # stop here on a 4xx/5xx reply instead of parsing an error page later
output.status_code # shown as the cell result; 200 means the page was downloaded successfully

200

In [4]:
# Create the BeautifulSoup v4 object from the downloaded HTML.
# The parser is named explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns about it), so the tree could differ between machines.
# 'html.parser' ships with Python, so no extra package is required.
bs = BeautifulSoup(output.text, 'html.parser')

In [5]:
# Gather every <div class="lister-item mode-detail"> tag; each one wraps the
# markup describing a single movie, so movie_data is a list with one tag per movie.
movie_data = bs.find_all('div', class_='lister-item mode-detail')

# Look at the first movie's tag to see where each piece of data lives.
movie_data[0]

<div class="lister-item mode-detail">
<div class="lister-item-image ribbonize" data-tconst="tt0068646">
<a href="/title/tt0068646/"> <img alt="The Godfather" class="loadlate" data-tconst="tt0068646" height="209" loadlate="https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UY209_CR3,0,140,209_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="140"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt0068646/">The Godfather</a>
<span class="lister-item-year text-muted unbold">(1972)</span>
</h3>
<p class="text-muted text-small">
<span class="certificate">A</span>
<span class="ghost">|</span>
<span class="runtime">175 min</span>
<span class="ghost">|</span>
<span class="genre">
Crime, Drama            </span>
</p>
<div class="ipl-rating-widget">

In [6]:

# Positions of the six award counts inside the whitespace-split award text, in
# this order: Oscars, Oscar nominations, BAFTA awards, BAFTA nominations,
# Golden Globes, Golden Globe nominations
# (e.g. "Oscars: 3", "Oscar Nominations: 11", "BAFTA Awards: 0", ...).
_AWARD_COUNT_POSITIONS = (1, 4, 7, 10, 13, 17)

# Tokens that can sit where a count is expected; they are recorded as 0.
# The movies ranked 48 and 54 have irregular 'BAFTA Award Nominations' entries,
# which is why label fragments have to be tolerated here.
_MISSING_COUNT_TOKENS = ('N/A', 'Nominations:', 'Globes:')


def _parse_awards(movie):
    """Return the six award/nomination counts of one movie as a list of ints.

    The counts are read from the movie's 'list-description' div: the text after
    the third occurrence of 'Stars' is split on whitespace and the tokens at
    ``_AWARD_COUNT_POSITIONS`` are converted to int. A token listed in
    ``_MISSING_COUNT_TOKENS`` is stored as 0.

    Raises IndexError/ValueError if the description does not have the expected
    layout (assumed to match the example above for every movie -- TODO confirm
    if the page markup changes).
    """
    description = movie.find('div', attrs={'class': 'list-description'}).text
    award_tokens = description.split('Stars')[3].split()
    return [
        0 if award_tokens[position] in _MISSING_COUNT_TOKENS else int(award_tokens[position])
        for position in _AWARD_COUNT_POSITIONS
    ]


def _parse_movie(movie):
    """Extract one row of data from a movie's 'lister-item mode-detail' tag.

    Returns a list in this order: name, year, rating, runtime, genre, gross,
    director, stars, the six award counts from ``_parse_awards``, and the
    movie's IMDb link.
    """
    # The h3 header text holds the rank, the name and "(year)" on separate lines.
    header = movie.find('h3', attrs={'class': 'lister-item-header'})
    header_lines = header.text.splitlines()
    name = header_lines[2]
    year = int(header_lines[3][1:5]) # strip the surrounding parentheses
    # Kept in its own local name so the notebook-level `link` (the list URL used
    # by the download cell) is not overwritten with the last movie's URL.
    movie_link = "https://imdb.com" + header.find('a').get('href')

    # The span lookups name the 'class' attribute explicitly instead of passing a set.
    runtime = int(movie.find('span', attrs={'class': 'runtime'}).text.split()[0]) # "175 min" -> 175
    genre = ' '.join(movie.find('span', attrs={'class': 'genre'}).text[1:].split()) # collapse padding whitespace
    rating = float(movie.find('div', attrs={'class': 'ipl-rating-star small'}).text.splitlines()[-1])

    # The second 'text-muted text-small' paragraph lists the director(s) and the stars.
    credits_tag = movie.find_all('p', attrs={'class': 'text-muted text-small'})[1]
    team = ''.join(credits_tag.text.splitlines()[2:]).split(':')
    director = team[0].split('| ')[0]
    stars = team[1]

    # Only movies whose gross is mentioned have two spans with name="nv";
    # the second one carries the gross in its 'data-value' attribute.
    votes_n_earning = movie.find_all('span', attrs={'name': 'nv'})
    if len(votes_n_earning) == 2:
        gross = int(votes_n_earning[1].get('data-value').replace(',', ''))
    else:
        gross = 0 # no gross available for this movie

    return (
        [name, year, rating, runtime, genre, gross, director, stars]
        + _parse_awards(movie)
        + [movie_link]
    )


# One row per movie, in the order the movies appear on the page.
data = [_parse_movie(movie) for movie in movie_data]


In [7]:
import pandas as pd # importing pandas to create a pandas dataframe

In [8]:
# Column names as one space-separated string (split into a list where it is used).
columns_list = (
    "title year rating runtime(mins.) genre gross(in$) director cast "
    "oscars oscar_nominations BAFTA BAFTA_nominations "
    "golden_globes golden_globe_nomination imdb_link"
)

In [9]:
# Turn the scraped rows into a pandas dataframe (one row per movie) so the data
# is easy to analyze; the column labels come from splitting `columns_list`.
df = pd.DataFrame(data=data, columns=columns_list.split())

In [10]:
# Use a 1-based 'rank' as the dataframe index (row position + 1).
# The index is rebuilt from the row count rather than from `df.index + 1`, so
# re-running this cell keeps the ranks at 1..N instead of shifting them by one
# on every run.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name='rank')

In [11]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0_level_0,title,year,rating,runtime(mins.),genre,gross(in$),director,cast,oscars,oscar_nominations,BAFTA,BAFTA_nominations,golden_globes,golden_globe_nomination,imdb_link
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,The Godfather,1972,9.2,175,"Crime, Drama",134966411,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",3,11,0,4,6,8,https://imdb.com/title/tt0068646/
2,The Shawshank Redemption,1994,9.3,142,Drama,28341469,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",0,7,0,0,0,2,https://imdb.com/title/tt0111161/
3,Schindler's List,1993,8.9,195,"Biography, Drama, History",96898818,Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley, Caro...",7,12,6,12,3,6,https://imdb.com/title/tt0108052/
4,Raging Bull,1980,8.2,129,"Biography, Drama, Sport",23383987,Martin Scorsese,"Robert De Niro, Cathy Moriarty, Joe Pesci, Fra...",2,8,2,4,1,7,https://imdb.com/title/tt0081398/
5,Casablanca,1942,8.5,102,"Drama, Romance, War",1024560,Michael Curtiz,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",3,8,0,0,0,0,https://imdb.com/title/tt0034583/
