### Getting details of top 100 movies from IMDb Website

In [1]:
#importing required Libraries
import pandas as pd 
import requests 
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# Getting the required url
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'

In [3]:
# Sending HTTP request
page_read = requests.get(url)

In [4]:
# getting the HTML content in the soup object
soup = BeautifulSoup(page_read.content)

In [5]:
# this gives us the html version in the bs object #make the indentation proper
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 1000"
(Sorted by IMDb Rating Descending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/search/title/?groups=top_1000" rel="canonical"/>
  <meta content="http://www

#### Create an empty array for all the elements that we need to extract from the webpage

In [6]:
movie_name = []
year = []
rank = []
time_in_mins = []
rating = []
metascore = []
votes = []
gross = []
description = []

#### First on our list is the movie name
- Select moviename and rt click inspect
- We See that it is under 'div' tag with class name as 'lister-item mode-advanced'
- also for every movies there is a separate class with same name
- therefore we get all the elements in a list below

In [7]:
# findall you can find a tag (div here), with its content or attribute
movie_data = soup.findAll('div', attrs= {'class': 'lister-item mode-advanced'})

In [8]:
# to check if we have all the 100 movie list of the page
len(movie_data)

100

In [9]:
# check the first element in list
movie_data[0]

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt0111161"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt0111161/"> <img alt="The Shawshank Redemption" class="loadlate" data-tconst="tt0111161" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/S/sash/4FyxwxECzL-U1J8.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt0111161/">The Shawshank Redemption</a>
<span class="lister-item-year text-muted unbold">(1994)</span>
</h3>
<p class="text-muted">
<span class="certificate">A</span>
<span class="ghost">|</span>
<span class="runtime">142 min</span>
<span class="ghost">|</span>
<span class="genre">
Dram

- The next step is to append data to the empty lists created
- Here it is done together, but for testing purpose it is advisable to check every list elements individually 

In [10]:
for ls in movie_data:
    
    #if we inspect the name of the movie, we see that it is under h3 and a tag
    name = ls.h3.a.text
    movie_name.append(name)
    
    # rank and year both are in the same h3 tag as the movie name
    # we have the rank as
    ranklist_ = ls.h3.find('span', class_="lister-item-index unbold text-primary").text.replace('.','')
    rank.append(ranklist_)
    
    # Now we retrieve the year in the h3 tag
    # go to 1994 on webpge, select right click inspect , rt click copy element, remove everything except the class name.
    release_year = ls.h3.find('span',class_="lister-item-year text-muted unbold").text.replace('(','').replace(')','')
    year.append(release_year)
    
    #finding runtime, click on any runtime and inspect, it is in p element, and inside span tag and class as class_= "runtime"
    # convert to text and remove the min
    runtime_ = ls.p.find('span',class_= "runtime").text.replace(' min','')
    time_in_mins.append(runtime_)
    
    # On inspecting rating, we find that it is under div tag with class as "inline-block ratings-imdb-rating"
    rating_ = ls.find('div',class_= "inline-block ratings-imdb-rating").text.replace('\n','')
    rating.append(rating_)
    
    # Metascore, we need if else here as metascore is not always present
    meta_ = ls.find('span',class_= "metascore").text.replace(' ','') if ls.find('span',class_= "metascore") else 'meta not present'
    metascore.append(meta_)
    
    # Votes and Gross(revenue)
    # we see that both votes and gross are under tha same span tag with a name called 'nv' and not a class
    # so we use attribute here attrs, instead of class
    value = ls.find_all('span', attrs = {'name':'nv'})
    vote_ = value[0].text
    votes.append(vote_)
    
    #Gross: at times value not there.
    gross_ = value[1].text if len(value) > 1 else "not known"
    gross.append(gross_)
    
    #Movie description
    # it is in p element, with class "text-muted"
    desc_list = ls.find_all('p',class_='text-muted')
    desc_ = desc_list[1].text.replace('\n', '') if len(desc_list) >1 else 'Not present'
    description.append(desc_)
    
    
    

In [15]:
# checking if values are extracted
description[0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'

In [16]:
# create a data frame with the above lists
movie_DF = pd.DataFrame({'Name of movie': movie_name, 'Year of relase': year, 'Watchtime': time_in_mins, 'Movie Rating': rating, 
                         'Metascore': metascore, 'Votes': votes, 'Gross collection': gross, 'Description': description})



In [17]:
movie_DF.head(5)

Unnamed: 0,Name of movie,Year of relase,Watchtime,Movie Rating,Metascore,Votes,Gross collection,Description
0,The Shawshank Redemption,1994,142,9.3,81,2666718,$28.34M,Two imprisoned men bond over a number of years...
1,The Godfather,1972,175,9.2,100,1847935,$134.97M,The aging patriarch of an organized crime dyna...
2,The Dark Knight,2008,152,9.0,84,2639639,$534.86M,When the menace known as the Joker wreaks havo...
3,The Lord of the Rings: The Return of the King,2003,201,9.0,94,1838264,$377.85M,Gandalf and Aragorn lead the World of Men agai...
4,Schindler's List,1993,195,9.0,94,1350203,$96.90M,"In German-occupied Poland during World War II,..."


In [18]:
movie_DF.tail(5)

Unnamed: 0,Name of movie,Year of relase,Watchtime,Movie Rating,Metascore,Votes,Gross collection,Description
95,Witness for the Prosecution,1957,116,8.4,76,126023,$8.18M,A veteran British barrister must defend his cl...
96,Paths of Glory,1957,88,8.4,90,197941,#60,"After refusing to attack an enemy position, a ..."
97,Sunset Blvd.,1950,110,8.4,meta not present,222485,#59,A screenwriter develops a dangerous relationsh...
98,The Great Dictator,1940,125,8.4,meta not present,224441,$0.29M,Dictator Adenoid Hynkel tries to expand his em...
99,Chhichhore,2019,143,8.3,meta not present,54938,$0.90M,"A tragic incident forces Anirudh, a middle-age..."
