## Scraping IMDB website using BeautifulSoup 

In [1]:
from requests import get

In [2]:
url = "http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1"

#### Inspecting the link

The search is detailed in the link http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1


In [3]:
response = get(url)
response

<Response [200]>

Response 200 means that we are successfull in obtaining data from the website

In [4]:
from bs4 import BeautifulSoup as bs ## importing Beautiful Soup
html_soup = bs(response.text,'html.parser')## python's in built library HTML parser
type(html_soup)

bs4.BeautifulSoup

In [5]:
id_check = html_soup.find(id ="main")
movie_container = id_check.find_all(class_ ="lister-item mode-advanced")
len(movie_container)

50

In [6]:
container = movie_container[0]
meta = container.find("span", class_ = "metascore").text
print(meta)

77        


###### find
find_all() returned a ResultSet object with the length of 50 movies we are interested in.


In [7]:
#list to store scraped value data in:
movie_names = []
year_release = []
imdb_ratings = []
metascores = []
votes = []
movie_description = []
certificate = []
runtime = []
genre = []
director_name = []
star_cast = []
gross_value = []

#extract data from individual movie container
for container in movie_container:
    
   
    #if movie has Metascore, then extract:
    if container.find("div", class_ = "ratings-metascore") is not None:
        
        
        #the movie_name
        name = container.h3.a.text
        movie_names.append(name)
        
        #the year od release
        release = container.find("span", class_ = "lister-item-year text-muted unbold").text
        year_release.append(release)
        
        #the ratings for the movies
        ratings = float(container.strong.text)
        imdb_ratings.append(ratings)
        
        #the metascores
        meta = container.find("span", class_ = "metascore").text
        metascores.append(int(meta))
        
        #the votes
        vote = container.find("span", attrs = {"name":"nv"})['data-value']
        votes.append(int(vote))
        
        #the certificate
        certi = container.find("span", class_ = "certificate").text
        certificate.append(certi)
        
        #the runtime
        run = container.find("span", class_ ="runtime").text
        runtime.append(run)
        
        #the genre
        gen = container.find("span", class_ ="genre").text
        genre.append(gen)
        
        #fetching all <p> tags
        content = container.find_all("p")
        
        #the description
        desc = content[1].text
        movie_description.append(desc)
        
        #subsetting all the <a> tags in 3rd <p> tag
        content_2 = content[2].find_all("a")
        
        #the director
        director = content_2[0].text
        director_name.append(director)
        
        #the gross value
        if len(container.find_all("span", attrs = {"name":"nv"})) >= 2:
            gross = container.find_all("span", attrs = {"name":"nv"})[1]['data-value']
            gross_value.append(gross)
        else:
            gross_value.append(None)
        #extracting artists names
        temp = []
        for i in range(len(content_2)-1):
            temp.append(content_2[i].text)
        star_cast.append(temp)

In [8]:

import pandas as pd

test_df = pd.DataFrame({"movie_names":movie_names,
                        "year_release":year_release,
                        "imdb_ratings":imdb_ratings,
                        "metscores":metascores,
                        "votes":votes,
                        "movie_description":movie_description,
                        "certificate":certificate,
                        "runtime":runtime,
                        "genre":genre,
                        "director_name": director_name,
                        "star_cast": star_cast,
                        "gross_value":gross_value
                       })

In [9]:
test_df

Unnamed: 0,certificate,director_name,genre,gross_value,imdb_ratings,metscores,movie_description,movie_names,runtime,star_cast,votes,year_release
0,R,James Mangold,"\nAction, Drama, Sci-Fi",226277068.0,8.1,77,"\n In the near future, a weary Logan cares ...",Logan,137 min,"[James Mangold, Hugh Jackman, Patrick Stewart,...",529950,(2017)
1,PG-13,Patty Jenkins,"\nAction, Adventure, Fantasy",412563408.0,7.5,76,\n When a pilot crashes and tells of confli...,Wonder Woman,141 min,"[Patty Jenkins, Gal Gadot, Chris Pine, Robin W...",460266,(2017)
2,PG-13,Christopher Nolan,"\nAction, Drama, History",188373161.0,7.9,94,"\n Allied soldiers from Belgium, the Britis...",Dunkirk,106 min,"[Christopher Nolan, Fionn Whitehead, Barry Keo...",441726,(2017)
3,PG-13,Rian Johnson,"\nAction, Adventure, Fantasy",620181382.0,7.2,85,\n Rey develops her newly discovered abilit...,Star Wars: Episode VIII - The Last Jedi,152 min,"[Rian Johnson, Daisy Ridley, John Boyega, Mark...",438956,(2017)
4,PG-13,James Gunn,"\nAction, Adventure, Comedy",389813101.0,7.7,67,\n The Guardians must fight to keep their n...,Guardians of the Galaxy Vol. 2,136 min,"[James Gunn, Chris Pratt, Zoe Saldana, Dave Ba...",437482,(2017)
5,PG-13,Taika Waititi,"\nAction, Adventure, Comedy",315058289.0,7.9,74,"\n Thor is imprisoned on the planet Sakaar,...",Thor: Ragnarok,130 min,"[Taika Waititi, Chris Hemsworth, Tom Hiddlesto...",422571,(2017)
6,PG-13,Jon Watts,"\nAction, Adventure, Sci-Fi",334201140.0,7.5,73,\n Peter Parker balances his life as an ord...,Spider-Man: Homecoming,133 min,"[Jon Watts, Tom Holland, Michael Keaton, Rober...",383895,(2017)
7,R,Jordan Peele,"\nHorror, Mystery, Thriller",176040665.0,7.7,84,\n A young African-American visits his whit...,Get Out,104 min,"[Jordan Peele, Daniel Kaluuya, Allison William...",358814,(I) (2017)
8,R,Denis Villeneuve,"\nDrama, Mystery, Sci-Fi",92054159.0,8.0,81,\n A young blade runner's discovery of a lo...,Blade Runner 2049,164 min,"[Denis Villeneuve, Harrison Ford, Ryan Gosling...",356689,(2017)
9,R,Edgar Wright,"\nAction, Crime, Drama",107825862.0,7.6,86,\n After being coerced into working for a c...,Baby Driver,112 min,"[Edgar Wright, Ansel Elgort, Jon Bernthal, Jon...",342302,(2017)
