# Web Scraping with BeautifulSoup

In [1]:
from requests import get

In [2]:
# Send an Accept-Language header so the request states a preference for English.
headers = {"Accept-Language": "en-US, en;q=0.5"}

# First results page for titles released in 2017, sorted by number of votes (descending).
url = "https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=1"

response = get(url=url, headers=headers)

# Peek at the first 500 characters of the returned HTML only.
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [3]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML with Python's built-in parser.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
# Every film on the results page sits in a <div class="lister-item mode-advanced">.
movie_containers = html_soup.find_all(
    'div',
    attrs={'class': 'lister-item mode-advanced'},
)
print(len(movie_containers))

50


## Name of the movie

In [6]:
# Take the first container; the following cells use it to work out each selector.
first_movie = movie_containers[0]

In [7]:
# The title link is the first <a> inside the container's first <h3>.
name = first_movie.find('h3').find('a')

In [8]:
# Show the tag object stored in `name`.
print(name)

<a href="/title/tt3315342/">Logan</a>


In [9]:
# Keep only the text content of the title link.
first_name = first_movie.h3.a.get_text()

In [10]:
# Show the extracted title text.
print(first_name)

Logan


## Year of the movie

In [11]:
# selector <span class="lister-item-year text-muted unbold">(2017)</span>
# The year span lives inside the same <h3> as the title link.
first_year_code = first_movie.h3.find(
    name='span',
    class_="lister-item-year text-muted unbold",
)
print(first_year_code)

<span class="lister-item-year text-muted unbold">(2017)</span>


In [12]:
# Text of the year span; the parentheses are still part of the string.
first_year = first_year_code.get_text()
print(first_year)

(2017)


## IMDb rating

In [13]:
# selector <div class="inline-block ratings-imdb-rating" name="ir" data-value="7,9">
#                   <span class="global-sprite rating-star imdb-rating"></span>
#                   <strong>7,9</strong>
#          </div>

# The rating is read from the first <strong> tag found in the container.
imdb_code = first_movie.find('strong')
print(imdb_code)

<strong>8.1</strong>


In [14]:
# Convert the rating text to a float.
first_imdb = float(imdb_code.get_text())

## The Metascore

In [15]:
# selector <span class="metascore  favorable">77        </span>
# Match on the single "metascore" class rather than the exact string
# "metascore favorable": a multi-class string only matches that exact class
# attribute value, so it would return None for any score whose second class is
# not "favorable". This is also the selector the scraping loops use.
metascore_code = first_movie.find('span', class_="metascore")
print(metascore_code)

<span class="metascore favorable">77        </span>


In [16]:
# int() ignores the whitespace that pads the span's text.
first_metascore = int(metascore_code.get_text())
print(first_metascore)

77


## The number of votes

In [17]:
# selector <span name="nv" data-value="591656">591.656</span>
# find()'s own first parameter is called `name`, so the HTML `name` attribute
# is passed through the attrs dictionary instead of as a keyword.
votes = first_movie.find('span', {'name': 'nv'})
print(votes)

<span data-value="591671" name="nv">591,671</span>


In [18]:
# Read the count from the data-value attribute, which holds plain digits
# (the tag's visible text contains a thousands separator).
first_votes = int(votes['data-value'])
print(first_votes)

591671


# The script for a single page

In [19]:
# One list per scraped field; entries at the same position belong to the same film.
names, years, imdb_ratings, metascore, votes = [], [], [], [], []

# Walk over every film container collected from the page.
for container in movie_containers:
    # Films without a Metascore block are left out entirely.
    if container.find('div', class_='ratings-metascore') is None:
        continue

    # Title
    name = container.h3.a.text
    names.append(name)

    # Release year, still in its raw "(2017)" form
    year = container.h3.find('span', class_="lister-item-year").text
    years.append(year)

    # IMDb rating
    imdb = float(container.strong.text)
    imdb_ratings.append(imdb)

    # Metascore
    m_score = container.find('span', class_="metascore").text
    metascore.append(int(m_score))

    # Number of votes, read from the data-value attribute
    vote = container.find('span', attrs = {'name':'nv'})['data-value']
    votes.append(int(vote))

In [20]:
# Quick check of the titles that were collected.
print(names)

['Logan', 'Thor: Ragnarok', 'Guardians of the Galaxy Vol. 2', 'Wonder Woman', 'Star Wars: Episode VIII - The Last Jedi', 'Dunkirk', 'Spider-Man: Homecoming', 'Get Out', 'It', 'Blade Runner 2049', 'Baby Driver', 'Three Billboards Outside Ebbing, Missouri', 'Justice League', 'The Shape of Water', 'John Wick: Chapter 2', 'Coco', 'Jumanji: Welcome to the Jungle', 'Beauty and the Beast', 'Kong: Skull Island', 'Kingsman: The Golden Circle', 'Pirates of the Caribbean: Dead Men Tell No Tales', 'Alien: Covenant', 'The Greatest Showman', 'War for the Planet of the Apes', 'Lady Bird', 'Life', 'The Fate of the Furious', 'Murder on the Orient Express', 'Ghost in the Shell', 'Wind River', 'King Arthur: Legend of the Sword', 'Call Me by Your Name', "The Hitman's Bodyguard", 'Mother!', 'The Mummy', 'Atomic Blonde', 'Bright', 'I, Tonya', 'Valerian and the City of a Thousand Planets', 'Darkest Hour', 'Baywatch', 'American Made', 'Transformers: The Last Knight']


## Put the data in pandas

In [21]:
import pandas as pd
# Assemble the five parallel lists into one table, one column per field.
test_df = pd.DataFrame(dict(
    movie=names,
    year=years,
    imdb=imdb_ratings,
    metascore=metascore,
    votes=votes,
))
print(test_df)

                                               movie        year  imdb  \
0                                              Logan      (2017)   8.1   
1                                     Thor: Ragnarok      (2017)   7.9   
2                     Guardians of the Galaxy Vol. 2      (2017)   7.6   
3                                       Wonder Woman      (2017)   7.4   
4            Star Wars: Episode VIII - The Last Jedi      (2017)   7.1   
5                                            Dunkirk      (2017)   7.9   
6                             Spider-Man: Homecoming      (2017)   7.4   
7                                            Get Out  (I) (2017)   7.7   
8                                                 It  (I) (2017)   7.3   
9                                  Blade Runner 2049      (2017)   8.0   
10                                       Baby Driver      (2017)   7.6   
11         Three Billboards Outside Ebbing, Missouri      (2017)   8.2   
12                                    

In [22]:
# info() writes its summary itself and returns None, which is why the output
# ends with a "None" line printed by the surrounding print().
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 5 columns):
movie        43 non-null object
year         43 non-null object
imdb         43 non-null float64
metascore    43 non-null int64
votes        43 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


## The script for multiple pages

In [23]:
# pages: the first four page numbers, as strings ready to be spliced into the URL.
# years_url: the release years 2000 through 2017, also as strings.

pages = list(map(str, range(1, 5)))
years_url = list(map(str, range(2000, 2018)))

In [24]:
# Controlling the crawl-rate
from time import sleep
from random import randint

In [25]:
# Monitoring the loop as it’s still going
from IPython.core.display import clear_output # not working!!!

# Set a starting time using the time() function from the time module, and assign the value to start_time.
from time import time
start_time = time()

# Assign 0 to the variable requests which we’ll use to count the number of requests.
requests = 0

# Start a loop, and then with each iteration: 
#    Simulate a request.
#    Increment the number of requests by 1.
#    Pause the loop for a time interval between 8 and 15 seconds.
#    Calculate the elapsed time since the first request, and assign the value to elapsed_time.
#    Print the number of requests and the frequency.

for _ in range(5):
    #Request would go here
    requests += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}: Frequency: {} requests/s'.format(requests, requests/elapsed_time))

# We’ll clear the output after each iteration
clear_output(wait = True) # wait parameter of clear_output to wait replacing until new output

Request: 1: Frequency: 0.3332980246331822 requests/s
Request: 2: Frequency: 0.3332571892405298 requests/s
Request: 3: Frequency: 0.3331706699941425 requests/s
Request: 4: Frequency: 0.3331110173030252 requests/s
Request: 5: Frequency: 0.3330795103066422 requests/s


In [26]:
# We’ll use the warn() function from the warnings module to throw a warning if the status code is not 200.

from warnings import warn
#warn("Warning Simulation")

# Everything together

In [27]:
# Lists to store the scraped data; entries at the same index describe the same film
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

# Safety cap on the number of requests: 18 years x 4 pages = 72
MAX_REQUESTS = 72
# Set when the cap is exceeded so that the outer (year) loop stops as well;
# a bare `break` would only leave the inner (page) loop.
stop_crawl = False

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop to keep the crawl rate low
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop the whole crawl if the number of requests is greater than expected
        if requests > MAX_REQUESTS:
            warn('Number of requests was greater than expected.')
            stop_crawl = True
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie on the page
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year (kept as the raw string, e.g. "(2000)")
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

    # Propagate the stop from the page loop to the year loop
    if stop_crawl:
        break

Request:72; Frequency: 0.07870943534265927 requests/s


# Examining the scraped data

In [28]:
# Combine the lists filled by the crawl into a single table.
movie_ratings = pd.DataFrame(dict(
    movie=names,
    year=years,
    imdb=imdb_ratings,
    metascore=metascores,
    votes=votes,
))
print(movie_ratings.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 5 columns):
movie        3276 non-null object
year         3276 non-null object
imdb         3276 non-null float64
metascore    3276 non-null int64
votes        3276 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 128.1+ KB
None


In [29]:
# Look at the first ten rows of the scraped table.
print(movie_ratings.head(10))

                    movie    year  imdb  metascore    votes
0               Gladiator  (2000)   8.5         67  1254118
1                 Memento  (2000)   8.4         80  1057944
2                  Snatch  (2000)   8.3         55   738209
3     Requiem for a Dream  (2000)   8.3         68   720429
4                   X-Men  (2000)   7.4         64   548069
5               Cast Away  (2000)   7.8         73   486521
6         American Psycho  (2000)   7.6         64   448209
7             Unbreakable  (2000)   7.3         62   366945
8        Meet the Parents  (2000)   7.0         73   297958
9  Mission: Impossible II  (2000)   6.1         59   297939


In [30]:
# Inspect the distinct raw year strings first; the conversion to integers
# happens in the next cell.
movie_ratings['year'].unique()

array(['(2000)', '(I) (2000)', '(2001)', '(2002)', '(2003)', '(2004)',
       '(I) (2004)', '(2005)', '(I) (2005)', '(2006)', '(I) (2006)',
       '(2007)', '(I) (2007)', '(2008)', '(I) (2008)', '(2009)',
       '(I) (2009)', '(2010)', '(I) (2010)', '(2011)', '(I) (2011)',
       '(2012)', '(I) (2012)', '(2013)', '(I) (2013)', '(2014)',
       '(I) (2014)', '(II) (2014)', '(2015)', '(I) (2015)', '(II) (2015)',
       '(2016)', '(II) (2016)', '(I) (2016)', '(IX) (2016)', '(2017)',
       '(I) (2017)'], dtype=object)

In [31]:
# Every raw value ends in "(YYYY)", optionally preceded by a marker such as
# "(I) ", so the four characters before the closing parenthesis are the year.
# The column is replaced by direct assignment. Writing through
# `.loc[:, 'year'] = ...` sets the values in place, and under pandas 2.x that
# leaves the column with its original object dtype rather than integer.
movie_ratings['year'] = movie_ratings['year'].str[-5:-1].astype(int)
movie_ratings['year'].head()

0    2000
1    2000
2    2000
3    2000
4    2000
Name: year, dtype: int64

In [32]:
# Keep only the extreme values of each numeric column from the summary table.
(
    movie_ratings
    .describe()
    .loc[['min', 'max'], ['year', 'imdb', 'metascore', 'votes']]
)

Unnamed: 0,year,imdb,metascore,votes
min,2000.0,4.1,24.0,96490.0
max,2017.0,9.0,100.0,2156613.0


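The minimum and maximum of every column fall within the expected ranges (years 2000–2017, ratings between 0 and 10, Metascores between 0 and 100), so there are no unexpected outliers.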

In [34]:
# Put imdb on the same 0-100 scale as metascore by multiplying it by 10.
# Only the imdb column is changed.
# NOTE: the column is overwritten, so running this cell again multiplies by 10 again.
movie_ratings['imdb'] = movie_ratings['imdb'].mul(10)
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,2000,85.0,67,1254118
1,Memento,2000,84.0,80,1057944
2,Snatch,2000,83.0,55,738209
3,Requiem for a Dream,2000,83.0,68,720429
4,X-Men,2000,74.0,64,548069


In [35]:
# Save the results
# The file is written relative to the current working directory. The previous
# absolute path pointed at one user's desktop folder and fails on any machine
# where that directory does not exist.
movie_ratings.to_csv('movie_ratings.csv')