Sometime we can not afford to use the API and sometimes not available, so we may need to do web scrapping

In [1]:
# Scraping data for over 2000 movies

# Since we want to get over 2000 ratings from 
# both IMDB and Metacritic, we’ll have to make at least 4000 requests. 
# If we make one request per second, our script will need a little over 
# an hour to make 4000 requests. Because of this, it’s worth trying to identify more efficient 
# ways of obtaining our data.



In [2]:
# Identifying the URL structure

# Since we want to get over 2000 ratings from both IMDB and Metacritic, 
# we’ll have to make at least 4000 requests. If we make one request per second, 
# our script will need a little over an hour to make 4000 requests. 
# Because of this, it’s worth trying to identify more efficient ways of obtaining our data.

from requests import get
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [57]:
# Using Beautiful Soup to parse the HTML content

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser') 
type(html_soup)

bs4.BeautifulSoup

In [56]:
# We will be looping in each movie to get the information related to movies

# Classs has two unique values i.e. lister-item and mode-advanced

# Extracting all these div containers 

movie_containers = html_soup.find_all('li', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
0


In [59]:
# First we will extract only first container to extract the relevant information
first_movies = movie_containers[0]
first_movies

IndexError: list index out of range

In [10]:
# The name of the movie 
first_movies.div # 1st div and name is in 3rd one

<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>

In [11]:
# Acessing the first anchor tag
first_movies.a

<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a>

In [14]:
first_movies.h3 #it brings us very close

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>

In [15]:
first_movies.h3.a 

<a href="/title/tt3315342/">Logan</a>

In [17]:
first_movies.h3.a.text

'Logan'

In [19]:
# The year of the movie's release

# e distinguishing mark consists of the values lister-item-year text-muted unbold assigned to 
# the class attribute. So we look for the first <span> with these values within the <h3> tag:

first_year = first_movies.h3.find('span', class_ = 'lister-item-year text-muted unbold')
first_year

<span class="lister-item-year text-muted unbold">(2017)</span>

In [20]:
first_year.text

'(2017)'

In [24]:
# rating
first_imdb = float(first_movies.strong.text)
first_imdb

8.1

In [26]:
# The meta score - using class to uniquely filter

first_mscore = first_movies.find('span', class_ = 'metascore favorable')
first_mscore = int(first_mscore.text)
print(first_mscore)

77


In [28]:
# The number of votes - it contains within a <span> tag. Its distinctive mark is a name 
# attribute with the value nv.

first_votes = first_movies.find('span', attrs = {'name':'nv'})
first_votes

# You can treat a Tag object just like a dictionary. 
# The HTML attributes are the dictionary’s keys. 
# The values of the HTML attributes are the values of the dictionary’s keys.

first_votes['data-value']

'567152'

In [29]:
first_votes = int(first_votes['data-value'])

## The script for a single page

We have to make sure that we will extract the data only from the containers that have a Metascore

We need to add a condition to skip movies without a Metascore

We can find the unique unique class, if it return none then we can say that no metascore in that container



In [31]:
eighth_movie_mscore = movie_containers[22].find('div', class_ = 'ratings-metascore')
type(eighth_movie_mscore)

NoneType

In [32]:
# Now we will be looping above logic in all the 50 movies of that page

# Lists to store the scraped data in 
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

for container in movie_containers:
    # If the movie has Metascore, then extract else skip:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
        
import pandas as pd
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})
print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 5 columns):
imdb         44 non-null float64
metascore    44 non-null int64
movie        44 non-null object
votes        44 non-null int64
year         44 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


Unnamed: 0,imdb,metascore,movie,votes,year
0,8.1,77,\n1.\nLogan\n(2017)\n,567152,(2017)
1,7.9,74,\n2.\nThor: Ragnarok\n(2017)\n,492105,(2017)
2,7.5,76,\n3.\nWonder Woman\n(2017)\n,491868,(2017)
3,7.7,67,\n4.\nGuardians of the Galaxy Vol. 2\n(2017)\n,490819,(2017)
4,7.9,94,\n5.\nDunkirk\n(2017)\n,471622,(2017)
5,7.1,85,\n6.\nStar Wars: Episode VIII - The Last Jedi\...,466673,(2017)
6,7.5,73,\n7.\nSpider-Man: Homecoming\n(2017)\n,436638,(2017)
7,7.7,84,\n8.\nGet Out\n(I) (2017)\n,407802,(I) (2017)
8,8.0,81,\n9.\nBlade Runner 2049\n(2017)\n,380241,(2017)
9,7.6,86,\n10.\nBaby Driver\n(2017)\n,370704,(2017)


In [34]:
# Sometimes it may happen that we are running the website from a country where English is the 
# local language, so you may get the data in other local languages 

# If you run into this issue, pass the following values to the headers parameters of the get function ()

headers = {"Accept-Language": "en-US, en;q=0.5"}

## Script for Multiple pages 

It is quite challenging, so we will scrape the first 4 pages of each year in the interval 2000-2017. 4 pages for each of the 18 years makes for a total of 72 pages. Each page has 50 movies, so we’ll scrape data for 3600 movies at most. But not all the movies have a Metascore, so the number will be lower than that. Even so, we are still very likely to get data for over 2000 movies.

<br>
<br>
In hyperlink, we have to vary the values of only two parameters of the URL; the ```release_date``` paramter and ```page```
<br>
<br>
In the next code cell we will:

- Create a list called pages, and populate it with the strings corresponding to the first 4 pages.
- Create a list called years_url and populate it with the strings corresponding to the years 2000-2017.


In [35]:
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]

## Controlling the Crawl-rate

Controlling the rate of crawling is beneficial for us, and for the website we are scraping. If we avoid hammering the server with tens of requests per second, then we are much less likely to get our IP address banned. We also avoid disrupting the activity of the website we scrape by allowing the server to respond to other users’ requests too.
<br>
<br>
We’ll control the loop’s rate by using the ```sleep()``` function from Python’s ```time``` module. ```sleep()``` will pause the execution of the loop for a specified amount of seconds.
<br>
<br>
To mimic human behavior, we’ll vary the amount of waiting time between requests by using the ```randint()``` function from the Python’s ```random``` module. ``````randint()``` randomly generates integers within a specified interval.

In [38]:
# For now let's import these two functions to prevent overcrowding 
from time import sleep
from random import randint

## Monitoring the loop as it’s still going

Given that we’re scraping 72 pages, it would be nice if we could find a way to monitor the scraping process as it’s still going. This feature is definitely optional, but it can be very helpful in the testing and debugging process. Also, the greater the number of pages, the more helpful the monitoring becomes.
<br>
<br>
For our script, we’ll make use of this feature, and monitor the following parameters:
<br>
-  The ```frequency (speed) of requests```, so we make sure our program is not overloading the server.
-  The ```number of requests```, so we can halt the loop in case the number of expected requests is exceeded.
-  The ```status code``` of our requests, so we make sure the server is sending back the proper responses.
<br>
<br>
To get a frequency value we’ll divide the number of requests by the time elapsed since the first request. This is similar to computing the speed of a car – we divide the distance by the time taken to cover that distance. Let’s experiment with this monitoring technique at a small scale first. In the following code cell we will:


- Set a starting time using the time() function from the time module, and assign the value to start_time.
- Assign 0 to the variable requests which we’ll use to count the number of requests.
- Start a loop, and then with each iteration:
        * Simulate a request.
        * Increment the number of requests by 1.
        * Pause the loop for a time interval between 8 and 15 seconds.
        * Calculate the elapsed time since the first request, and assign the value to elapsed_time.
        * Print the number of requests and the frequency.



In [40]:
from time import time
start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))

Request: 1; Frequency: 0.4942733729933237 requests/s
Request: 2; Frequency: 0.6612027073925822 requests/s
Request: 3; Frequency: 0.5969288995923274 requests/s
Request: 4; Frequency: 0.663785013496498 requests/s
Request: 5; Frequency: 0.5539186454704559 requests/s


In [43]:
# We will be clearing the request timing using clear_output function so that it should not 
# look tidy

from IPython.core.display import clear_output
start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

Request: 5; Frequency: 0.45378698471835405 requests/s


## Monitoring the warning

 To monitor the status code we’ll set the program to warn us if there’s something off. A successful request is indicated by a status code of 200. We’ll use the ```warn()``` function from the ```warnings``` module to throw a warning if the status code is not 200.

In [45]:
from warnings import warn
warn("Warning Simulation")

  


## Piecing everything together 

Now let’s piece together everything we’ve done so far! In the following code cell, we start by:

- Redeclaring the lists variables so they become empty again.
- Preparing the monitoring of the loop.

Then, we’ll:

- Loop through the years_url list to vary the release_date parameter of the URL.
- For each element in years_url, loop through the pages list to vary the page parameter of the URL.
- Make the GET requests within the pages loop (and give the headers parameter the right value to make sure we get only English content).
- Pause the loop for a time interval between 8 and 15 seconds.
- Monitor each request as discussed before.
- Throw a warning for non-200 status codes.
- Break the loop if the number of requests is greater than expected.
- Convert the response‘s HTML content to a BeautifulSoup object.
- Extract all movie containers from this BeautifulSoup object.
- Loop through all these containers.
- Extract the data if a container has a Metascore.


In [46]:
#Full Code

# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

Request:72; Frequency: 0.07859966667399143 requests/s


# Examining the Scraped Data

First, we will make the pandas df

In [47]:
movie_ratings = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})
print(movie_ratings.info())
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3280 entries, 0 to 3279
Data columns (total 5 columns):
imdb         3280 non-null float64
metascore    3280 non-null int64
movie        3280 non-null object
votes        3280 non-null int64
year         3280 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 128.2+ KB
None


Unnamed: 0,imdb,metascore,movie,votes,year
0,8.5,67,Gladiator,1220065,(2000)
1,8.5,80,Memento,1036117,(2000)
2,8.3,55,Snatch,722692,(2000)
3,8.3,68,Requiem for a Dream,704190,(2000)
4,7.4,64,X-Men,539311,(2000)
5,7.8,73,Cast Away,475305,(2000)
6,7.6,64,American Psycho,433905,(2000)
7,7.3,62,Unbreakable,354962,(2000)
8,7.0,73,Meet the Parents,293979,(2000)
9,6.1,59,Mission: Impossible II,292234,(2000)


## Cleaning the Scraped Data



In [48]:
# Ordering the columns 

movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,(2000),8.5,67,1220065
1,Memento,(2000),8.5,80,1036117
2,Snatch,(2000),8.3,55,722692
3,Requiem for a Dream,(2000),8.3,68,704190
4,X-Men,(2000),7.4,64,539311


In [49]:
# Convert the year column into integer

movie_ratings['year'].unique()

array(['(2000)', '(I) (2000)', '(2001)', '(2002)', '(2003)', '(2004)',
       '(I) (2004)', '(2005)', '(I) (2005)', '(2006)', '(I) (2006)',
       '(2007)', '(I) (2007)', '(2008)', '(I) (2008)', '(2009)',
       '(I) (2009)', '(2010)', '(I) (2010)', '(2011)', '(I) (2011)',
       '(2012)', '(I) (2012)', '(2013)', '(I) (2013)', '(2014)',
       '(I) (2014)', '(II) (2014)', '(2015)', '(I) (2015)', '(II) (2015)',
       '(2016)', '(II) (2016)', '(I) (2016)', '(IX) (2016)', '(2017)',
       '(I) (2017)'], dtype=object)

In [50]:
# Counting from the end toward beginning, we can see that the years are always located 
# from the fifth character to the second. We’ll use the .str() method to select only 
# that interval. We’ll also convert the result to an integer using the astype() method:

movie_ratings.loc[:, 'year'] = movie_ratings['year'].str[-5:-1].astype(int)

In [51]:
movie_ratings['year'].head(3)

0    2000
1    2000
2    2000
Name: year, dtype: int32

In [53]:
# Checking the min and max values
movie_ratings.describe().loc[['min', 'max'], ['imdb', 'metascore']]

Unnamed: 0,imdb,metascore
min,4.1,24.0
max,9.0,100.0


In [54]:
#We’ll multiply each IMDB rating by 10, and then we’ll do a quick check by looking at the first 3 rows:

movie_ratings['n_imdb'] = movie_ratings['imdb'] * 10
movie_ratings.head(3)

Unnamed: 0,movie,year,imdb,metascore,votes,n_imdb
0,Gladiator,2000,8.5,67,1220065,85.0
1,Memento,2000,8.5,80,1036117,85.0
2,Snatch,2000,8.3,55,722692,83.0


In [55]:
# Saving the dataset:
movie_ratings.to_csv('movie_ratings.csv')