## Scrapper for IMDB movies
https://www.dataquest.io/blog/web-scraping-beautifulsoup/

In [3]:
from requests import get
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
# print(response.text[:50]) #to only show the beg

html_soup = BeautifulSoup(response.text, 'html.parser')
print(type(html_soup))

## often the distinctive mark resides in the "class" "Attribute" ##
# If you inspect the HTML lines of the containers of interest, you'll notice that the class attribute has two values: lister-item and mode-advanced
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers)) #find_all() returned a ResultSet object which is a list containing all the 50 divs
print(len(movie_containers))

##
#example of 1st movie container, which is still a bunch of html
first_movie = movie_containers[0]
# dir(movie_containers[0]) #for all the available object functions 
# vars(first_movie) #to find all the available object attributes of 

#(i) movie name
#we can see that the name is contained within an anchor tag (<a>), nested within <h3> of this "tag" object
print(first_movie.h3.a.text) #the href is the hyperlink, but the text is just the movie name

#(ii) year of movie release
#the year is stored close to the name, within the <span> tag below the <a>
print(first_movie.h3.span) #this is insufficient, so we need the find() or find_all() functions
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold') #want the 1st so find
first_year = first_year.text
print(first_year)

#(iii) the IMDB rating
#within the ratings bar with the <h3> from before, but surrounded by <strong> so lets use that
first_imdb = float(first_movie.strong.text)
print(first_imdb)

#(iv) the metascore
first_metascore = first_movie.find('span', 'metascore favorable')
first_metascore = int(first_metascore.text)
print(first_metascore)

#(v) number of votes
first_votes = first_movie.find('span', attrs = {'name':'nv'})
first_votes = int(first_votes['data-value'])
print(first_votes)

##



<class 'bs4.BeautifulSoup'>
<class 'bs4.element.ResultSet'>
50
Logan
<span class="lister-item-index unbold text-primary">1.</span>
(2017)
8.1
77
523335


In [4]:
## Script for a single page
import pandas as pd

# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:
    
    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        
        # name
        names.append(container.h3.a.text)
        
        # year
        years.append(container.h3.find('span', class_ = 'lister-item-year text-muted unbold').text)
        
        # imdb_rating
        imdb_ratings.append(float(container.strong.text))
        
        # metascore
        metascores.append(int(container.find('span',class_ = 'metascore').text))
        
        # vote
        votes.append(int(container.find('span', attrs = {'name':'nv'})['data-value']))

test_df = pd.DataFrame({'movie':names,
                       'year':years,
                       'imdb':imdb_ratings,
                       'metascore':metascores,
                       'vote':votes})
#print(test_df.info())
test_df

        

Unnamed: 0,movie,year,imdb,metascore,vote
0,Logan,(2017),8.1,77,523335
1,Wonder Woman,(2017),7.5,76,451649
2,Dunkirk,(2017),7.9,94,433137
3,Star Wars: Episode VIII - The Last Jedi,(2017),7.2,85,431046
4,Guardians of the Galaxy Vol. 2,(2017),7.7,67,427884
5,Thor: Ragnarok,(2017),7.9,74,410125
6,Spider-Man: Homecoming,(2017),7.5,73,374799
7,Get Out,(I) (2017),7.7,84,349006
8,Blade Runner 2049,(2017),8.0,81,348764
9,Baby Driver,(2017),7.6,86,336911


In [5]:
## Script for multiple pages

# Plan
# 1. making the request within a loop
# 2. controling the loop rate to avoid bombarding servers
# 3. monitoring loop while running


from time import time
from time import sleep
from IPython.core.display import clear_output
from warnings import warn
from random import randint

# Creating the variables for the url
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]
headers = {"Accept-Language": "en-US, en;q=0.5"}

# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:
        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + 
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 10:
            warn('Number of requests was greater than expected.')  
            break 

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year 
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))



Request:26; Frequency: 0.0837170230029067 requests/s


In [8]:
# checking the data
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})
print(movie_ratings.info())
movie_ratings.head(100)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472 entries, 0 to 471
Data columns (total 5 columns):
movie        472 non-null object
year         472 non-null object
imdb         472 non-null float64
metascore    472 non-null int64
votes        472 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 18.5+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,(2000),8.5,67,1171537
1,Memento,(2000),8.5,80,1003870
2,Snatch,(2000),8.3,55,700151
3,Requiem for a Dream,(2000),8.3,68,680651
4,X-Men,(2000),7.4,64,521282
5,Cast Away,(2000),7.8,73,459574
6,American Psycho,(2000),7.6,64,418152
7,Unbreakable,(2000),7.3,62,310092
8,Meet the Parents,(2000),7.0,73,288024
9,Mission: Impossible II,(2000),6.1,59,282885


In [45]:
# cleaning the scraped data

# - reorder the columns
# - cleaning 'year'
# - checking extreme values
# - normalizing ratings

original_df = movie_ratings #just in case
movie_ratings = movie_ratings[['movie','year','imdb','metascore','votes']] #order them by column of choosing
print(movie_ratings.head())
# print(movie_ratings.year.loc[1].isnumeric()) #year is not a numeric


##converting year into integers
movie_ratings['year'].unique()
movie_ratings.iloc[0,1] #iloc find the value based on the location in the df

# notice that one year is '(III)' so replace by 2013
# movie_ratings['year'].filter(like='(III)')
# movie_ratings.replace('(III)','(2013)')
# movie_ratings['year'].unique()

# counting from the end to the beginning, the date starts at the 5th character
movie_ratings.loc[:,'year'] = movie_ratings.loc[:,'year'].str[-5:-1].astype('int')
movie_ratings.head(3)



                 movie  year  imdb  metascore    votes
0            Gladiator  2000   8.5         67  1171537
1              Memento  2000   8.5         80  1003870
2               Snatch  2000   8.3         55   700151
3  Requiem for a Dream  2000   8.3         68   680651
4                X-Men  2000   7.4         64   521282


AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [68]:
##checking the min and max
# --> pandas describe() method can be useful for that
movie_ratings.describe()
# pd.DataFrame.describe(movie_ratings)



Unnamed: 0,year,imdb,metascore,votes
count,472.0,472.0,472.0,472.0
mean,2000.788136,7.088136,60.919492,273815.7
std,0.758272,0.806257,17.092414,230784.4
min,2000.0,5.3,24.0,92711.0
25%,2000.0,6.5,48.0,136794.8
50%,2001.0,7.2,61.5,201602.5
75%,2001.0,7.7,74.0,288024.0
max,2002.0,8.8,96.0,1458934.0


In [None]:
##save the data on a csv
movie_ratings.to_csv('movie_ratings.csv')