# Import and Scrape

In [1]:
from requests import get

First I will set the initial page to scrape and print out the first 500 characters of the text of response, which here is just pure HTML so far.

In [2]:
sci_fi_page1 = "https://www.imdb.com/search/title?genres=sci-fi&explore=title_type,genres&ref_=adv_prv"

response = get(sci_fi_page1)
print(response.text[:500])



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle",


In [3]:
from bs4 import BeautifulSoup

Next, I pass in the response text with the argument stating that I want to use Python's built-in HTML parser to create a, well, beautiful soup. I checked the type here.

In [4]:
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

bs4.BeautifulSoup

Now I need to find the containers that contain the movie information on the page in order to drill down further in the tags to get that specific `<a href="..."></a>`
tag containing the movie title.

In [5]:
sci_fi_containers = html_soup.find_all('div', class_= 'lister-item mode-advanced')
print(type(sci_fi_containers)) #to double check that I got a ResultSet
print(len(sci_fi_containers)) #to double check I got 50 (elements/page)

<class 'bs4.element.ResultSet'>
50


First I will make sure I can flawlessly capture the data from the first movie in the list before scaling. The first movie is item zero in the ResultSet `sci_fi_containers`.

In [8]:
movie_one = sci_fi_containers[0]
movie_one

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt2737304"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt2737304/?ref_=adv_li_i"> <img alt="Bird Box" class="loadlate" data-tconst="tt2737304" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BMjAzMTI1MjMyN15BMl5BanBnXkFtZTgwNzU5MTE2NjM@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt2737304/?ref_=adv_li_tt">Bird Box</a>
<span class="lister-item-year text-muted unbold">(2018)</span>
</h3>
<p class="text-muted ">
<span class="certificate">R</span>
<span class="ghost">|</span>
<span class="runtime">124 min</span>
<span class="ghost">|</span>
<span class="genre">

`sci_fi_containers` is an overarching `<div>` tag, of which there are 50. Underneath that, is a `<div>` tag minus the movie cover image and such on the floating left element. The title of the movie is an `<a>` tag embedded under an `<h3>` tag, so we can stack bs4 methods `.h3` and `.a`, and finally `.text` to get at the string `'Bird Box'`, the number one for Sci-Fi as of 1/17/19.

In [9]:
movie_one.h3.a.text

'Bird Box'

Next I will grab the year of the first movie, rating, runtime, genre, IMDB rating, metascore, and number of votes.

In [10]:
movie_one_year = movie_one.h3.find('span', class_= 'lister-item-year text-muted unbold')

In [11]:
movie_one_year.text

'(2018)'

In [12]:
movie_one_rating = movie_one.p.find('span', class_= 'certificate')

In [13]:
movie_one_rating.text

'R'

In [14]:
movie_one_runtime = movie_one.p.find('span', class_ = 'runtime')
movie_one_runtime.text

'124 min'

In [15]:
movie_one_genre = movie_one.p.find('span', class_ = 'genre')
movie_one_genre.text

'\nDrama, Horror, Sci-Fi            '

In [16]:
movie_one_imdb_rating = float(movie_one.strong.text)
movie_one_imdb_rating

6.7

In [17]:
movie_one_metascore = movie_one.find('span', class_ = 'metascore mixed')
movie_one_metascore = int(movie_one_metascore.text)
movie_one_metascore

51

In [18]:
#name is a different kind of attribute from class, it requires a dictionary
movie_one_votes = movie_one.find('span', attrs = {'name':'nv'})
movie_one_votes = movie_one_votes['data-value'] #this is without a comma so saves on cleaning later on
movie_one_votes = int(movie_one_votes)
movie_one_votes

151586

Next, I'll remove movies that don't have a metascore to save time later dealing with missing values. The first movie without a metascore is number 6 right now so I will grab that one to see the NoneType and use a conditional around that data type.

In [19]:
first_missing_metascore = sci_fi_containers[6].find('div', class_ = 'ratings-metascore')
type(first_missing_metascore)

NoneType

Next, I will prepare the loop for the first 50 movies.

In [20]:
#initialize empty lists to store the variables scraped
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
metascores = []
votes = []

#extract data itemwise
for container in sci_fi_containers:
    
    #conditional for all with metascore
    if container.find('div', class_ = 'ratings-metascore') is not None:
        
        #title
        title = container.h3.a.text
        titles.append(title)
        
        #year released
        year = container.h3.find('span', class_= 'lister-item-year text-muted unbold').text
        years.append(year)
        
        #rating
        rating = container.p.find('span', class_= 'certificate').text
        ratings.append(rating)
        
        #genre
        genre = container.p.find('span', class_ = 'genre').text
        genres.append(genre)
        
        #runtime
        time = container.p.find('span', class_ = 'runtime').text
        runtimes.append(time)
        
        #IMDB ratings
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        
        #Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

        #Number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))        

Don't forget to check your work!

In [21]:
import pandas as pd

test_df = pd.DataFrame({'movie': titles,
                       'year': years,
                       'rating': ratings,
                       'genre': genres,
                       'runtime': runtimes,
                       'imdb': imdb_ratings,
                       'metascore': metascores,
                       'votes': votes})
print(test_df.info())
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 8 columns):
movie        21 non-null object
year         21 non-null object
rating       21 non-null object
genre        21 non-null object
runtime      21 non-null object
imdb         21 non-null float64
metascore    21 non-null int64
votes        21 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 964.0+ bytes
None


Unnamed: 0,movie,year,rating,genre,runtime,imdb,metascore,votes
0,Bird Box,(2018),R,"\nDrama, Horror, Sci-Fi",124 min,6.7,51,151586
1,Aquaman,(2018),PG-13,"\nAction, Adventure, Fantasy",143 min,7.5,55,131703
2,Glass,(2019),PG-13,"\nDrama, Mystery, Sci-Fi",129 min,7.4,42,7019
3,Spider-Man: Into the Spider-Verse,(2018),PG,"\nAnimation, Action, Adventure",117 min,8.7,87,79794
4,Bumblebee,(2018),PG-13,"\nAction, Adventure, Sci-Fi",114 min,7.3,66,34919


Awesome, it worked. But there are 142,000+ movies in the Sci-Fi genre. I want more! Let's look at what parameters need to be passed to send requests for the next page(s) and not forget to control the requests so we don't bombard the server.

In [22]:
from time import sleep
from random import randint

In [29]:
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

#for this example, I only have it running as 
pages = np.arange(1, 101, 50) #can only go to 10000 items because after that the URI ha no discernable distinction
pages

#initialize empty lists to store the variables scraped
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
metascores = []
votes = []

iterations = 0

for page in pages:
    
    #get request
    response = get("https://www.imdb.com/search/title?genres=sci-fi&" 
                   + "start=" 
                   + str(page) 
                   + "&explore=title_type,genres&ref_=adv_prv")
    
    sleep(randint(8,15))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))

    #parse the content of current iteration of request
    page_html = BeautifulSoup(response.text, 'html.parser')
        
    movie_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    
    #extract the 50 movies for that page
    for container in movie_containers:

        #conditional for all with metascore
        if container.find('div', class_ = 'ratings-metascore') is not None:

            #title
            title = container.h3.a.text
            titles.append(title)

            #year released
            year = container.h3.find('span', class_= 'lister-item-year text-muted unbold').text
            years.append(year)

            #rating
            rating = container.p.find('span', class_= 'certificate').text
            ratings.append(rating)

            #genre
            genre = container.p.find('span', class_ = 'genre').text
            genres.append(genre)

            #runtime
            time = container.p.find('span', class_ = 'runtime').text
            runtimes.append(time)

            #IMDB ratings
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)

            #Metascore
            m_score = container.find('span', class_ = 'metascore').text
            metascores.append(int(m_score))

            #Number of votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(int(vote))
    iterations += 1
    print("Finished iteration: " + str(iterations))

Finished iteration: 1
Finished iteration: 2


In [30]:
#timer didn't work, so an unelegant calculation of approx. runtime for the full dataset of 9,999 items
num_full_iterations = 9951/50

avg_iteration = np.mean([16.34, 11.52, 14.71, 16.22])
avg_iteration

num_minutes = (avg_iteration/60)*num_full_iterations
num_minutes #approximate

48.751607500000006

In [31]:
test_df = pd.DataFrame({'movie': titles,
                       'year': years,
                       'rating': ratings,
                       'genre': genres,
                       'runtime_min': runtimes,
                       'imdb': imdb_ratings,
                       'metascore': metascores,
                       'votes': votes})
print(test_df.info())
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 8 columns):
movie          45 non-null object
year           45 non-null object
rating         45 non-null object
genre          45 non-null object
runtime_min    45 non-null object
imdb           45 non-null float64
metascore      45 non-null int64
votes          45 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 2.0+ KB
None


Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes
0,Bird Box,(2018),R,"\nDrama, Horror, Sci-Fi",124 min,6.7,51,151586
1,Aquaman,(2018),PG-13,"\nAction, Adventure, Fantasy",143 min,7.5,55,131703
2,Glass,(2019),PG-13,"\nDrama, Mystery, Sci-Fi",129 min,7.4,42,7019
3,Spider-Man: Into the Spider-Verse,(2018),PG,"\nAnimation, Action, Adventure",117 min,8.7,87,79794
4,Bumblebee,(2018),PG-13,"\nAction, Adventure, Sci-Fi",114 min,7.3,66,34919


In [32]:
test_df.loc[:, 'year'] = test_df['year'].str[-5:-1].astype(int)

In [33]:
#there are no crazy looking outliers, so we are good here
test_df.describe().loc[['min', 'max'], ['imdb', 'metascore']]

Unnamed: 0,imdb,metascore
min,5.0,18.0
max,8.8,90.0


In [35]:
#get imdb rating into the same scale as metascore to be able to compare them
test_df['n_imdb'] = test_df['imdb'] * 10
test_df.head(3)

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Bird Box,2018,R,"\nDrama, Horror, Sci-Fi",124 min,6.7,51,151586,67.0
1,Aquaman,2018,PG-13,"\nAction, Adventure, Fantasy",143 min,7.5,55,131703,75.0
2,Glass,2019,PG-13,"\nDrama, Mystery, Sci-Fi",129 min,7.4,42,7019,74.0


In [36]:
#remove " min" from the end of the runtime variable and make them integers
test_df['runtime_min'] = test_df['runtime_min'].apply(lambda x: x.replace(" min", "")).apply(lambda x: int(x))

In [37]:
test_df['genre'] = test_df['genre'].apply(lambda x: x.replace("\n", ""))

In [38]:
test_df['n_imdb'] = test_df['n_imdb'].apply(lambda x: int(x))
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 9 columns):
movie          45 non-null object
year           45 non-null int32
rating         45 non-null object
genre          45 non-null object
runtime_min    45 non-null int64
imdb           45 non-null float64
metascore      45 non-null int64
votes          45 non-null int64
n_imdb         45 non-null int64
dtypes: float64(1), int32(1), int64(4), object(3)
memory usage: 2.5+ KB


In [39]:
test_df.head()

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Bird Box,2018,R,"Drama, Horror, Sci-Fi",124,6.7,51,151586,67
1,Aquaman,2018,PG-13,"Action, Adventure, Fantasy",143,7.5,55,131703,75
2,Glass,2019,PG-13,"Drama, Mystery, Sci-Fi",129,7.4,42,7019,74
3,Spider-Man: Into the Spider-Verse,2018,PG,"Animation, Action, Adventure",117,8.7,87,79794,87
4,Bumblebee,2018,PG-13,"Action, Adventure, Sci-Fi",114,7.3,66,34919,73


In [40]:
test_df['genre'] = test_df['genre'].apply(lambda x: x.rstrip())

In [42]:
#don't need the non-standardized imdb column
test_df.drop(columns='imdb')
test_df.head()

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Bird Box,2018,R,"Drama, Horror, Sci-Fi",124,6.7,51,151586,67
1,Aquaman,2018,PG-13,"Action, Adventure, Fantasy",143,7.5,55,131703,75
2,Glass,2019,PG-13,"Drama, Mystery, Sci-Fi",129,7.4,42,7019,74
3,Spider-Man: Into the Spider-Verse,2018,PG,"Animation, Action, Adventure",117,8.7,87,79794,87
4,Bumblebee,2018,PG-13,"Action, Adventure, Sci-Fi",114,7.3,66,34919,73


In [47]:
from sqlalchemy import create_engine
import sqlite3
engine = create_engine('sqlite://', echo=False)

In [48]:
test_df.to_sql('movies', con=engine)

In [49]:
engine.execute("SELECT * FROM movies").fetchall()

[(0, 'Bird Box', 2018, 'R', 'Drama, Horror, Sci-Fi', 124, 6.7, 51, 151586, 67),
 (1, 'Aquaman', 2018, 'PG-13', 'Action, Adventure, Fantasy', 143, 7.5, 55, 131703, 75),
 (2, 'Glass', 2019, 'PG-13', 'Drama, Mystery, Sci-Fi', 129, 7.4, 42, 7019, 74),
 (3, 'Spider-Man: Into the Spider-Verse', 2018, 'PG', 'Animation, Action, Adventure', 117, 8.7, 87, 79794, 87),
 (4, 'Bumblebee', 2018, 'PG-13', 'Action, Adventure, Sci-Fi', 114, 7.3, 66, 34919, 73),
 (5, 'Escape Room', 2019, 'PG-13', 'Drama, Mystery, Sci-Fi', 99, 6.4, 49, 4347, 64),
 (6, 'Replicas', 2018, 'PG-13', 'Crime, Mystery, Sci-Fi', 107, 5.5, 18, 5940, 55),
 (7, 'Avengers: Infinity War', 2018, 'PG-13', 'Action, Adventure, Fantasy', 149, 8.5, 68, 568940, 85),
 (8, 'Venom', 2018, 'PG-13', 'Action, Sci-Fi', 112, 6.8, 35, 213996, 68),
 (9, 'A Quiet Place', 2018, 'PG-13', 'Drama, Horror, Mystery', 90, 7.6, 82, 259774, 76),
 (10, 'Annihilation', 2018, 'R', 'Adventure, Drama, Horror', 115, 6.9, 79, 202045, 69),
 (11, 'Solo: A Star Wars Story