In [3]:
from requests import get

url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

response = get(url)


In [7]:
from bs4 import BeautifulSoup

html_soup = BeautifulSoup(response.text, 'html.parser')

In [17]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


In [9]:
from time import sleep
from random import randint

In [13]:
from time import time
start_time = time()
requests = 0

for _ in range(5):
    # A request would go here
    requests += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))

Request: 1; Frequency: 0.33325303211264884 requests/s
Request: 2; Frequency: 0.39992347274321943 requests/s
Request: 3; Frequency: 0.4284801580255954 requests/s
Request: 4; Frequency: 0.44435734748803074 requests/s
Request: 5; Frequency: 0.4544611763992034 requests/s


In [15]:
from IPython.core.display import clear_output

start_time = time()
requests = 0

for _ in range(5):
    # A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

Request: 5; Frequency: 0.35699247350310304 requests/s


In [16]:
from warnings import warn

warn("Warning Simulation")

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(1980,2019)]

In [22]:
headers = {"Accept-Language": "en-US, en;q=0.5"}

In [20]:
from IPython.core.display import clear_output

start_time = time()
requests = 0

for _ in range(5):
    # A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

Request: 5; Frequency: 0.5551294371665494 requests/s


In [23]:

# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + 
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')  
            break 

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year 
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

Request:93; Frequency: 0.07526210851288564 requests/s


In [25]:
import pandas as pd

movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})
print(movie_ratings.info())
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2948 entries, 0 to 2947
Data columns (total 5 columns):
movie        2948 non-null object
year         2948 non-null object
imdb         2948 non-null float64
metascore    2948 non-null int64
votes        2948 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 115.2+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Star Wars: Episode V - The Empire Strikes Back,(1980),8.8,82,1032295
1,The Shining,(1980),8.4,63,764451
2,Raging Bull,(1980),8.2,89,283057
3,The Elephant Man,(1980),8.2,77,197620
4,Airplane!,(1980),7.8,78,187477
5,The Blues Brothers,(1980),7.9,60,165181
6,Friday the 13th,(1980),6.5,22,103500
7,Caddyshack,(1980),7.3,48,95199
8,Superman II,(1980),6.8,87,89468
9,The Blue Lagoon,(1980),5.7,31,57574


In [26]:
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Star Wars: Episode V - The Empire Strikes Back,(1980),8.8,82,1032295
1,The Shining,(1980),8.4,63,764451
2,Raging Bull,(1980),8.2,89,283057
3,The Elephant Man,(1980),8.2,77,197620
4,Airplane!,(1980),7.8,78,187477


In [27]:
movie_ratings['year'].unique()

array(['(1980)', '(1981)', '(I) (1981)', '(1982)', '(1983)', '(1984)',
       '(1985)', '(1986)', '(1987)', '(1988)', '(1989)', '(1990)',
       '(I) (1990)', '(1991)', '(1992)', '(1993)', '(1994)', '(1995)',
       '(I) (1995)', '(1996)', '(1997)'], dtype=object)

In [28]:
movie_ratings.loc[:, 'year'] = movie_ratings['year'].str[-5:-1].astype(int)

In [29]:
movie_ratings['year'].head(3)

0    1980
1    1980
2    1980
Name: year, dtype: int32

In [30]:
movie_ratings.describe().loc[['min', 'max'], ['imdb', 'metascore']]


Unnamed: 0,imdb,metascore
min,2.9,12.0
max,9.3,100.0


In [31]:
movie_ratings['n_imdb'] = movie_ratings['imdb'] * 10
movie_ratings.head(3)

Unnamed: 0,movie,year,imdb,metascore,votes,n_imdb
0,Star Wars: Episode V - The Empire Strikes Back,1980,8.8,82,1032295,88.0
1,The Shining,1980,8.4,63,764451,84.0
2,Raging Bull,1980,8.2,89,283057,82.0


In [32]:
movie_ratings.to_csv('movie_ratings.csv')