In [None]:
# Before we begin, run this cell if you are using Colab
!git clone -b 3-ysi-tutorial https://github.com/nestauk/im-tutorials.git

Cloning into 'im-tutorials'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (158/158), done.[K
remote: Total 230 (delta 95), reused 184 (delta 55), pack-reused 0[K
Receiving objects: 100% (230/230), 10.54 MiB | 16.78 MiB/s, done.
Resolving deltas: 100% (95/95), done.


# Web Scraping

In [None]:
# `display` and `HTML` belong to the public `IPython.display` API. Importing them from
# the internal `IPython.core.display` module has been deprecated since IPython 7.14.
from IPython.display import display, HTML

In [None]:
# Render a small sample HTML document in the cell output to illustrate common tags:
# headings, paragraphs, inline formatting, a link, line breaks, inline CSS,
# unordered/ordered lists, a horizontal rule and an HTML comment.
display(HTML("""
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
  <title>Intro to HTML</title>
</head>

<body>
  <h1>Sample Website</h1>
  <h2>Heading h2</h2>
  <h3>Heading h3</h3>
  <h4>Heading h4</h4>

  <p>
    That's a text paragraph. You can also <b>bold</b>, <mark>mark</mark>, <ins>underline</ins>, <del>strikethrough</del> and <i>emphasize</i> words.
    You can also add links - here's one to <a href="https://en.wikipedia.org/wiki/Main_Page">Wikipedia</a>.
  </p>

  <p>
    This <br> is a paragraph <br> with <br> line breaks
  </p>

  <p style="color:red">
    Add colour to your paragraphs.
  </p>

  <p>Unordered list:</p>
  <ul>
    <li>Python</li>
    <li>R</li>
    <li>Julia</li>
  </ul>

  <p>Ordered list:</p>
  <ol>
    <li>Data collection</li>
    <li>Exploratory data analysis</li>
    <li>Data analysis</li>
    <li>Policy recommendations</li>
  </ol>
  <hr>

  <!-- This is a comment -->

</body>
</html>
"""))

In [None]:
# Imports
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

%matplotlib inline

In [None]:
# IMDB's homepage
imdb_url = 'https://www.imdb.com'

# Use requests to retrieve data from a given URL.
# The timeout stops the cell from hanging indefinitely if the server never answers.
imdb_response = requests.get(imdb_url, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of silently parsing an error page
imdb_response.raise_for_status()

# Parse the whole HTML page using BeautifulSoup
imdb_soup = BeautifulSoup(imdb_response.text, 'html.parser')

# Title of the parsed page (last expression, so the notebook displays it)
imdb_soup.title

<title>IMDb: Ratings, Reviews, and Where to Watch the Best Movies &amp; TV Shows</title>

In [None]:
# We can also get it without the HTML tags
imdb_soup.title.string

'IMDb: Ratings, Reviews, and Where to Watch the Best Movies & TV Shows'

### Collect trailers' title and description

In [None]:
# Keep the first <div class="ab_hero"> of the page; the next cells search inside it.
# NOTE(review): .find() returns None when nothing matches, in which case the later
# trailers.find_all(...) calls would fail - confirm the class still exists on the live page.
trailers = imdb_soup.find('div', {'class': 'ab_hero'})

In [None]:
# print(trailers.prettify())

We will use the `.find_all()` method to search the HTML tree for particular tags and get a `list` with all the relevant objects.

In [None]:
# Pair each <div class="onoverflow"> with the <img class="pri_image"> at the same
# position and print "<div text>: <img title attribute>".
for title, image in zip(
    trailers.find_all('div', {'class': 'onoverflow'}),
    trailers.find_all('img', {'class': 'pri_image'}),
):
    # end='\n\n' leaves one empty line after every entry
    print(f"{title.text}: {image['title']}", end='\n\n')

### Collect side bar

In [None]:
for widget in imdb_soup.find_all('div', {'class': 'aux-content-widget-2'}):
    # Widgets without a heading are skipped
    if not widget.h3:
        continue
    # Heading first, then one movie title per line, then an empty separator line
    print(widget.h3.string)
    for title in widget.find_all('div', {'class': 'title'}):
        print(title.text)
    print()

### Collect articles

In [None]:
for article in imdb_soup.find_all('div', {'class': 'article'}):
    # Only articles that have a heading are printed
    if article.h3 is None:
        continue
    # Title of the article
    print(article.h3.string)
    # Text - an article block may have a heading but no <p>; skip the text in that
    # case instead of raising AttributeError on `None.text`
    if article.p is not None:
        print(article.p.text)
    print()

### Find links

In many cases, it is useful to collect the links contained in a webpage (for example, you might want to scrape them too). Here is how you can do this.

In [None]:
# Find all links (anchors without an href yield None and are dropped below)
links = [link.get('href') for link in imdb_soup.find_all('a')]

# Resolve every link against the homepage and keep the unique ones.
# urljoin turns relative links into absolute ones
# ('/chart/top' -> 'https://www.imdb.com/chart/top') and leaves links that are already
# absolute untouched, instead of gluing the homepage in front of them.
fixed_links = {urljoin(imdb_url, link) for link in links if link}

In [None]:
# fixed_links

## From data to analysis


In [None]:
# Box Office Mojo - UK Weekend box office
boxofficemojo_url = 'https://www.boxofficemojo.com/intl/uk/?yr=2019&wk=33&currency=local'

# Use requests to retrieve data from a given URL.
# The timeout stops the cell from hanging indefinitely if the server never answers.
bom_response = requests.get(boxofficemojo_url, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of silently parsing an error page
bom_response.raise_for_status()

# Parse the whole HTML page using BeautifulSoup
bom_soup = BeautifulSoup(bom_response.text, 'html.parser')

In [None]:
# There are 7 tables in the Box Office Mojo page but we are interested in the one with the most data (table 5).
print(f"NUMBER OF TABLES IN THE PAGE: {len(bom_soup.find_all('table'))}")

In [None]:
# Python starts counting from 0
table = bom_soup.find_all('table')[4]

In [None]:
# table

In [None]:
# Using the .contents attribute (not a method): the list of the row's direct children
table.find_all('tr')[2].contents

In [None]:
# Using the .text property: all the text inside the row as a single string
table.find_all('tr')[2].text

In [None]:
# print() renders the newline characters as line breaks instead of showing them as '\n'
print(table.find_all('tr')[2].text)

In [None]:
# Split string on newline characters
table.find_all('tr')[2].text.split('\n')

In [None]:
# Loop through the cells of a row and print their data
for data in table.find_all('tr')[2].find_all('td'):
    print(data.text)

In [None]:
# Table's column names
for data in table.find_all('tr')[1].find_all('td'):
    print(data.text)

In [None]:
# Build one Pandas Series per table row, holding the text of the row's <td> cells.
# The [1:-1] slice leaves out the first and the last <tr> of the table.
lst = [
    pd.Series([cell.text for cell in row.find_all('td')])
    for row in table.find_all('tr')[1:-1]
]

In [None]:
# Concatenate the Pandas Series in a DataFrame
data = pd.concat(lst, axis=1).T

In [None]:
# The first line contains the header - let's fix that!
data.head(2)

In [None]:
# Build the column names from the first row (its last cell is left out).
# A cell containing a slash names several columns and is split on '/'.
header = []
for col in data.iloc[0, :-1]:
    header += col.split('/') if '/' in col else [col]

# Drop the header row from the data, then install the new column names
data = data.iloc[1:]
data.columns = header

In [None]:
def _to_int(value, skip=0):
    """Convert a scraped string to int; anything that is not a string (e.g. NaN) is returned as is.

    The first `skip` characters (such as a currency symbol) are dropped and thousands
    separators (',') are removed before the conversion.
    """
    if not isinstance(value, str):
        return value
    return int(value[skip:].replace(',', ''))


# Replace the 'n/a' and '-' placeholder strings with a Null value.
# Re-assigning (rather than inplace=True) yields a fresh DataFrame, so the column
# assignments below do not write into the `data[1:]` slice created earlier.
data = data.replace(['n/a', '-'], np.nan)

# Remove the £ symbol from the "Gross" columns and transform strings to integers.
# Missing values pass through untouched - previously `x[1:]` raised a TypeError on NaN.
data['Weekend Gross'] = data['Weekend Gross'].apply(_to_int, skip=1)
data['Gross-to-Date'] = data['Gross-to-Date'].apply(_to_int, skip=1)

# Transform strings to integers
data['Theaters'] = data['Theaters'].apply(_to_int)
data['Week'] = data['Week'].apply(_to_int)

# Create a new variable showing how much a movie grossed on average on weekly basis
data['Week AVG'] = data['Gross-to-Date'].div(data['Week'])

# Set the movie title as index
data = data.set_index('Movie')

In [None]:
data.head(2)

In [None]:
print(f'(MOVIES, COLUMNS) -> {data.shape}')

In [None]:
print(f'% OF MISSING VALUES PER COLUMN\n{(data.isnull().sum() / data.shape[0]) * 100}')

In [None]:
# Count how many rows each studio has with .value_counts() and draw the counts as bars
data['Studio'].value_counts().plot.bar(title='Studios with the most movies in the top 55')
plt.show()

In [None]:
# Two side-by-side bar charts; each shows the first 25 entries of a column
# after sorting it in descending order with .sort_values()
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,8))

# ax1: average gross per week
data['Week AVG'].sort_values(ascending=False).head(25).plot.bar(title='Weekly Gross earnings', ax=ax1)
# ax2: number of theaters
data['Theaters'].sort_values(ascending=False).head(25).plot.bar(title='Number of theaters showing a movie', ax=ax2)

f.tight_layout()
plt.show()

## USING SELENIUM IN COLAB

In [None]:
# # RUN THIS CELL WHEN USING THE NOTEBOOK LOCALLY - YOU SHOULD INSTALL SELENIUM FIRST
# import selenium.webdriver
# # Path to the Chrome driver for my Mac -- yours will differ
# mac_path = '../../chromedriver'
# driver = selenium.webdriver.Chrome(executable_path=mac_path)

In [None]:
# # RUN THIS CELL WHEN USING THE NOTEBOOK ON COLAB - NO PREVIOUS INSTALLATION OF SELENIUM IS NEEDED
# # install chromium, its driver, and selenium
# !apt update
# !apt install chromium-chromedriver
# !pip install selenium
# # set options to be headless
# from selenium import webdriver
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')
# # open it, go to a website, and get results
# driver = webdriver.Chrome('chromedriver',options=options)

In [None]:
def html2df(source, q, table_index=4):
    """A wrapper of the scraping pipeline we used before.

    Parses an HTML page, reads one of its tables into a DataFrame, uses the
    table's first row as column names and tags every row with the page label.

    Parameters
    ----------
    source : str
        HTML of the page (for example Selenium's ``driver.page_source``).
    q : object
        Label stored in the ``page`` column of every returned row.
    table_index : int, default 4
        0-based position of the wanted table among the page's ``<table>`` tags.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first one, with the text of its ``<td>``
        cells, plus a ``page`` column.

    Raises
    ------
    IndexError
        If the page has no table at position ``table_index``.
    """
    # Parse the HTML page
    soup = BeautifulSoup(source, 'html.parser')

    # Choose the relevant table, with an explicit message when the page layout differs
    tables = soup.find_all('table')
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            f'table_index={table_index} is out of range: the page contains {len(tables)} table(s)'
        ) from None

    # Parse and store the data of every table row
    lst = [
        pd.Series([cell.text for cell in row.find_all('td')])
        for row in table.find_all('tr')
    ]

    # Concatenate the data in a Pandas DataFrame (one row per <tr>)
    data = pd.concat(lst, axis=1).T

    # Grab the first row for the header
    new_header = data.iloc[0]

    # Take the data less the header row. The explicit copy makes it an independent
    # frame, so adding a column below does not trigger SettingWithCopyWarning.
    data = data[1:].copy()

    # Set the header row as the df header
    data.columns = new_header

    # Add a new column tagging the page we scraped
    data['page'] = q

    return data

In [None]:
# URL to use in Selenium
driver.get('https://www.boxofficemojo.com/intl/uk/yearly/')

In [None]:
# Scrape the page that is currently loaded, then walk through the remaining pages
lst = [html2df(driver.page_source, '#1')]
for i in ['#101', '#201', '#301', '#401']:
    # Locate Hyperlinks by partial link text.
    # The find_element_by_* helpers were removed in Selenium 4.3; the generic
    # find_element(by, value) call works in both Selenium 3 and 4.
    # 'partial link text' is the value of By.PARTIAL_LINK_TEXT.
    elem = driver.find_element('partial link text', i)
    # Click on the next page
    elem.click()
    # Store the Pandas DataFrame with the scraped content in a list
    # NOTE(review): there is no explicit wait after the click - confirm that
    # page_source already reflects the new page when it is read here.
    lst.append(html2df(driver.page_source, i))

# Concatenate all Pandas DataFrames
df = pd.concat(lst)

In [None]:
df.head(2)

In [None]:
print(f'(MOVIES, COLUMNS) -> {df.shape}')

## TEST

Use Selenium to scrape Box Office Mojo's top \#100 for every year between 2002 and 2019.


In [None]:
url = 'https://www.boxofficemojo.com/intl/uk/yearly/'

In [None]:
# Print the robots.txt of each site, with a dashed line between consecutive files
robots_urls = [
    'https://www.nesta.org.uk/robots.txt',
    'https://www.boxofficemojo.com/robots.txt',
    'https://www.howtogeek.com/robots.txt',
]
for position, robots_url in enumerate(robots_urls):
    # The separator goes before every file except the first one
    if position:
        print('-----')
    print(requests.get(robots_url).text)

In [None]:
# Custom headers sent with the request: 'User-Agent' names the scraper and
# 'From' gives a contact e-mail address.
headers = {
    'User-Agent': 'Kostas Stathoulopoulos bot',
    'From': 'konstantinos.stathoulopoulos@nesta.org.uk'
}
# NOTE: despite its name, `request` holds the value returned by requests.get();
# its `.request.headers` attribute holds the headers of the request that was sent.
request = requests.get('https://www.nesta.org.uk/', headers=headers)
print(request.request.headers)