# Web Scraping Code

In [None]:
# standard library
import datetime
import os
import pickle
import random
import re
import time

# third party
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver  # the module is `webdriver`; `webdrive` does not exist


### Chapter 3 : [Intro BeautifulSoup](https://app.thisismetis.com/courses/144/pages/chapter-3-intro-to-beautifulsoup)

In [None]:
# Locate specific tag elements
.find() # first instance only
.find_all() # all instances

# Extract text
.find().text # only works with .find(); for .find_all(), iterate over the results


In [None]:
# for loop to extract text
for item in soup.find_all('li'):
    print(item.text)
    

In [None]:
# list comprehension to extract text

[item.text for item in soup.find_all('li')]

### Chapter 4 : [More BeautifulSoup Functionality](https://app.thisismetis.com/courses/144/pages/chapter-4-more-beautifulsoup-functionality)

In [None]:
# find anchor tags
soup.find_all('a')

# find element by attribute
dickens = soup.find(id='dickens')

# find element within element
dickens.find_all('a')

# chain methods
soup.find(id='dickens').find_all('a')

# extract attribute values
soup.find('a')
# returns <a href="https...">Pride and Prejudice</a>

soup.find('a')['href']
# returns https...

In [None]:
# for loop to extract href attribute values

for link in soup.find(id='dickens').find_all('a'):
    print(link['href'])


In [None]:
# list comprehension to extract href attribute values

[link['href'] for link in soup.find(id='dickens').find_all('a')]

In [None]:
# find element by class attribute

soup.find_all(class_='author')

# find elements by multiple attributes

soup.find_all(attrs={'class':'author', 'style':'font-size: 14px'})


### Chapter 5 : [Retrieving HTML with Requests](https://app.thisismetis.com/courses/144/pages/chapter-5-retrieving-html-with-requests)

In [None]:
# request a page's html and parse it only when the request succeeded

import requests

url = 'https://www.thisismetis.com/blog'

# without a timeout, requests waits forever if the server never answers
response = requests.get(url, timeout=10)

status = response.status_code

if status == 200:
    page = response.text
    soup = bs(page)
else:
    # soup is NOT (re)assigned on this path, so later cells would see a stale or missing soup
    print(f"Oops! Received status code {status}")


In [None]:
# save html from requests

page_html = response.text

In [None]:
# parse with BS

# `import bs4 import ...` is a syntax error; names are pulled out of a module with `from ... import`
from bs4 import BeautifulSoup as bs

soup = bs(page_html)

In [None]:
# extract html information

soup.find('h1')
soup.find('h1').text

### Chapter 6 : [Locating by Text and DOM Position](https://app.thisismetis.com/courses/144/pages/chapter-6-locating-by-text-and-dom-position)

In [None]:
# locate information by text

soup.find(text='Welcome to Metis') # string must match exactly

In [None]:
# locate information with regular expression (regex)

soup.find(text=re.compile('Welcome')) # returns string with welcome

In [None]:
# locate information by position

soup.find(text='Welcome to Metis').next # returns next element in code
soup.find(text='Welcome to Metis').next.next

In [None]:
# locate information by position-DOM

welcome = soup.find(text='Welcome to Metis')
welcome.parent
welcome.parent.parent

### Chapter 6 : [Locating by Text and DOM Position Exercises](http://localhost:8888/notebooks/Documents/GitHub/metis_dsml/02_regression/regression_exercises/webscraping-chapter6-exercises.ipynb)

In [None]:
# locate the 'Global Support' text node first: the role lookup below depends on it
global_support = bs(requests.get(url).text).find(text='Global Support')

# text of every <h5 class="role"> found under the grandparent of that text node
global_team_roles = [role.text for role in global_support.parent.parent.find_all('h5', class_='role')]

print(global_team_roles)
type(global_team_roles)


### Chapter 7 : [Data Preparation](https://app.thisismetis.com/courses/144/pages/chapter-7-data-preparation?module_item_id=5585)

In [None]:
# find data by text and position

raw_mhi = \
    soup.find(text='Median household income'
             ).next.next
print(raw_mhi)

In [None]:
# extract income string

raw_mhi.split()[0] #returns first element of split

In [None]:
# remove dollar sign and commas

mhi_string = (raw_mhi.split()[0]
                     .replace('$', '')
                     .replace(',', ''))
print(mhi_string)

In [None]:
# convert to integer

mhi = int(mhi_string)

In [1]:
# create custom function to extract item, clean characters, and convert to int

def get_mhi(soup):
    '''
    Extract the median household income from a parsed page as an int.

    Finds the text node that reads exactly 'Median household income',
    steps two elements forward, takes the first whitespace-separated
    token of that element's text and strips '$' and ',' from it.

    soup : parsed document exposing find(text=...), e.g. a BeautifulSoup object

    Returns the income as an int, or None when the label is not found
    or the value cannot be parsed.
    '''
    try:
        label = soup.find(text='Median household income')
        raw_mhi = label.next.next.text
        mhi_string = (raw_mhi.split()[0]
                             .replace('$', '')
                             .replace(',', ''))
        return int(mhi_string)
    except (AttributeError, IndexError, ValueError):
        # AttributeError: the label (or an element after it) is missing
        # IndexError:     the value element has no text to split
        # ValueError:     the first token is not a number
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

### Chapter 7 : [Data Preparation Exercise](http://localhost:8888/notebooks/Documents/GitHub/metis_dsml/02_regression/regression_exercises/webscraping-chapter7-exercises.ipynb)

In [None]:
# authors of the first five blog posts: keep the part before the bullet,
# blank out 'By', then trim surrounding whitespace
author_list = []
for post in soup.find_all(class_='blog-post-details')[:5]:
    byline = post.text.split('•')[0]
    author_list.append(byline.replace('By', ' ').strip())
author_list


### Chapter 9 : [Web Scraping Pitfalls](https://app.thisismetis.com/courses/144/pages/chapter-9-web-scraping-pitfalls)

In [None]:
# add pauses

import time, random

time.sleep(2) # pause two seconds
time.sleep(1 + 2*random.random()) # pause randomly

In [None]:
# try/except for error handling

def get_mhi(soup):
    '''
    Return the median household income found in soup as an int, or None.

    Looks up the text node 'Median household income', moves two elements
    forward, and converts the first whitespace-separated token of that
    element's text to an int after removing '$' and ','.

    Only the errors a missing or malformed value can cause are caught;
    a bare except would also swallow KeyboardInterrupt and real bugs.
    '''
    try:
        raw_mhi = soup.find(
                    text = 'Median household income'
                    ).next.next.text
        first_token = raw_mhi.split()[0]
        mhi_string = first_token.replace('$', '').replace(',', '')
        return int(mhi_string)
    except (AttributeError, IndexError, ValueError):
        # AttributeError -> label or following element not found
        # IndexError     -> value text is empty
        # ValueError     -> token is not numeric
        return None

In [None]:
# serialize data with pickle

import pickle

# serialize data object: write `data` to data.pickle (binary mode is required by pickle)

with open('data.pickle', 'wb') as f:
    pickle.dump(data, f)
    
# read data object from pickle file
# NOTE(review): pickle.load can run arbitrary code from the file; only load pickles you created yourself

with open('data.pickle', 'rb') as g:
    new_data = pickle.load(g)

### Chapter 10 : [Intro to Selenium](https://app.thisismetis.com/courses/144/pages/chapter-10-intro-to-selenium?module_item_id=5588)

In [None]:
import os
from selenium import webdriver  # `webdrive` does not exist; the call below needs `webdriver`

# provide path to chromedriver

chromedriver = '/Applications/chromedriver'
os.environ['webdriver.chrome.driver'] = \
    chromedriver

# NOTE(review): passing the driver path positionally is the Selenium 3 call
# style; confirm it is still accepted by the installed Selenium version
driver = webdriver.Chrome(chromedriver) # launches browser window

driver.get('https://www.thisismetis.com') # navigates the browser to the page

print(driver.page_source[:100]) # first 100 characters of the page's html

In [None]:
from selenium.webdriver.common.by import By

# find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
# which Selenium 4.3+ no longer provides
apply_button = driver.find_element(
    By.XPATH, '//a[@id="application-link"]')
apply_button.click()


# [BeautifulSoup Notebook Lesson](http://localhost:8888/notebooks/Documents/GitHub/metis_dsml/02_regression/regression_exercises/web_scraping_beautifulsoup.ipynb)

In [None]:
# list comprehension to extract text of all list items

todos=[element.text for element in soup.find_all('li')]


### Scraping Multiple Pages
We'll also combine all previous steps into one helper function.

In [None]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect
        - title
        - domestic gross
        - runtime
        - MPAA rating
        - full release date
    Return information as a dictionary with the keys 'movie_title',
    'domestic_total_gross', 'runtime_minutes', 'rating' and 'release_date'.

    Relies on helpers that are not defined in this section:
    money_to_int, get_movie_value, runtime_to_minutes and to_date.
    '''

    base_url = 'https://www.boxofficemojo.com'

    # Create full url to scrape
    url = base_url + link

    # Request HTML and parse.
    # BeautifulSoup is imported in this file under the alias `bs`; the bare
    # name `BeautifulSoup` is never bound and would raise NameError.
    response = requests.get(url)
    soup = bs(response.text, "lxml")

    # Get title.
    # Cut at the LAST hyphen so hyphenated titles are kept whole.
    # NOTE(review): assumes the <title> ends with a '- <site name>' suffix
    # that itself contains no hyphen -- confirm against a live page.
    title = soup.find('title').text.rsplit('-', 1)[0].strip()

    # Get domestic gross: first money span inside the performance summary table
    summary_table = soup.find(class_='mojo-performance-summary-table')
    raw_domestic_total_gross = summary_table.find_all('span', class_='money')[0].text
    domestic_total_gross = money_to_int(raw_domestic_total_gross)

    # Get runtime
    raw_runtime = get_movie_value(soup, 'Running')
    runtime = runtime_to_minutes(raw_runtime)

    # Get rating
    rating = get_movie_value(soup, 'MPAA')

    # Get release date: only the first line of the field is passed to to_date
    raw_release_date = get_movie_value(soup, 'Release Date').split('\n')[0]
    release_date = to_date(raw_release_date)

    # Create movie dictionary and return
    return {
        'movie_title': title,
        'domestic_total_gross': domestic_total_gross,
        'runtime_minutes': runtime,
        'rating': rating,
        'release_date': release_date,
    }

In [None]:
# Build one info dictionary per link stub by calling get_movie_dict on each

g_movies_page_info_list = [get_movie_dict(stub) for stub in g_movies.link_stub]

In [None]:
g_movies_page_info_list

In [None]:
g_movies_page_info = pd.DataFrame(g_movies_page_info_list)  #convert list of dict to df
g_movies_page_info.set_index('movie_title', inplace=True)

g_movies_page_info

In [None]:
# (Note: the rating is indeed missing from a few of these pages!  How could you fix that?)
# We can now match this back up with the movie information collected 
# from the table by merging these dataframes.

g_movies = g_movies.merge(g_movies_page_info, left_index=True, right_index=True)

g_movies

# [BeautifulSoup Docs](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

In [None]:
soup.title
# <title>The Dormouse's story</title>

soup.title.name
# u'title'

soup.title.string
# u'The Dormouse's story'

soup.title.parent.name
# u'head'

soup.p
# <p class="title"><b>The Dormouse's story</b></p>

soup.p['class']
# ['title']  (class is a multi-valued attribute, so a list is returned)

soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [2]:
# One common task is extracting all the URLs found within a 
# page’s <a> tags:

for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

In [None]:
# Another common task is extracting all the text from a page:

print(soup.get_text())
# The Dormouse's story
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
# ...