In [1]:
# We begin with a simple example of how to use BeautifulSoup to parse a downloaded html page.

from bs4 import BeautifulSoup

# Parse the saved play-by-play page. The context manager closes the file
# handle once BeautifulSoup has read the markup, instead of leaking it.
with open('2014102803.html') as page:
    soup = BeautifulSoup(page, 'html.parser')

In [2]:
# We could also do this off a requested page.
# Note this is not the same play-by-play log corresponding to the html page opened above.

# import requests
#
# r = requests.get('http://knbr.stats.com/nba/pbp.asp?gamecode=2016102701&home=1&vis=27')
# soup = BeautifulSoup(r.content)

In [3]:
# We parse the page linearly by traversing the tags in the html tree.

# Start at the first <tr> in the document and hop two siblings along;
# the printed text below shows this lands on the first starting-lineup row.
firstPlayer = soup.find("tr").next_sibling.next_sibling
print(firstPlayer.get_text())

112:00NOStarting Lineup - Eric Gordon00


In [4]:
# As with most trees, you can access the parents, siblings, and children of a node (represented by tags).

# The next sibling of the first lineup row is the next lineup row.
secondPlayer = firstPlayer.next_sibling
print(secondPlayer.get_text())

112:00NOStarting Lineup - Omer Asik00


In [5]:
# We can also do parsing with the 'find' function.

# Walk the sibling chain from the first lineup row and print each starter's name.
# The `is not None` guard ends the loop cleanly when the sibling chain runs out
# (next_sibling is None on the last sibling) instead of raising AttributeError.
player = firstPlayer
while player is not None and "Starting Lineup" in player.text:
    # The third "shsNamD" cell carries text such as "Starting Lineup - Eric Gordon".
    name = player.find_all("td", {"class" : "shsNamD"})[2].text
    # Keep only what follows the first "- " separator.
    name = name[name.find('-')+2:]
    print(name)

    player = player.next_sibling

Eric Gordon
Omer Asik
Jrue Holiday
Tyreke Evans
Anthony Davis
Nikola Vucevic
Kyle O'Quinn
Tobias Harris
Elfrid Payton
Evan Fournier


In [13]:
# Moving on to Selenium...
# This module helps when content needs to be loaded dynamically or when a website stores login cookies.
# The example we will use is of scraping some product data off a Safeway website. 

import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Start a Chrome driver (passing the './chromedriver' path to the constructor)
# and navigate to Safeway's shop-by-aisle page.
driver = webdriver.Chrome('./chromedriver')
driver.get("https://shop.safeway.com/ecom/shop-by-aisle")

In [14]:
# Now we show a full example of how we could scrape a page dynamically clicking & combining BeautifulSoup.

# Enter email and password, then submit the sign-in form.
# Credentials come from the SAFEWAY_EMAIL / SAFEWAY_PASSWORD environment
# variables, falling back to an interactive prompt, so that no account
# secrets are stored in the notebook itself.
import os
from getpass import getpass

safeway_email = os.environ.get("SAFEWAY_EMAIL") or input("Safeway email: ")
safeway_password = os.environ.get("SAFEWAY_PASSWORD") or getpass("Safeway password: ")

enter_email = driver.find_element_by_id("SignIn_EmailAddress")
enter_email.send_keys(safeway_email)
enter_pass = driver.find_element_by_id("SignIn_Password")
enter_pass.send_keys(safeway_password)
signIn_click = driver.find_element_by_name("signInButton")
signIn_click.click()

In [15]:
# Let's define some functions for data scraping...
# Define a function for writing a row.

# write row to csv file
def writer (row):
    """Append one row to 'safewayScrape.csv' in the current working directory.

    Args:
        row: iterable of field values; written as a single comma-delimited
            CSV record.

    The file is opened in append mode, so repeated calls accumulate rows.
    """
    # Imported here because none of the notebook cells visible in this file
    # import csv; without it the call fails with NameError.
    import csv

    # newline='' is what the csv module documentation asks for: it stops the
    # text layer from translating the writer's own line terminators.
    with open('safewayScrape.csv', 'a', newline='') as outfile:
        rowWriter = csv.writer(outfile, delimiter = ',')
        rowWriter.writerow(row)

In [16]:
# We define a function to try to load a page.
# To see if a page is loaded, we query a specific XPATH address of the html page.
# If we find it, we proceed; if we don't find it after 5 seconds, a TimeoutException is raised (and handled inside tryLoad).

# try to load a page
# if we can't find the queried object, search for products
# if we can't find products, refresh the page and try again (max 5 times)
def tryLoad (driver, query, i):
    """Wait for `query` to appear on the current page, refreshing on failure.

    Each attempt waits up to 5 seconds for an element matching the XPath
    `query`. If that times out, the page is checked once for product forms;
    if neither is present the page is refreshed and the wait is retried.
    Attempts are numbered from `i` up to 4.

    The previous recursive version re-entered its own fallback branch with
    the same arguments whenever the product form was also missing, so it
    recursed without bound and never reached the refresh. This version is a
    bounded loop with the same return codes.

    Args:
        driver: Selenium webdriver whose current page is inspected.
        query: XPath of the element whose presence means "page loaded".
        i: number of attempts already used (callers in this file pass 0).

    Returns:
        The attempt index (0-4) at which `query` was found;
        5 if the page never loaded within the allowed attempts;
        6 if `query` was absent but product forms were found instead.
    """
    tryFindProduct = "//div[@class='widget widget-type-section level-2 id-productItem state-commandAdd']/div[@class='widget-content']/form"
    wait5 = WebDriverWait(driver, 5)

    for attempt in range(i, 5):
        try:
            wait5.until(EC.presence_of_element_located((By.XPATH, query)))
            return attempt
        except TimeoutException:
            pass

        # Fallback: the page may already be a product listing. Skipped when
        # `query` is itself the product XPath, since that wait just timed out.
        if query != tryFindProduct:
            try:
                wait5.until(EC.presence_of_element_located((By.XPATH, tryFindProduct)))
                return 6
            except TimeoutException:
                pass

        print("Page not loaded... try ", attempt)
        driver.refresh()

    return 5

In [17]:
# This is a recursive function to go down the SafeWay category tree.
# I find html elements by their XPATH using Selenium.

# main recursive function that gets us down the category tree
# keeps track of levels (depth of category tree), builds csv row for writing
def goDownCat (driver, level, row):
    """Descend the category tree, following the first category at each level.

    At every level the first listed category is printed, clicked, and
    appended to a copy of `row`; the walk ends by calling getProducts either
    when level 5 is reached or when tryLoad reports that the page already
    shows products (return code 6).

    Args:
        driver: Selenium webdriver positioned on a category page.
        level: depth in the category tree, used to build the
            "level-<n>" XPath class for this page.
        row: list of category names collected so far; not modified.

    Raises:
        RuntimeError: if the page does not load (tryLoad returns 5) or no
            category entries are found at this level. Previously the
            load failure called quit(), which shuts down the notebook
            kernel rather than reporting an error.
    """
    if (level == 5):
        getProducts(driver, level, row)
        return

    query = "//li[@class='level-" + str(level) + "']/a/img"

    t = tryLoad(driver, query, 0)
    if (t == 5):
        raise RuntimeError("Error! Page would not load after 5 tries")
    if (t == 6):
        # Products are already on this page: scrape them instead of descending.
        getProducts(driver, level, row)
        return

    cat_list_addr = driver.find_elements_by_xpath("//li[@class='level-" + str(level) + "']/a/span[@class='id-name']")
    if not cat_list_addr:
        raise RuntimeError("No level-" + str(level) + " category entries found on the page")

    # Only the first category is followed.
    cat = cat_list_addr[0]
    # The name is read before clicking, as in the original statement order
    # (presumably the element is no longer usable after navigation).
    category = cat.text
    print(category)
    cat.click()

    # Pass a fresh list down so the caller's row is left untouched.
    goDownCat(driver, level+1, list(row) + [category])


In [18]:
# Scraping data with Selenium is slow.
# We extract the html content from the Selenium webdriver object and stick it in BeautifulSoup for processing.
# This achieves great speedup.

# once products are detected on the page, scrape the necessary info and write into csv file
def getProducts (driver, level, row):
    """Scrape every product tile on the current page and print one row each.

    The page HTML is taken from the Selenium driver and parsed with
    BeautifulSoup. Each printed row is a numpy array of strings laid out as:
    product id, the category names from `row` (padded with "NA"), product
    name, picture URL, price, sale description, sale end date, price
    description, measure type.

    Changes from the earlier version: the caller's `row` is no longer padded
    in place, the tiles are located with a single find_all pass, and the
    trailing quit() (which shut down the notebook kernel after the first
    page) is removed so the function simply returns.

    Args:
        driver: Selenium webdriver positioned on a product listing page.
        level: category depth at which products were found; levels below 5
            get one "NA" placeholder, levels below 4 get a second one.
        row: list of category names collected so far; not modified.
    """
    # Work on a copy so the padding below does not leak into the caller's list.
    row = list(row)
    if (level < 5):
        row.append("NA")
    if (level < 4):
        row.append("NA")

    tile_class = "widget widget-type-section level-2 id-productItem state-commandAdd"
    query = "//div[@class='" + tile_class + "']/div[@class='widget-content']/form"
    # Wait for the product forms; the return code is not used here.
    tryLoad (driver, query, 0)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for tile in soup.find_all("div", {"class" : tile_class}):
        product_name = tile.find("div", {"class" : "widget-header"}).find("span").text
        product_content = tile.find("div", {"class" : "widget-content"}).find("form")

        product_id = product_content.find("input", {"id" : "Id"}).get("value")
        product_pic = "https://shop.safeway.com" + product_content.find("div", {"class" : "id-image"}).find("img").get("data-sdc-src-state-ui-richinfo")

        # Price description is the text between the first "(" and the first ")".
        product_price_desc = product_content.find("span", {"class" : "id-priceDescription"}).text
        product_price_desc = product_price_desc[(product_price_desc.find("(")+1):product_price_desc.find(")")]
        product_price = product_content.find("input", {"id" : "Price"}).get("value")
        if "/" in product_price:
            # A price containing "/" is treated as a description, not an amount.
            product_price_desc = product_price.lower()
            product_price = "NA"

        # Promo text and end date are optional: a missing element makes find()
        # return None, and the resulting AttributeError maps to "NA".
        try:
            product_sale_desc = product_content.find("div", {"class" : "id-promo"}).find("span", {"class" : "id-text"}).text
            product_sale_desc = ' '.join(product_sale_desc.split("\n"))
        except AttributeError:
            product_sale_desc = "NA"
        try:
            product_sale_end_date = product_content.find("span", {"class" : "id-endDate"}).text
            product_sale_end_date = product_sale_end_date[(product_sale_end_date.find("through ")+8):(product_sale_end_date.find(")"))]
        except AttributeError:
            product_sale_end_date = "NA"

        product_measure = product_content.find("input", {"id" : "MeasureType"}).get("value").strip()

        # Built as a numpy string array so the printed form stays the same
        # as the array output recorded in the notebook.
        writeRow = np.array(
            [product_id] + row + [product_name, product_pic, product_price, product_sale_desc, product_sale_end_date, product_price_desc, product_measure]
        )

        print(writeRow)

In [19]:
# Start experiment: begin the walk at category level 2 with an empty row.
goDownCat(driver, level=2, row=[])

Baby Care
Baby Accessories
Bottles & Nursing
['960078852' 'Baby Care' 'Baby Accessories' 'Bottles & Nursing'
 'Avent Bottle Natural 9 Ounce - Each'
 'https://shop.safeway.com/productimages/200x200/960078852_200x200.jpg'
 '12.39' 'NA' 'NA' '$12.39/each' 'Ea']
['960056487' 'Baby Care' 'Baby Accessories' 'Bottles & Nursing'
 'Gerber First Essentials Bottles Silicone Medium Flow 9 Ounce Months Plus - 3 Count'
 'https://shop.safeway.com/productimages/200x200/960056487_200x200.jpg'
 '5.59' 'NA' 'NA' '$1.86/count' 'Ct']
['960013569' 'Baby Care' 'Baby Accessories' 'Bottles & Nursing'
 'Lansinoh Breastmilk Storage Bags - 50 Count'
 'https://shop.safeway.com/productimages/200x200/960013569_200x200.jpg'
 '13.49' 'NA' 'NA' '$0.27/count' 'Ct']
['165450127' 'Baby Care' 'Baby Accessories' 'Bottles & Nursing'
 'Lansinoh Disposable Nursing Pads - 60 Count'
 'https://shop.safeway.com/productimages/200x200/165450127_200x200.jpg'
 '11.19' 'NA' 'NA' '$0.19/count' 'Ct']
['960123711' 'Baby Care' 'Baby Access