In [9]:
# Our jupyter/datascience-notebook Docker container comes with 
# BeautifulSoup4 and requests, both popular libraries!

from bs4 import BeautifulSoup
import requests

In [10]:
START_URL = 'https://brickset.com/sets/year-2016'

In [103]:
# Exercise #1: Get the titles for each "brickset" on the first page

def get_soup(url):
    contents = requests.get(url).content
    return BeautifulSoup(contents, 'html.parser')

def get_titles(soup):    
    """ Returns a list of titles on the page """
    # the "soup" parameter is of the type that is
    # returned by Beautiful Soup when it parses HTML.
    # The function should then use the object to
    # extract a list of titles (of the lego sets)
    #
    # Lookup the documentation for Beautiful Soup
    # Figure out how to select the text of the title
    # of each legoset. A title should look like: 
    # "10252: Volkswagen Beetle"
    tags = soup.select('h1 a')
    titles = [h1.get_text() for h1 in tags]
    return titles


def parse_bricks(url):
    """ Fetches Lego Bricks page and extracts titles """
    # Lookup the documentation to the "requests" library
    #
    # Use requests to make a get request to the
    # url given in the argument "url" (which is a string)
    # and get the raw HTML body of the response
    #
    # Use "BeautifulSoup" to parse this HTML. 
    #
    # Use the "get_titles" function to extract the
    # titles from the BeautifulSoup object.
    #
    # Return the titles
    soup = get_soup(START_URL)
    return get_titles(soup)

In [104]:
bricks = parse_bricks(START_URL)
bricks

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane']

In [9]:
# Exercise #2

# Now write code that gets you all the links from ALL the pages.

# HINT: you will probably want to extract the URL in the "next" button on 
# the bottom of the search pagination, which looks like ">".

# HINT HINT: Think of the previous exercise on API's and internet data.
# The Pokemon API returned JSON, that we converted to a dictionary, that
# had a nice structure. In particular, there were two top-level keys of interest, 
# one had the "results" in a list, the other was the "next" url to call to get
# more items. If you can replicate this return structure, you will be able to 
# almost reuse the while loop you had there!

# HINT HINT HINT: There's no reason you shouldn't be able to reuse the previous 
# functions (get_titles and parse_bricks)

In [102]:
# note: reusing the previous functions isn't ideal because parse_bricks loads the page but discards the loaded page
# so we'll just reuse get_titles and not parse_bricks

def get_next(soup):
    list_item = soup.find('li', ['next'])
    if list_item: 
        anchor = list_item.find('a')
        if anchor:
            return anchor['href']
    return None

def get_all_titles(start_url):
    next_url = start_url
    titles = []
    while next_url:
        print(next_url)
        soup = get_soup(next_url)
        titles += get_titles(soup)
        next_url = get_next(soup)
    return titles
    
titles = get_all_titles(START_URL)


https://brickset.com/sets/year-2016
https://brickset.com/sets/year-2016/page-2
https://brickset.com/sets/year-2016/page-3
https://brickset.com/sets/year-2016/page-4
https://brickset.com/sets/year-2016/page-5
https://brickset.com/sets/year-2016/page-6
https://brickset.com/sets/year-2016/page-7
https://brickset.com/sets/year-2016/page-8
https://brickset.com/sets/year-2016/page-9
https://brickset.com/sets/year-2016/page-10
https://brickset.com/sets/year-2016/page-11
https://brickset.com/sets/year-2016/page-12
https://brickset.com/sets/year-2016/page-13
https://brickset.com/sets/year-2016/page-14
https://brickset.com/sets/year-2016/page-15
https://brickset.com/sets/year-2016/page-16
https://brickset.com/sets/year-2016/page-17
https://brickset.com/sets/year-2016/page-18
https://brickset.com/sets/year-2016/page-19
https://brickset.com/sets/year-2016/page-20
https://brickset.com/sets/year-2016/page-21
https://brickset.com/sets/year-2016/page-22
https://brickset.com/sets/year-2016/page-23
http