In [5]:
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Root of the site being scraped; relative links are resolved against it.
base_url = 'http://quotes.toscrape.com/'

# Author slugs collected while scraping quote pages, so that each author
# page is fetched only once afterwards.
authors_seen = set()

# SQLite database (file quotes.db) that receives the scraped rows.
db = dataset.connect('sqlite:///quotes.db')
def clean_url(url):
    """Reduce a site link to its slug.

    The link is resolved against ``base_url``, and the piece at index 2 of
    the resulting path split on '/' is returned, e.g.
    '/author/Steve-Martin' -> 'Steve-Martin' and
    '/tag/change/page/1/' -> 'change'.

    Each intermediate value is printed as a debugging trace.

    Raises:
        IndexError: if the path has fewer than three '/'-separated pieces.
    """
    print("inside clean_url")

    # Resolve relative links so the same parsing works for any href form.
    absolute_url = urljoin(base_url, url)
    print(absolute_url)

    # Keep only the path component (drops scheme, host, query, fragment).
    path = urlparse(absolute_url).path
    print(path)

    # '/author/Steve-Martin' splits into ['', 'author', 'Steve-Martin'],
    # so the slug sits at index 2.
    slug = path.split('/')[2]
    print(slug)
    return slug
def scrape_quotes(html_soup):
    """Store every quote found on one parsed quote-listing page.

    For each ``div.quote`` element, the quote text, the author slug and
    the tag slugs are extracted. The author slug is added to
    ``authors_seen``, one row is inserted into the ``quotes`` table, and
    one row per tag is inserted into the ``quote_tags`` table.

    Args:
        html_soup: parsed page to search for ``div.quote`` elements.
    """
    for quote_block in html_soup.select('div.quote'):
        text = quote_block.find(class_='text').get_text(strip=True)

        # The author link is the <a> sibling that follows the author element.
        author_link = quote_block.find(class_='author').find_next_sibling('a')
        author_slug = clean_url(author_link.get('href'))

        tag_slugs = []
        for tag_link in quote_block.find_all('a', class_='tag'):
            tag_slugs.append(clean_url(tag_link.get('href')))

        authors_seen.add(author_slug)

        # Store the quote first; the value insert() returns is used to
        # link the tag rows to it.
        quote_id = db['quotes'].insert({'text': text, 'author': author_slug})
        tag_rows = [{'quote_id': quote_id, 'tag_id': slug} for slug in tag_slugs]
        db['quote_tags'].insert_many(tag_rows)
def scrape_author(html_soup, author_id):
    """Store the details found on one parsed author page.

    The stripped text of the elements with classes ``author-title``,
    ``author-born-date``, ``author-born-location`` and
    ``author-description`` is read and inserted, together with
    ``author_id``, as one row of the ``authors`` table.

    Args:
        html_soup: parsed author page.
        author_id: value stored in the row's ``author_id`` column.
    """
    # Column name -> CSS class of the element holding its value, in the
    # order the columns are written.
    column_classes = (
        ('name', 'author-title'),
        ('born_date', 'author-born-date'),
        ('born_location', 'author-born-location'),
        ('description', 'author-description'),
    )

    row = {'author_id': author_id}
    for column, css_class in column_classes:
        row[column] = html_soup.find(class_=css_class).get_text(strip=True)

    db['authors'].insert(row)
# Seconds to wait on a request before giving up; without a timeout a
# stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Start by scraping all the quote pages, following the "next" link until
# a page no longer has one.
url = base_url
while True:
    print('Now scraping page:', url)
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on a 4xx/5xx response instead of parsing an error page.
    r.raise_for_status()
    html_soup = BeautifulSoup(r.text, 'html.parser')
    # Scrape the quotes
    scrape_quotes(html_soup)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    if not next_a or not next_a[0].get('href'):
        break
    # The href is resolved against the current page URL.
    url = urljoin(url, next_a[0].get('href'))

# Now fetch out the author information, one page per author slug seen
# while scraping the quotes.
for author_id in authors_seen:
    url = urljoin(base_url, '/author/' + author_id)
    print('Now scraping author:', url)
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    # An error page would lack the author elements scrape_author looks up.
    r.raise_for_status()
    html_soup = BeautifulSoup(r.text, 'html.parser')
    # Scrape the author information
    scrape_author(html_soup, author_id)

Now scraping page: http://quotes.toscrape.com/
inside clean_url
http://quotes.toscrape.com/author/Albert-Einstein
/author/Albert-Einstein
Albert-Einstein
inside clean_url
http://quotes.toscrape.com/tag/change/page/1/
/tag/change/page/1/
change
inside clean_url
http://quotes.toscrape.com/tag/deep-thoughts/page/1/
/tag/deep-thoughts/page/1/
deep-thoughts
inside clean_url
http://quotes.toscrape.com/tag/thinking/page/1/
/tag/thinking/page/1/
thinking
inside clean_url
http://quotes.toscrape.com/tag/world/page/1/
/tag/world/page/1/
world
inside clean_url
http://quotes.toscrape.com/author/J-K-Rowling
/author/J-K-Rowling
J-K-Rowling
inside clean_url
http://quotes.toscrape.com/tag/abilities/page/1/
/tag/abilities/page/1/
abilities
inside clean_url
http://quotes.toscrape.com/tag/choices/page/1/
/tag/choices/page/1/
choices
inside clean_url
http://quotes.toscrape.com/author/Albert-Einstein
/author/Albert-Einstein
Albert-Einstein
inside clean_url
http://quotes.toscrape.com/tag/inspirational/page/1

Now scraping page: http://quotes.toscrape.com/page/3/
inside clean_url
http://quotes.toscrape.com/author/Pablo-Neruda
/author/Pablo-Neruda
Pablo-Neruda
inside clean_url
http://quotes.toscrape.com/tag/love/page/1/
/tag/love/page/1/
love
inside clean_url
http://quotes.toscrape.com/tag/poetry/page/1/
/tag/poetry/page/1/
poetry
inside clean_url
http://quotes.toscrape.com/author/Ralph-Waldo-Emerson
/author/Ralph-Waldo-Emerson
Ralph-Waldo-Emerson
inside clean_url
http://quotes.toscrape.com/tag/happiness/page/1/
/tag/happiness/page/1/
happiness
inside clean_url
http://quotes.toscrape.com/author/Mother-Teresa
/author/Mother-Teresa
Mother-Teresa
inside clean_url
http://quotes.toscrape.com/tag/attributed-no-source/page/1/
/tag/attributed-no-source/page/1/
attributed-no-source
inside clean_url
http://quotes.toscrape.com/author/Garrison-Keillor
/author/Garrison-Keillor
Garrison-Keillor
inside clean_url
http://quotes.toscrape.com/tag/humor/page/1/
/tag/humor/page/1/
humor
inside clean_url
http://qu

Now scraping page: http://quotes.toscrape.com/page/6/
inside clean_url
http://quotes.toscrape.com/author/Jane-Austen
/author/Jane-Austen
Jane-Austen
inside clean_url
http://quotes.toscrape.com/tag/friendship/page/1/
/tag/friendship/page/1/
friendship
inside clean_url
http://quotes.toscrape.com/tag/love/page/1/
/tag/love/page/1/
love
inside clean_url
http://quotes.toscrape.com/author/Eleanor-Roosevelt
/author/Eleanor-Roosevelt
Eleanor-Roosevelt
inside clean_url
http://quotes.toscrape.com/tag/attributed/page/1/
/tag/attributed/page/1/
attributed
inside clean_url
http://quotes.toscrape.com/tag/fear/page/1/
/tag/fear/page/1/
fear
inside clean_url
http://quotes.toscrape.com/tag/inspiration/page/1/
/tag/inspiration/page/1/
inspiration
inside clean_url
http://quotes.toscrape.com/author/Marilyn-Monroe
/author/Marilyn-Monroe
Marilyn-Monroe
inside clean_url
http://quotes.toscrape.com/tag/attributed-no-source/page/1/
/tag/attributed-no-source/page/1/
attributed-no-source
inside clean_url
http://q

inside clean_url
http://quotes.toscrape.com/author/John-Lennon
/author/John-Lennon
John-Lennon
inside clean_url
http://quotes.toscrape.com/tag/beatles/page/1/
/tag/beatles/page/1/
beatles
inside clean_url
http://quotes.toscrape.com/tag/connection/page/1/
/tag/connection/page/1/
connection
inside clean_url
http://quotes.toscrape.com/tag/dreamers/page/1/
/tag/dreamers/page/1/
dreamers
inside clean_url
http://quotes.toscrape.com/tag/dreaming/page/1/
/tag/dreaming/page/1/
dreaming
inside clean_url
http://quotes.toscrape.com/tag/dreams/page/1/
/tag/dreams/page/1/
dreams
inside clean_url
http://quotes.toscrape.com/tag/hope/page/1/
/tag/hope/page/1/
hope
inside clean_url
http://quotes.toscrape.com/tag/inspirational/page/1/
/tag/inspirational/page/1/
inspirational
inside clean_url
http://quotes.toscrape.com/tag/peace/page/1/
/tag/peace/page/1/
peace
inside clean_url
http://quotes.toscrape.com/author/W-C-Fields
/author/W-C-Fields
W-C-Fields
inside clean_url
http://quotes.toscrape.com/tag/humor/

Now scraping author: http://quotes.toscrape.com/author/John-Lennon
Now scraping author: http://quotes.toscrape.com/author/W-C-Fields
Now scraping author: http://quotes.toscrape.com/author/Charles-M-Schulz
Now scraping author: http://quotes.toscrape.com/author/William-Nicholson
Now scraping author: http://quotes.toscrape.com/author/Stephenie-Meyer
Now scraping author: http://quotes.toscrape.com/author/Haruki-Murakami
Now scraping author: http://quotes.toscrape.com/author/Martin-Luther-King-Jr
Now scraping author: http://quotes.toscrape.com/author/Terry-Pratchett
Now scraping author: http://quotes.toscrape.com/author/E-E-Cummings
Now scraping author: http://quotes.toscrape.com/author/Steve-Martin
Now scraping author: http://quotes.toscrape.com/author/Dr-Seuss
Now scraping author: http://quotes.toscrape.com/author/Douglas-Adams
Now scraping author: http://quotes.toscrape.com/author/Helen-Keller
Now scraping author: http://quotes.toscrape.com/author/Mark-Twain
Now scraping author: http://q