## **Load in  the necessary libraries**

In [7]:
import requests
from bs4 import BeautifulSoup as bs

## **Load our first page** 

In [8]:
#load the web page content
r = requests.get("https://quotes.toscrape.com")

#convert to a beautifulsoup object
soup = bs(r.content)

#print out our html
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small>
       <a href="/author/Albert

## **Start using beautifulsoup to scrape**

### find and find_all

In [9]:
first_header = soup.find('h1')
headers = soup.find_all('h1')
print(headers)

[<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>]


In [10]:
# pass in a list of elememts to look for 
first_header= soup.find(['h1','div'])
headers = soup.find_all(['h1','span'])
headers


[<h1>
 <a href="/" style="text-decoration: none">Quotes to Scrape</a>
 </h1>,
 <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>,
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>,
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>,
 <span>by <small class="author" itemprop="author">J.K. Rowling</small>
 <a href="/author/J-K-Rowling">(about)</a>
 </span>,
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>,
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>,
 <span class="text" itemprop="text">“The person, be it gentleman or la

In [11]:
# you can pass in attributes to the find/find_all function
paragraphs = soup.find_all('p',attrs={'class':'text-muted'})
paragraphs
                           


[<p class="text-muted">
                 Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
 </p>]

In [12]:
# you can nest find/find_all calls 
body = soup.find('body')
div = body.find('div',attrs={'class':'container'})
header = div.find('h1')
header

<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>

In [16]:
# we can search specific strings in our  find/find_all calls
import re
paragraphs = soup.find_all('p',attrs={'class':'copyright'},string = re.compile("Made"))
paragraphs


[]

### select (CSS selector) 

In [17]:
content = soup.select('div p')
content

[<p>
 <a href="/login">Login</a>
 </p>,
 <p class="text-muted">
                 Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
 </p>,
 <p class="copyright">
                 Made with <span class="zyte">❤</span> by <a class="zyte" href="https://www.zyte.com">Zyte</a>
 </p>]

In [23]:
links = soup.select('small ~ a')
links

[<a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/J-K-Rowling">(about)</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/Jane-Austen">(about)</a>,
 <a href="/author/Marilyn-Monroe">(about)</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/Andre-Gide">(about)</a>,
 <a href="/author/Thomas-A-Edison">(about)</a>,
 <a href="/author/Eleanor-Roosevelt">(about)</a>,
 <a href="/author/Steve-Martin">(about)</a>]

In [26]:
paragraphs = soup.select('body > p')
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select('a'))

[]


In [27]:
# Grab by element with specific property
soup.select('[itemprop=author]')

[<small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Jane Austen</small>,
 <small class="author" itemprop="author">Marilyn Monroe</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">André Gide</small>,
 <small class="author" itemprop="author">Thomas A. Edison</small>,
 <small class="author" itemprop="author">Eleanor Roosevelt</small>,
 <small class="author" itemprop="author">Steve Martin</small>]

### Get different properties of the HTML

In [36]:
body = soup.find('body')
div = body.find('div',attrs={'class':'container'})
div2 = div.find('div',attrs={'class':'row header-box'})
div3 = div2.find('div',attrs={'class':'col-md-8'})
header = div3.find('h1')
a = header.find('a')
print(a.string)


Quotes to Scrape


In [37]:
# if multiple child elements use get_text()
print(div.get_text())





Quotes to Scrape




Login






“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
by Albert Einstein
(about)


            Tags:
            
change
deep-thoughts
thinking
world



“It is our choices, Harry, that show what we truly are, far more than our abilities.”
by J.K. Rowling
(about)


            Tags:
            
abilities
choices



“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
by Albert Einstein
(about)


            Tags:
            
inspirational
life
live
miracle
miracles



“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
by Jane Austen
(about)


            Tags:
            
aliteracy
books
classic
humor



“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
by Marilyn Monroe
(about)


            Ta

In [38]:
# get a specific property from an element
link = soup.find('a')
link['href']


'/'

### Code navigation

In [42]:
# path syntax
print(soup.body.div.h1.string)

None


In [43]:
# know the terms: parents, siblings, child
soup.body.find('div').find_next_siblings()


[<footer class="footer">
 <div class="container">
 <p class="text-muted">
                 Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
 </p>
 <p class="copyright">
                 Made with <span class="zyte">❤</span> by <a class="zyte" href="https://www.zyte.com">Zyte</a>
 </p>
 </div>
 </footer>]

## Grab all the container links from a webpage

#### Do it in two diferent ways

In [46]:
links = soup.select('div.container a')
actual_links = [link['href']for link in links]
actual_links

['/',
 '/login',
 '/author/Albert-Einstein',
 '/tag/change/page/1/',
 '/tag/deep-thoughts/page/1/',
 '/tag/thinking/page/1/',
 '/tag/world/page/1/',
 '/author/J-K-Rowling',
 '/tag/abilities/page/1/',
 '/tag/choices/page/1/',
 '/author/Albert-Einstein',
 '/tag/inspirational/page/1/',
 '/tag/life/page/1/',
 '/tag/live/page/1/',
 '/tag/miracle/page/1/',
 '/tag/miracles/page/1/',
 '/author/Jane-Austen',
 '/tag/aliteracy/page/1/',
 '/tag/books/page/1/',
 '/tag/classic/page/1/',
 '/tag/humor/page/1/',
 '/author/Marilyn-Monroe',
 '/tag/be-yourself/page/1/',
 '/tag/inspirational/page/1/',
 '/author/Albert-Einstein',
 '/tag/adulthood/page/1/',
 '/tag/success/page/1/',
 '/tag/value/page/1/',
 '/author/Andre-Gide',
 '/tag/life/page/1/',
 '/tag/love/page/1/',
 '/author/Thomas-A-Edison',
 '/tag/edison/page/1/',
 '/tag/failure/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/paraphrased/page/1/',
 '/author/Eleanor-Roosevelt',
 '/tag/misattributed-eleanor-roosevelt/page/1/',
 '/author/Steve-Martin',
 

In [47]:
dlink= soup.find('div',attrs={'class':'container'})
links = dlink.find_all('a')
actual_links = [link['href']for link in links]
actual_links

['/',
 '/login',
 '/author/Albert-Einstein',
 '/tag/change/page/1/',
 '/tag/deep-thoughts/page/1/',
 '/tag/thinking/page/1/',
 '/tag/world/page/1/',
 '/author/J-K-Rowling',
 '/tag/abilities/page/1/',
 '/tag/choices/page/1/',
 '/author/Albert-Einstein',
 '/tag/inspirational/page/1/',
 '/tag/life/page/1/',
 '/tag/live/page/1/',
 '/tag/miracle/page/1/',
 '/tag/miracles/page/1/',
 '/author/Jane-Austen',
 '/tag/aliteracy/page/1/',
 '/tag/books/page/1/',
 '/tag/classic/page/1/',
 '/tag/humor/page/1/',
 '/author/Marilyn-Monroe',
 '/tag/be-yourself/page/1/',
 '/tag/inspirational/page/1/',
 '/author/Albert-Einstein',
 '/tag/adulthood/page/1/',
 '/tag/success/page/1/',
 '/tag/value/page/1/',
 '/author/Andre-Gide',
 '/tag/life/page/1/',
 '/tag/love/page/1/',
 '/author/Thomas-A-Edison',
 '/tag/edison/page/1/',
 '/tag/failure/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/paraphrased/page/1/',
 '/author/Eleanor-Roosevelt',
 '/tag/misattributed-eleanor-roosevelt/page/1/',
 '/author/Steve-Martin',
 