# Scraping Quotes 
## Test the scraping script part by part

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# URL of the website to scrape
url_base = "http://quotes.toscrape.com/"

In [3]:
# List to store scraped data
list_quote = []

In [4]:
# Test response
response = requests.get(url_base)
response.status_code

200

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="

In [6]:
quotes = soup.find_all('div', class_='quote')
quotes

[<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
 <a class="tag" href="/tag/change/page/1/">change</a>
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>
 <a class="tag" href="/tag/world/page/1/">world</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
 <span>by <small class="author" itempr

In [8]:
test_list = []
for quote in quotes:
    text = quote.find('span', class_='text').get_text()
    author = quote.find('small', class_='author').get_text()
    # for tag in quote.find_all('a', class_='tag'):
    #     print(tag.get_text())
    tags = [tag.get_text() for tag in quote.find_all('a', class_='tag')]
    # print(text)
    # print(author)
    # print(tags)

    test_list.append({
        'quote': text,
        'author': author,
        'tags': ', '.join(tags)
    })

In [9]:
df_test = pd.DataFrame(test_list)
df_test.head()

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"change, deep-thoughts, thinking, world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities, choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy, books, classic, humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself, inspirational"


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   quote   10 non-null     object
 1   author  10 non-null     object
 2   tags    10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes


In [11]:
import re

In [12]:
quote_cleaned = [re.sub(r'[“”.,]', '', quote) for quote in df_test['quote']]

In [13]:
quote_cleaned

['The world as we have created it is a process of our thinking It cannot be changed without changing our thinking',
 'It is our choices Harry that show what we truly are far more than our abilities',
 'There are only two ways to live your life One is as though nothing is a miracle The other is as though everything is a miracle',
 'The person be it gentleman or lady who has not pleasure in a good novel must be intolerably stupid',
 "Imperfection is beauty madness is genius and it's better to be absolutely ridiculous than absolutely boring",
 'Try not to become a man of success Rather become a man of value',
 'It is better to be hated for what you are than to be loved for what you are not',
 "I have not failed I've just found 10000 ways that won't work",
 "A woman is like a tea bag; you never know how strong it is until it's in hot water",
 'A day without sunshine is like you know night']

In [14]:
df_test['quote_cleaned'] = quote_cleaned

In [15]:
df_test.head()

Unnamed: 0,quote,author,tags,quote_cleaned
0,“The world as we have created it is a process ...,Albert Einstein,"change, deep-thoughts, thinking, world",The world as we have created it is a process o...
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities, choices",It is our choices Harry that show what we trul...
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles",There are only two ways to live your life One ...
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy, books, classic, humor",The person be it gentleman or lady who has not...
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself, inspirational",Imperfection is beauty madness is genius and i...


In [16]:
test_next_page = soup.find('li', class_='next')
test_next_page

<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
</li>

In [17]:
test_next_page.find('a')['href']

'/page/2/'

In [42]:
url_new = url_base + test_next_page.find('a')['href']
url_new

'http://quotes.toscrape.com//page/2/'

In [80]:
bool(re.search('/page.*', url_base))

False

In [81]:
bool(re.search('/page.*', url_new))

True

In [82]:
re.sub('/page.*', '/page/3/', url_new)

'http://quotes.toscrape.com//page/3/'

In [45]:
requests.get(url_base + test_next_page.find('a')['href']).status_code

200

### Test the scraping and cleaning script

In [76]:
from data_scraping import data_scraper

In [77]:
data_scraper

<function data_scraping.data_scraper(url)>

In [78]:
url_base

'http://quotes.toscrape.com/'

In [79]:
# Scrape the data
data_scraper(url_base)

=== Get response from the web. ===
=== Status code 200. ===
=== Start scraping. ===
=== Done scraping. ===


NameError: name 'url_base' is not defined

In [83]:
# Set a function to scrape the data from the URL above
def scrape_quote(url):
    print("=== Get response from the web ===")
    response = requests.get(url)
    if response.status_code == 200:
        print("=== Status code 200 ===")
        soup = BeautifulSoup(response.text, 'html.parser')

        # Retrieve quotes
        quotes = soup.find_all('div', class_='quote')
        print("=== Start scraping ===")
        for quote in quotes:
            text = quote.find('span', class_='text').get_text()
            author = quote.find('small', class_='author').get_text()
            tags = [tag.get_text() for tag in quote.find_all('a', class_='tag')]
            list_quote.append({
                'quote': text,
                'author': author,
                'tags': ', '.join(tags)
            })
        print("=== Done scraping ===")

        # Find the next page link
        next_page = soup.find('li', class_='next')
        if next_page:
            page = next_page.find('a')['href']

            if bool(re.search('/page.*', url)):
                next_url = re.sub('/page.*', page, url)
            
            else:
                next_url = url + page

            scrape_quote(next_url)
        
        else:
            print("=== All pages have been covered. ===")

        
        # if next_page:
        #     next_url = url_base + next_page.find('a')['href']
        #     print("=== Move to the next page ===")
        #     scrape_quote(next_url)

    else:
        print(f"=== Failed to retrieve page: {url} ===")

In [84]:
# Start scraping by running the function
scrape_quote(url_base)

=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scraping ===
=== Done scraping ===
=== Get response from the web ===
=== Status code 200 ===
=== Start scrap

In [24]:
# Save the data into a dataframe
df = pd.DataFrame(list_quote)

In [25]:
df.head()

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"change, deep-thoughts, thinking, world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities, choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy, books, classic, humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself, inspirational"
