In [2]:
#import libraries
import pandas as pd
import requests
import sqlite3
import re

from bs4        import BeautifulSoup
from datetime   import datetime
from sqlalchemy import create_engine

## Classics

In [24]:
#API requests
url = 'http://books.toscrape.com/catalogue/category/books/classics_6'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get(url, headers=headers)

#Beautiful Soup Objects
soup = BeautifulSoup(page.text, 'html.parser')

books_classic = soup.find('ol', class_='row')

books_classic_list = books_classic.find_all('article', class_='product_pod')
#books_classic_list[0]


# ============= Scraping Name, Price, Availability ============== #

classic_attributes = [list(filter(None, p.get_text().split('\n'))) for p in books_classic_list]

book_classic_table = pd.DataFrame(classic_attributes)

book_classic_table.columns = ['Name', 'Price', 'del1', 'Availability', 'del2', 'del3']
book_classic_table = book_classic_table.drop(['del1', 'del2', 'del3'], axis=1)
#regex = '(\d+\.\d+)'
#book_classic_table['Price'] = book_classic_table['Price'].apply(lambda x: re.search(regex, x).group(1))
book_classic_table['Price'] = book_classic_table['Price'].apply(lambda x: x[2:])

# ============= Scraping Star-Rating ================== #

classic_rating = [list(p.find('p', class_='star-rating').get('class')) for p in books_classic_list]
classic_rating = pd.DataFrame(classic_rating, columns=['del1', 'Star Rating'])
classic_rating = classic_rating.drop('del1', axis=1)

# ============= Table Concat ============== #

book_classic_table = pd.concat([book_classic_table, classic_rating], axis=1)

# ============= Insert Datetime Scrapy Column ============== #
book_classic_table['Scrapy Datetime'] = datetime.now().strftime('%Y-%m-%d- %H:%M:%S')
book_classic_table['Catalog'] = 'Classics'

book_classic_table

Unnamed: 0,Name,Price,Availability,Star Rating,Scrapy Datetime,Catalog
0,The Secret Garden,15.08,In stock,Four,2021-07-07- 15:21:00,Classics
1,The Metamorphosis,28.58,In stock,One,2021-07-07- 15:21:00,Classics
2,The Pilgrim's Progress,50.26,In stock,Two,2021-07-07- 15:21:00,Classics
3,The Hound of the ...,14.82,In stock,Two,2021-07-07- 15:21:00,Classics
4,Little Women (Little Women ...,28.07,In stock,Four,2021-07-07- 15:21:00,Classics
5,Gone with the Wind,32.49,In stock,Three,2021-07-07- 15:21:00,Classics
6,Candide,58.63,In stock,Three,2021-07-07- 15:21:00,Classics
7,Animal Farm,57.22,In stock,Three,2021-07-07- 15:21:00,Classics
8,Wuthering Heights,17.73,In stock,Three,2021-07-07- 15:21:00,Classics
9,The Picture of Dorian ...,29.7,In stock,Two,2021-07-07- 15:21:00,Classics


## Science Fiction

In [27]:
#API Requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

url = 'https://books.toscrape.com/catalogue/category/books/science-fiction_16'

page = requests.get(url, headers=headers)

#Beautiful Soup Objects
soup = BeautifulSoup(page.text, 'html.parser')

books_sf = soup.find('ol', class_='row')

books_sf_list = books_sf.find_all('article', class_='product_pod')
#books_sf_list[0]

# ============= Scraping Name, Price, Availability ============== #

sf_attributes = [list(filter(None, p.get_text().split('\n'))) for p in books_sf_list]

book_sf_table = pd.DataFrame(sf_attributes)
book_sf_table.columns = ['Name', 'Price', 'del1', 'Availability', 'del2', 'del3']
book_sf_table = book_sf_table.drop(['del1', 'del2', 'del3'], axis=1)
book_sf_table['Price'] = book_sf_table['Price'].apply(lambda x: x[2:])
#book_sf_table

# ============= Scraping Star-Rating ================== #

sf_rating = [list(p.find('p', class_='star-rating').get('class')) for p in books_sf_list]
sf_rating = pd.DataFrame(sf_rating, columns=['del1', 'Star Rating'])
sf_rating = sf_rating.drop('del1', axis=1)

# ============= Table Concat ============== #

book_sf_table = pd.concat([book_sf_table, sf_rating], axis=1)

# ============= Insert Datetime Scrapy Column ============== #
book_sf_table['Scrapy Datetime'] = datetime.now().strftime('%Y-%m-%d- %H:%M:%S')
book_sf_table['Catalog'] = 'Science Fiction'

book_sf_table

Unnamed: 0,Name,Price,Availability,Star Rating,Scrapy Datetime,Catalog
0,Mesaerion: The Best Science ...,37.59,In stock,One,2021-07-07- 15:24:50,Science Fiction
1,Join,35.67,In stock,Five,2021-07-07- 15:24:50,Science Fiction
2,William Shakespeare's Star Wars: ...,43.3,In stock,Four,2021-07-07- 15:24:50,Science Fiction
3,The Project,10.65,In stock,One,2021-07-07- 15:24:50,Science Fiction
4,Soft Apocalypse,26.12,In stock,Two,2021-07-07- 15:24:50,Science Fiction
5,Sleeping Giants (Themis Files ...,48.74,In stock,One,2021-07-07- 15:24:50,Science Fiction
6,Arena,21.36,In stock,Four,2021-07-07- 15:24:50,Science Fiction
7,Foundation (Foundation (Publication Order) ...,32.42,In stock,One,2021-07-07- 15:24:50,Science Fiction
8,The Restaurant at the ...,10.92,In stock,One,2021-07-07- 15:24:50,Science Fiction
9,Ready Player One,19.07,In stock,Four,2021-07-07- 15:24:50,Science Fiction


## Humor

In [28]:
#API Requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

url = 'https://books.toscrape.com/catalogue/category/books/humor_30'

page = requests.get(url, headers=headers)

#Beautiful Soup Objects
soup = BeautifulSoup(page.text, 'html.parser')

books_humor = soup.find('ol', class_='row')

books_humor_list = books_humor.find_all('article', class_='product_pod')
#books_sf_list[0]

# ============= Scraping Name, Price, Availability ============== #

humor_attributes = [list(filter(None, p.get_text().split('\n'))) for p in books_humor_list]

book_humor_table = pd.DataFrame(humor_attributes)
book_humor_table.columns = ['Name', 'Price', 'del1', 'Availability', 'del2', 'del3']
book_humor_table = book_humor_table.drop(['del1', 'del2', 'del3'], axis=1)
book_humor_table['Price'] = book_humor_table['Price'].apply(lambda x: x[2:])
#book_humor_table

# ============= Scraping Star-Rating ================== #

humor_rating = [list(p.find('p', class_='star-rating').get('class')) for p in books_humor_list]
humor_rating = pd.DataFrame(humor_rating, columns=['del1', 'Star Rating'])
humor_rating = humor_rating.drop('del1', axis=1)

# ============= Table Concat ============== #

book_humor_table = pd.concat([book_humor_table, humor_rating], axis=1)

# ============= Insert Datetime Scrapy Column ============== #
book_humor_table['Scrapy Datetime'] = datetime.now().strftime('%Y-%m-%d- %H:%M:%S')
book_humor_table['Catalog'] = 'Humor'

book_humor_table

Unnamed: 0,Name,Price,Availability,Star Rating,Scrapy Datetime,Catalog
0,The Long Haul (Diary ...,44.07,In stock,One,2021-07-07- 15:24:55,Humor
1,Old School (Diary of ...,11.83,In stock,Five,2021-07-07- 15:24:55,Humor
2,I Know What I'm ...,25.98,In stock,Four,2021-07-07- 15:24:55,Humor
3,Hyperbole and a Half: ...,14.75,In stock,Five,2021-07-07- 15:24:55,Humor
4,Dress Your Family in ...,43.68,In stock,Three,2021-07-07- 15:24:55,Humor
5,Toddlers Are A**holes: It's ...,25.55,In stock,One,2021-07-07- 15:24:55,Humor
6,When You Are Engulfed ...,30.89,In stock,Five,2021-07-07- 15:24:55,Humor
7,Naked,31.69,In stock,Three,2021-07-07- 15:24:55,Humor
8,Lamb: The Gospel According ...,55.5,In stock,Five,2021-07-07- 15:24:55,Humor
9,Holidays on Ice,51.07,In stock,Two,2021-07-07- 15:24:55,Humor


## Business

In [29]:
#API Requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

url = 'https://books.toscrape.com/catalogue/category/books/business_35'

page = requests.get(url, headers=headers)

#Beautiful Soup Objects
soup = BeautifulSoup(page.text, 'html.parser')

books_business = soup.find('ol', class_='row')

books_business_list = books_business.find_all('article', class_='product_pod')
books_business_list[0]

# ============= Scraping Name, Price, Availability ============== #

business_attributes = [list(filter(None, p.get_text().split('\n'))) for p in books_business_list]

book_business_table = pd.DataFrame(business_attributes)
book_business_table.columns = ['Name', 'Price', 'del1', 'Availability', 'del2', 'del3']
book_business_table = book_business_table.drop(['del1', 'del2', 'del3'], axis=1)
book_business_table['Price'] = book_business_table['Price'].apply(lambda x: x[2:])
#book_humor_table

# ============= Scraping Star-Rating ================== #

business_rating = [list(p.find('p', class_='star-rating').get('class')) for p in books_business_list]
business_rating = pd.DataFrame(business_rating, columns=['del1', 'Star Rating'])
business_rating = business_rating.drop('del1', axis=1)

# ============= Table Concat ============== #

book_business_table = pd.concat([book_business_table, business_rating], axis=1)

# ============= Insert Datetime Scrapy Column ============== #
book_business_table['Scrapy Datetime'] = datetime.now().strftime('%Y-%m-%d- %H:%M:%S')
book_business_table['Catalog'] = 'Business'

book_business_table

Unnamed: 0,Name,Price,Availability,Star Rating,Scrapy Datetime,Catalog
0,The Dirty Little Secrets ...,33.34,In stock,Four,2021-07-07- 15:25:03,Business
1,The Third Wave: An ...,12.61,In stock,Five,2021-07-07- 15:25:03,Business
2,The 10% Entrepreneur: Live ...,27.55,In stock,Three,2021-07-07- 15:25:03,Business
3,Shoe Dog: A Memoir ...,23.99,In stock,Two,2021-07-07- 15:25:03,Business
4,Made to Stick: Why ...,38.85,In stock,Five,2021-07-07- 15:25:03,Business
5,Quench Your Own Thirst: ...,43.14,In stock,One,2021-07-07- 15:25:03,Business
6,The Art of Startup ...,21.0,In stock,Three,2021-07-07- 15:25:03,Business
7,Born for This: How ...,21.59,In stock,Five,2021-07-07- 15:25:03,Business
8,The E-Myth Revisited: Why ...,36.91,In stock,One,2021-07-07- 15:25:03,Business
9,"Rich Dad, Poor Dad",51.74,In stock,One,2021-07-07- 15:25:03,Business


## Complete Catalog

In [83]:
book_catalog = pd.concat([book_classic_table, book_sf_table, book_humor_table, book_business_table], axis=0)
book_catalog.columns = ['name', 'price', 'availability', 'rating', 'datetime', 'catalog']
book_catalog.head()

Unnamed: 0,name,price,availability,rating,datetime,catalog
0,The Secret Garden,15.08,In stock,Four,2021-07-07- 15:21:00,Classics
1,The Metamorphosis,28.58,In stock,One,2021-07-07- 15:21:00,Classics
2,The Pilgrim's Progress,50.26,In stock,Two,2021-07-07- 15:21:00,Classics
3,The Hound of the ...,14.82,In stock,Two,2021-07-07- 15:21:00,Classics
4,Little Women (Little Women ...,28.07,In stock,Four,2021-07-07- 15:21:00,Classics


In [58]:
book_catalog.to_csv('book catalog')

In [84]:
query_books_schema = """
    CREATE TABLE books (
        name             TEXT,
        price            REAL,
        availability     TEXT,
        rating           TEXT,
        datetime         TEXT,
        catalog          TEXT    
    )
"""

In [87]:
conn = sqlite3.connect('book_catalog.sqlite')
cursor = conn.execute(query_books_schema)
conn.commit()
conn.close()

In [88]:
conn = create_engine('sqlite:///book_catalog.sqlite', echo=False)

In [89]:
book_catalog.to_sql('books', con=conn, if_exists='append', index=False)

In [94]:
query = """
    SELECT * FROM books WHERE catalog = 'Humor'
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,name,price,availability,rating,datetime,catalog
0,The Long Haul (Diary ...,44.07,In stock,One,2021-07-07- 15:24:55,Humor
1,Old School (Diary of ...,11.83,In stock,Five,2021-07-07- 15:24:55,Humor
2,I Know What I'm ...,25.98,In stock,Four,2021-07-07- 15:24:55,Humor
3,Hyperbole and a Half: ...,14.75,In stock,Five,2021-07-07- 15:24:55,Humor
4,Dress Your Family in ...,43.68,In stock,Three,2021-07-07- 15:24:55,Humor
5,Toddlers Are A**holes: It's ...,25.55,In stock,One,2021-07-07- 15:24:55,Humor
6,When You Are Engulfed ...,30.89,In stock,Five,2021-07-07- 15:24:55,Humor
7,Naked,31.69,In stock,Three,2021-07-07- 15:24:55,Humor
8,Lamb: The Gospel According ...,55.5,In stock,Five,2021-07-07- 15:24:55,Humor
9,Holidays on Ice,51.07,In stock,Two,2021-07-07- 15:24:55,Humor
