In [None]:
# Pandas
    # tool for handling data in python (organizing & manipulating)
    # library that provides a high level interface

In [5]:
import requests
from bs4 import BeautifulSoup

# Download the catalogue landing page. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us from parsing an error page.
response=requests.get('https://books.toscrape.com/',timeout=30)
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's GuessedAtParserWarning).
soup=BeautifulSoup(response.content,'html.parser')

# Collect every <article class="product_pod"> element; each one is passed to
# extract_book_data further down.
book_tags=soup.find_all('article',attrs={'class':'product_pod'})

def clean_price(price):
    """Convert a price string to a float, keeping only digits and dots.

    Every character that is neither a digit nor '.' (currency symbols,
    stray encoding characters, spaces, ...) is dropped before the remaining
    text is handed to ``float``.
    """
    kept=[]
    for ch in price:
        if ch.isdigit() or ch=='.':
            kept.append(ch)
    return float(''.join(kept))

def map_rating(rating):
    """Translate a spelled-out rating ('One' .. 'Five') into its integer value.

    Raises KeyError when the word is not one of the five known ratings.
    """
    words=('One','Two','Three','Four','Five')
    # position in the tuple (1-based) is the numeric rating
    lookup={word:stars for stars,word in enumerate(words,start=1)}
    return lookup[rating]

def extract_book_data(book_tag):
    """Build a {'title', 'price', 'rating'} record from a single book tag.

    - title:  the ``title`` attribute of the <a> inside the tag's <h3>
    - price:  text of the <p class="price_color">, converted by clean_price
    - rating: second CSS class of the <p class="star-rating">, converted by map_rating
    """
    link=book_tag.find('h3').find('a')
    book_title=link['title']

    price_text=book_tag.find('p',attrs={'class':'price_color'}).get_text()

    # the rating word is read from index 1 of the paragraph's class list
    rating_classes=book_tag.find('p',attrs={'class':'star-rating'})['class']
    rating_word=rating_classes[1]

    record={}
    record['title']=book_title
    record['price']=clean_price(price_text)
    record['rating']=map_rating(rating_word)
    return record

# book_tags was already collected right after the page was parsed, so it is
# reused here instead of running the same find_all query a second time.
# Result: one dict (title, price, rating) per book tag.
book_data=[extract_book_data(book_tag) for book_tag in book_tags]


In [6]:
book_data

[{'title': 'A Light in the Attic', 'price': 51.77, 'rating': 3},
 {'title': 'Tipping the Velvet', 'price': 53.74, 'rating': 1},
 {'title': 'Soumission', 'price': 50.1, 'rating': 1},
 {'title': 'Sharp Objects', 'price': 47.82, 'rating': 4},
 {'title': 'Sapiens: A Brief History of Humankind',
  'price': 54.23,
  'rating': 5},
 {'title': 'The Requiem Red', 'price': 22.65, 'rating': 1},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'price': 33.34,
  'rating': 4},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'price': 17.93,
  'rating': 3},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'price': 22.6,
  'rating': 4},
 {'title': 'The Black Maria', 'price': 52.15, 'rating': 1},
 {'title': 'Starving Hearts (Triangular Trade Trilogy, #1)',
  'price': 13.99,
  'rating': 2},
 {'title': "Shakespeare's Sonnets", 'price': 20.66, 'rating': 4},
 {'title': 'Set 

In [None]:
# currently our data is in a Python list of dictionaries
# we can traverse it using basic Python iteration

In [8]:
# average price across all books: total of the prices divided by the book count
sum(entry['price'] for entry in book_data)/len(book_data)

38.048500000000004

In [9]:
# titles of the books that cost less than 20
[entry['title'] for entry in book_data if entry['price']<20]

['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 'Set Me Free']

In [None]:
# this approach is a bit lengthy & requires writing a lot of code
# as the size of the data grows, it will become inefficient
# Pandas: use when working with data in Python
# install in terminal: pip install pandas==1.5.3

In [10]:
import pandas as pd

In [11]:
# a pandas DataFrame is a 2D data structure; build one from our list of dictionaries
df=pd.DataFrame(data=book_data)

In [12]:
df
# rows are individual books
# columns are book attributes

# this DataFrame has lots of functionality that makes it easier to work with

Unnamed: 0,title,price,rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [13]:
# find the average price of all books
df['price'].mean()

38.048500000000004

In [14]:
# first step towards finding book titles with price < 20:
# comparing the column produces a boolean mask, i.e. for each row
# we get a boolean indicating whether its price is < 20 or not
df['price']<20

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8     False
9     False
10     True
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: price, dtype: bool

In [15]:
# keep only the rows where the boolean mask is True
df.loc[df['price']<20]

Unnamed: 0,title,price,rating
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
10,"Starving Hearts (Triangular Trade Trilogy, #1)",13.99,2
12,Set Me Free,17.46,5


In [16]:
# same row filter, but select only the title column of the matching rows
df.loc[df['price']<20,'title']

7     The Coming Woman: A Novel Based on the Life of...
10       Starving Hearts (Triangular Trade Trilogy, #1)
12                                          Set Me Free
Name: title, dtype: object

In [None]:
# with pandas
# we can store/export data in different formats
# csv, excel, html, json, etc

In [18]:
# write the DataFrame to a CSV file; index=False leaves the row index out of the file
df.to_csv(path_or_buf='books.csv',index=False)

In [21]:
# write the DataFrame as a JSON array with one object per row (orient='records').
# The target is books.json: writing to 'books.csv' would overwrite the CSV export
# with JSON content under a misleading extension.
df.to_json('books.json',orient='records')
