In [1]:
!pip install requests
!pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [3]:
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

titles = []
prices = []
availability = []
ratings = []

for page in range(1, 6):
    url = base_url.format(page)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        titles.append(book.h3.a['title'])
        prices.append(book.find('p', class_='price_color').text)
        availability.append(book.find('p', class_='instock availability').text.strip())
        rating_class = book.p.get('class')[1]
        ratings.append(rating_class)

df = pd.DataFrame({
    'title': titles,
    'price': prices,
    'availability': availability,
    'rating': ratings
})

df.head()


Unnamed: 0,title,price,availability,rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [4]:
df.to_csv('raw_books_data.csv', index=False)


In [5]:
raw_df = pd.read_csv('raw_books_data.csv', encoding='utf-8-sig')
raw_df.head()


Unnamed: 0,title,price,availability,rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [6]:
# Remove the £ symbol and convert to float
raw_df['price'] = raw_df['price'].replace('[^0-9.]', '', regex=True).astype(float)

# Convert rating words (One, Two, etc.) to numbers (1, 2, etc.)
rating_map = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5
}
raw_df['rating'] = raw_df['rating'].map(rating_map)

# Make all titles lowercase and consistent
raw_df['title'] = raw_df['title'].str.lower()

# Convert to True if “In stock”, otherwise False
raw_df['availability'] = raw_df['availability'].apply(lambda x: 'in stock' in x.lower())

# Remove duplicates and missing values
raw_df = raw_df.drop_duplicates()
raw_df = raw_df.dropna()

# Use snake_case like book_title, price, available, rating
raw_df = raw_df.rename(columns={
    'title': 'book_title',
    'price': 'price',
    'availability': 'available',
    'rating': 'rating'
})

raw_df.head()


Unnamed: 0,book_title,price,available,rating
0,a light in the attic,51.77,True,3
1,tipping the velvet,53.74,True,1
2,soumission,50.1,True,1
3,sharp objects,47.82,True,4
4,sapiens: a brief history of humankind,54.23,True,5
