In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Scrape the first five catalogue pages of books.toscrape.com into a
# DataFrame (title, price in GBP, availability text, star rating word)
# and save the result to books.csv.
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
headers = {"User-Agent": "Mozilla/5.0"}

# Strips everything except digits and the decimal point from a price label
# (e.g. the currency symbol). Compiled once instead of per book.
non_numeric = re.compile(r"[^\d.]")

all_books = []

# One session for all pages so the underlying connection is reused.
with requests.Session() as session:
    session.headers.update(headers)

    for page in range(1, 6):  # First 5 pages
        url = base_url.format(page)

        # A timeout prevents the cell from hanging forever on a stalled
        # connection; raise_for_status() stops on 4xx/5xx responses instead
        # of silently parsing an error page.
        res = session.get(url, timeout=10)
        res.raise_for_status()

        # Pass raw bytes so BeautifulSoup detects the encoding from the
        # document itself. `res.text` falls back to ISO-8859-1 when the
        # response header carries no charset, which garbles non-ASCII
        # characters in titles.
        soup = BeautifulSoup(res.content, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            # The link text is truncated on the page; the full title lives
            # in the anchor's `title` attribute.
            title = book.h3.a["title"]

            raw_price = book.find("p", class_="price_color").get_text()
            price = non_numeric.sub("", raw_price)

            availability = book.find(
                "p", class_="instock availability"
            ).get_text(strip=True)

            # The rating is encoded as the second CSS class of the
            # star-rating paragraph, e.g. class="star-rating Three".
            # Look it up by class rather than assuming it is the first <p>.
            rating = book.find("p", class_="star-rating")["class"][1]

            all_books.append({
                "title": title,
                "price_gbp": float(price),
                "availability": availability,
                "rating": rating,
            })

# Convert to DataFrame
df = pd.DataFrame(all_books)

# Save as CSV
df.to_csv("books.csv", index=False)
print("✅ Done! Saved as books.csv")

# Show first rows
df.head()


✅ Done! Saved as books.csv


Unnamed: 0,title,price_gbp,availability,rating
0,A Light in the Attic,51.77,In stock,Three
1,Tipping the Velvet,53.74,In stock,One
2,Soumission,50.1,In stock,One
3,Sharp Objects,47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,54.23,In stock,Five
