# ***Web Scraping using BeautifulSoup-Python (STEP 1)***

Web scraping to collect data from the Books.toscrape.com website. The data will then be used for building an ML model, doing EDA, and storytelling with Power BI.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape book listings from the paginated books.toscrape.com catalogue and
# save them to cleaned_books.csv (one row per book).
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
books = []  # one dict per scraped book, in page order

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Seconds to wait on connect/read. Without an explicit timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# One Session reuses the underlying connection for every page request.
with requests.Session() as session:
    session.headers.update(headers)

    for page in range(1, 11):  # catalogue pages 1 through 10
        url = base_url.format(page)
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            # Network error, timeout or non-2xx status: report and move on
            # so one bad page does not abort the whole run.
            print(f"❌ Failed to fetch page {page}: {e}")
            continue

        # Force UTF-8 so the pound sign in prices decodes correctly.
        response.encoding = 'utf-8'

        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article', class_='product_pod')

        for book in articles:
            title = book.h3.a['title']

            # Keep only digits and the decimal point, dropping the currency
            # symbol and any stray characters before converting to float.
            price_raw = book.find('p', class_='price_color').text.strip()
            price_clean = ''.join(
                ch for ch in price_raw if ch.isdigit() or ch == '.'
            )
            price = float(price_clean)

            availability = book.find(
                'p', class_='instock availability'
            ).text.strip()
            # The first <p> in the article carries the rating as its second
            # CSS class (index 1 of the class list).
            rating = book.p['class'][1]
            link = "https://books.toscrape.com/catalogue/" + book.h3.a['href']

            books.append({
                'title': title,
                'price (£)': price,
                'availability': availability,
                'rating': rating,
                'link': link,
            })

        print(f"✅ Page {page} scraped")
        time.sleep(1.5)  # reduce load on server

df = pd.DataFrame(books)
df.to_csv("cleaned_books.csv", index=False)
print("✅ Done. All data saved to cleaned_books.csv")

✅ Page 1 scraped
✅ Page 2 scraped
✅ Page 3 scraped
✅ Page 4 scraped
✅ Page 5 scraped
✅ Page 6 scraped
✅ Page 7 scraped
✅ Page 8 scraped
✅ Page 9 scraped
✅ Page 10 scraped
✅ Done. All data saved to cleaned_books.csv
