In [8]:
# Import required libraries for scraping and data processing

import requests  # For making HTTP requests to web pages

from bs4 import BeautifulSoup  # For parsing HTML content

import pandas as pd  # For handling tabular data and saving to CSV

from urllib.parse import urljoin  # For constructing absolute URLs from relative links


In [9]:
# Base URL of the website to scrape; relative links found on the page are resolved against it
BASE_URL = 'https://books.toscrape.com/'
REQUEST_TIMEOUT = 10  # Seconds to wait for the server before giving up on a request

# ---------- 1. Scrape Categories ----------
# Fetch the homepage. Without a timeout, requests waits indefinitely on a stalled
# connection, and without raise_for_status() an HTTP error page would be parsed
# as if it were the real homepage.
response = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')  # Parse the fetched HTML

categories = []  # One {"Category", "URL"} dictionary per category

# Locate the navigation list that holds the category links
category_section = soup.find('ul', class_='nav nav-list')
if category_section is None:
    # find() returns None when nothing matches; fail with a clear message here
    # rather than with an AttributeError on the find_all() call below
    raise RuntimeError(f"Category list (<ul class='nav nav-list'>) not found at {BASE_URL}")
category_links = category_section.find_all('a')

# The first link is the top-level 'Books' entry (all books), so it is skipped
for cat in category_links[1:]:
    cat_name = cat.text.strip()  # Category name without surrounding whitespace
    cat_link = urljoin(BASE_URL, cat['href'])  # Absolute URL built from the relative href
    categories.append({"Category": cat_name, "URL": cat_link})

In [10]:
# Persist the scraped categories: build a DataFrame from the list of category
# dictionaries and write it to 'categories.csv', leaving the row index out of the file
categories_df = pd.DataFrame(data=categories)
categories_df.to_csv(path_or_buf='categories.csv', index=False)

In [12]:
# ---------- 2. Scrape Books from the First Page ----------
REQUEST_TIMEOUT = 10  # Seconds to wait for the server before giving up on a request

book_list = []  # One dictionary per scraped book

# Each book on the homepage is an <article> with class 'product_pod'
book_section = soup.find_all('article', class_='product_pod')

# Maps the rating word used as a CSS class (e.g. 'Three') to its numeric value
ratings = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# One detail-page request is made per book; a shared session lets those requests
# reuse the underlying connection instead of opening a new one each time.
with requests.Session() as session:
    for book in book_section:
        title = book.h3.a['title']  # Full title is stored in the <a> tag's title attribute

        # Keep only digits and the decimal point instead of slicing off a fixed
        # number of leading characters, so the parse does not depend on the
        # currency symbol being exactly one character long.
        price_text = book.find('p', class_='price_color').text.strip()
        price_digits = ''.join(ch for ch in price_text if ch.isdigit() or ch == '.')
        if not price_digits:
            raise ValueError(f"Could not parse a price from {price_text!r} for {title!r}")
        price = float(price_digits)

        # Availability text from the listing; 'Unknown' when the tag is missing
        stock_tag = book.find('p', class_='availability')
        stock = stock_tag.text.strip() if stock_tag is not None else 'Unknown'

        # Look the rating <p> up by its 'star-rating' class and pick whichever of its
        # classes is a known rating word, rather than relying on it being the first
        # <p> in the article with the word in second position. Defaults to 0.
        rating_tag = book.find('p', class_='star-rating')
        rating_classes = rating_tag.get('class', []) if rating_tag is not None else []
        rating = next((ratings[name] for name in rating_classes if name in ratings), 0)

        # Get the relative URL of the book detail page and convert it to an absolute URL
        relative_link = book.h3.a['href']
        book_url = urljoin(BASE_URL, relative_link)

        # Visit the book's detail page to read the category from the breadcrumb navigation.
        # raise_for_status() stops an HTTP error page from being parsed as a book page.
        book_response = session.get(book_url, timeout=REQUEST_TIMEOUT)
        book_response.raise_for_status()
        book_soup = BeautifulSoup(book_response.content, 'html.parser')
        breadcrumb = book_soup.find('ul', class_='breadcrumb')
        breadcrumb_items = breadcrumb.find_all('li') if breadcrumb is not None else []
        # The third breadcrumb item is the category; fall back to 'Unknown' if absent
        category = breadcrumb_items[2].text.strip() if len(breadcrumb_items) > 2 else 'Unknown'

        # Store all extracted information in a dictionary and add it to the book list
        book_list.append({
            'Name': title,
            'Category': category,
            'Price': price,
            'Stock Amount': stock,
            'Rating': rating
        })

In [13]:
# Persist the scraped books: build a DataFrame from the list of book
# dictionaries and write it to 'books.csv', leaving the row index out of the file
books_df = pd.DataFrame(data=book_list)
books_df.to_csv(path_or_buf='books.csv', index=False)

# Print the completion message naming both CSV files
print("✅ Scraping complete. Files saved: categories.csv & books.csv")

✅ Scraping complete. Files saved: categories.csv & books.csv
