<a href="https://colab.research.google.com/github/shreenidhikamath/book_scraping/blob/main/bookscraping_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

In [None]:
#base URL of the catalogue; page URLs are built from it and relative product links are resolved against it
BASE_URL = 'http://books.toscrape.com/catalogue/'
#URL of the first catalogue page
#NOTE(review): START_URL is not referenced by any cell visible here - confirm whether it is still needed
START_URL = 'http://books.toscrape.com/catalogue/page-1.html'

In [None]:
#this function converts a rating word into its numerical value, e.g. 'One' -> 1
#a dictionary named ratings holds the word -> number pairs used for the conversion
def rating(star_class):
    """Translate a star-rating class value into its numeric rating.

    ``star_class`` is tested with ``in`` against the words "One" to "Five",
    in that order, so it may be a list of class names or a plain string.

    Returns:
        The integer 1-5 belonging to the first word found, or None when
        none of the words is present.
    """
    word_to_number = dict(zip(("One", "Two", "Three", "Four", "Five"), range(1, 6)))
    return next(
        (number for word, number in word_to_number.items() if word in star_class),
        None,
    )

In [None]:
# this is the main function of the program; it collects the information for every book
def scrape(max_books=1000):
    """Scrape book records from the paginated catalogue under BASE_URL.

    Pages ``page-1.html``, ``page-2.html``, ... are requested in order until
    at least ``max_books`` records are collected, a page cannot be retrieved,
    or a page contains no ``article.product_pod`` entries.

    Args:
        max_books: Stop requesting further pages once this many books have
            been collected. Whole pages are always processed, so the result
            can hold more than ``max_books`` entries. Defaults to 1000.

    Returns:
        list[dict]: One dict per book with the keys 'Title', 'Price' (the
        price text as shown on the page, currency symbol included), 'Rating'
        (the value returned by ``rating``), 'Availability' ('In stock' or
        'Out of stock') and 'Product URL' (absolute URL).
    """
    books = []
    page_number = 1
    while len(books) < max_books:
        url = f"{BASE_URL}page-{page_number}.html"
        try:
            # The timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as exc:
            # Connection problems end the run the same way a bad status does,
            # so whatever was collected so far is still returned.
            print(f"Failed to retrieve page {page_number}: {exc}")
            break
        # Any status other than 200 is treated as "page not available".
        if response.status_code != 200:
            print(f"Failed to retrieve page {page_number}")
            break
        # Parse the raw bytes so BeautifulSoup picks the encoding from the
        # document itself; response.text relies on requests' header-based
        # guess, which garbles non-ASCII characters such as the pound sign.
        soup = BeautifulSoup(response.content, 'html.parser')
        # Every book on a page lives in an <article class="product_pod">.
        book_list = soup.select('article.product_pod')
        if not book_list:
            print("No more books found.")
            break
        for book in book_list:
            # The <a> inside the <h3> carries both the full title and the link.
            link = book.h3.a
            title = link['title']
            # strip() only trims surrounding whitespace; the currency symbol stays in the text.
            price = book.select_one('p.price_color').text.strip()
            # The 'class' attribute holds the rating word (e.g. 'Three'); rating() maps it to a number.
            rate = rating(book.select_one('p.star-rating')['class'])
            availability = book.select_one('p.instock.availability').text.strip()
            # The href is relative, so resolve it against the catalogue base URL.
            product_url = urljoin(BASE_URL, link['href'])
            books.append({
                'Title': title,
                'Price': price,
                'Rating': rate,
                'Availability': 'In stock' if 'In stock' in availability else 'Out of stock',
                'Product URL': product_url
            })
        page_number += 1

    return books

In [None]:
# Scrape the catalogue and save the collected records to a CSV file
book_data = scrape()
df = pd.DataFrame(book_data)
#index=False keeps the DataFrame's row index out of the CSV file
df.to_csv('books_data.csv', index=False)

In [None]:
import requests

def fetch_page(url):
    """Send a GET request to ``url`` with a 10-second timeout.

    Returns:
        The response object when the request succeeds. If ``requests`` raises
        any RequestException - including the error raised by
        ``raise_for_status()`` for a bad status such as 404 - the error is
        printed and None is returned instead.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()  #turns a bad HTTP status into an exception
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return page

In [None]:
import logging

#sets up logging to save errors in a file instead of stopping the script
#force=True (Python 3.8+) replaces any handlers already attached to the root logger;
#without it basicConfig() does nothing when the root logger is already configured
#(as some notebook environments do), so scraping_errors.log would never be written
logging.basicConfig(
    filename="scraping_errors.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

def process_books(book_list, books, base_url="http://books.toscrape.com/catalogue/"):
    """Extract one record per book element and append it to ``books``.

    Args:
        book_list: Iterable of parsed book elements. Each one is accessed via
            ``.h3.a`` and ``select_one(css_selector)``.
        books: List that is extended in place with one dict per book, using
            the keys 'Title', 'Price' (float), 'Rating' (the rating word,
            e.g. 'Three'), 'Availability' ('In stock' or 'Out of stock') and
            'Product URL' (absolute URL).
        base_url: URL the book's relative ``href`` is resolved against.
            Defaults to the catalogue URL of books.toscrape.com.

    Returns:
        None. A book that raises any exception (missing element, price that
        cannot be parsed, ...) is logged with ``logging.error`` and skipped,
        so one bad entry does not abort the whole page.
    """
    for book in book_list:  #go through each book in the list
        try:
            #the <a> inside the <h3> carries both the title and the product link
            link_tag = book.h3.a
            price_tag = book.select_one('p.price_color')
            rating_tag = book.select_one('p.star-rating')
            availability_tag = book.select_one('p.instock.availability')

            #identity checks, so a missing element is detected without relying on tag equality
            if any(tag is None for tag in (link_tag, price_tag, rating_tag, availability_tag)):
                raise ValueError("Missing Data")  #throw error if something's missing

            #drop the currency symbol (also in its mis-decoded form) so the price parses as a number
            price_text = price_tag.text.strip().replace("Â£", "").replace("£", "")
            availability = availability_tag.text.strip()

            #add all the data to the books list
            books.append({
                'Title': link_tag['title'],
                'Price': float(price_text),
                'Rating': rating_tag['class'][1],  #second class name is the rating word like 'Three'
                'Availability': 'In stock' if 'In stock' in availability else 'Out of stock',
                #the href is relative, so it is resolved into a full, usable URL
                'Product URL': urljoin(base_url, link_tag['href'])
            })
        except Exception as e:
            logging.error(f"Skipping book due to missing data: {e}")  #log error if something goes wrong


In [None]:
def scrape(max_books=1000):
    """Collect book records from the paginated catalogue under BASE_URL.

    Pages are fetched with ``fetch_page`` and their book entries handed to
    ``process_books``. The loop ends when at least ``max_books`` records
    have been collected, a page cannot be fetched, or a page has no
    ``article.product_pod`` entries.

    Args:
        max_books: Stop requesting further pages once this many books have
            been collected. Whole pages are always processed, so the result
            can hold more than ``max_books`` entries. Defaults to 1000.

    Returns:
        list: The records appended by ``process_books``; empty when the very
        first page could not be fetched.
    """
    #start with an empty list to store the book details
    books = []
    page_number = 1  #begin from the first page of the catalog

    while len(books) < max_books:
        #build the page URL from the shared catalogue base URL
        url = f"{BASE_URL}page-{page_number}.html"

        #fetch_page returns None on any request error, which ends the run
        response = fetch_page(url)
        if response is None:
            break

        #parse the raw bytes so BeautifulSoup picks the encoding from the document itself;
        #response.text relies on requests' header-based guess, which garbles non-ASCII text
        soup = BeautifulSoup(response.content, 'html.parser')

        #find all the book entries on this page
        book_list = soup.select('article.product_pod')

        #if there are no books on the page, we've likely reached the end
        if not book_list:
            break

        #extract the details we care about from each book and add them to our list
        process_books(book_list, books)

        #move on to the next page
        page_number += 1

    #once done, return the complete list of books we've gathered
    return books


In [None]:
#build a DataFrame from the freshly scraped book records
df = pd.DataFrame(scrape())

#nothing scraped: tell the user instead of writing an empty file
if df.empty:
    print("No data found. Check logs for errors.")
else:
    #write the records to a CSV file, leaving out the row index
    df.to_csv('books_data.csv', index=False)
    print("Data saved successfully.")


Data saved successfully.


In [None]:
import os
print("books_data.csv exists:", os.path.isfile("books_data.csv"))  #True when the CSV file is present in the working directory

books_data.csv exists: True


In [None]:
import pandas as pd
df = pd.read_csv("books_data.csv")  #reload the saved file from disk; df now holds the CSV contents
print(df.head())  #show the first few rows to see what's inside

                                   Title  Price Rating Availability  \
0                   A Light in the Attic  51.77  Three     In stock   
1                     Tipping the Velvet  53.74    One     In stock   
2                             Soumission  50.10    One     In stock   
3                          Sharp Objects  47.82   Four     In stock   
4  Sapiens: A Brief History of Humankind  54.23   Five     In stock   

                                         Product URL  
0               a-light-in-the-attic_1000/index.html  
1                  tipping-the-velvet_999/index.html  
2                          soumission_998/index.html  
3                       sharp-objects_997/index.html  
4  sapiens-a-brief-history-of-humankind_996/index...  


In [None]:
print("File extension:", os.path.splitext("books_data.csv")[1])  #splitext returns (root, ext); index 1 is the extension
print(df["Price"].dtype)  #print the dtype pandas assigned to the Price column

File extension: .csv
float64


In [None]:
expected_columns = ["Title", "Price", "Rating", "Availability", "Product URL"]  #column names in the order the scraper writes them
print(df.columns.tolist() == expected_columns)  #True only when both the names and their order match

True


In [None]:
print(df.isnull().sum())  #number of missing (null) values in each column

Title           0
Price           0
Rating          0
Availability    0
Product URL     0
dtype: int64


In [None]:
#no log file on disk: report that and skip reading
if not os.path.isfile("scraping_errors.log"):
    print("Errors logged: False (log file not found, so no errors logged)")
else:
    #read every line of the log; any line at all means an error was recorded
    with open("scraping_errors.log", "r") as file:
        errors = file.readlines()
    print("Errors logged:", bool(errors))

Errors logged: False (log file not found, so no errors logged)
