In [39]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import json
import time
from urllib.parse import urljoin

In [40]:
# Constants
BASE_URL = "http://books.toscrape.com/"
CACHE_FILE = "books.json"
books = []


In [41]:
# Load cache
def load_cache():
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r") as f:
            return json.load(f)
    return None

# Save cache
def save_cache(data):
    with open(CACHE_FILE, "w") as f:
        json.dump(data, f, indent=4)

In [42]:
# Fetch a single page and return parsed soup
def fetch_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        print("Valid, proceeding with scraping:", url)
        return bs(response.text, "html.parser")
    else:
        print("Bad request:", url)
        return None

In [43]:
def scrape_page(soup):
    for book in soup.select(".product_pod"):
        title = book.h3.a['title']
        price = book.select_one(".price_color").text.strip()
        stock = book.select_one(".availability").text.strip()
        rating = book.p['class'][1]

        books.append({
            'title': title,
            'price': price,
            'stock': stock,
            'rating': rating
        })
# Loop through all pages
def scrape_all_pages(start_url):
    url = start_url
    while url:
        soup = fetch_page(url)
        if soup is None:
            break

        scrape_page(soup)

        next_button = soup.select_one("li.next > a")
        if next_button:
            url = urljoin(url, next_button['href'])
            time.sleep(1)  # Be nice to the server
        else:
            url = None

# Main scraper function
def scraper():
    scrape_all_pages(BASE_URL)
    return books


In [44]:
# Execution starts here
cached_data = load_cache()
if cached_data:
    print("Loaded from cache")
else:
    print("Scraping website...")
    cached_data = scraper()
    if cached_data:
        save_cache(cached_data)
        print("Data saved to cache.")
    else:
        print("No valid data found.")

Scraping website...
Valid, proceeding with scraping: http://books.toscrape.com/
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-2.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-3.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-4.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-5.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-6.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-7.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-8.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-9.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-10.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-11.html
Valid, proceeding with scraping: http://books.toscrape.com/catalogue/page-12.html
Valid, proceeding with scr

In [None]:
# Convert to DataFrame and print
if cached_data:
    df = pd.DataFrame(cached_data)
    print(df.head())

                                   title    price     stock rating
0                   A Light in the Attic  Â£51.77  In stock  Three
1                     Tipping the Velvet  Â£53.74  In stock    One
2                             Soumission  Â£50.10  In stock    One
3                          Sharp Objects  Â£47.82  In stock   Four
4  Sapiens: A Brief History of Humankind  Â£54.23  In stock   Five


In [46]:
df.head()

Unnamed: 0,title,price,stock,rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five
