In [3]:
!pip install playwright nest_asyncio
!playwright install chromium
!apt-get install libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 libxcomposite1


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libatk-bridge2.0-0 is already the newest version (2.38.0-3).
libatk1.0-0 is already the newest version (2.36.0-3build1).
libatspi2.0-0 is already the newest version (2.44.0-3).
libxcomposite1 is already the newest version (1:0.4.5-1build2).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [2]:
import nest_asyncio
nest_asyncio.apply()

import asyncio, json, csv, re
from pathlib import Path
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Root URL of the catalogue; the crawl starts here and relative links on the
# first listing page are resolved against it.
BASE = "https://books.toscrape.com/"

# Word found in a book's `star-rating` CSS class -> numeric rating (1..5).
RATING_MAP = {
    word: stars
    for stars, word in enumerate(("One", "Two", "Three", "Four", "Five"), start=1)
}

def parse_price(s):
    """Extract the first numeric amount from a price string as a float.

    Currency symbols and any other surrounding text are ignored. Amounts
    written with comma thousands separators ("1,234.56") are parsed as a
    whole number rather than being cut off at the first comma.

    Args:
        s: Price text such as "£51.77"; may be None or empty.

    Returns:
        The amount as a float, or None when `s` is empty/None or contains
        no digits.
    """
    if not s:
        return None
    # First alternative: comma-grouped thousands; second: plain digits.
    m = re.search(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?", s)
    return float(m.group(0).replace(",", "")) if m else None

def parse_stock(s):
    """Pull the first run of digits out of an availability string.

    Args:
        s: Availability text, e.g. "In stock (22 available)"; may be None.

    Returns:
        The first integer found in `s`, or 0 when `s` is empty/None or
        holds no digits.
    """
    digits = re.search(r"(\d+)", s) if s else None
    if digits is None:
        return 0
    return int(digits.group(1))

async def _text_of(element):
    """Return the stripped text content of `element`, or "" if it is None."""
    if element is None:
        return ""
    return ((await element.text_content()) or "").strip()


async def _scrape_detail(ctx, product_url):
    """Open one product page in a temporary tab and read its details.

    Args:
        ctx: Playwright browser context used to open the temporary page.
        product_url: Absolute URL of the product page.

    Returns:
        A tuple `(prod_info, description, image_url)` where `prod_info` is a
        dict built from the th/td pairs of the product information table.
        On any failure the error is reported and `({}, "", None)` is
        returned so one bad page does not abort the crawl.
    """
    detail = await ctx.new_page()
    try:
        await detail.goto(product_url, timeout=60000)

        prod_info = {}
        for tr in await detail.query_selector_all("table.table.table-striped tr"):
            th = await tr.query_selector("th")
            td = await tr.query_selector("td")
            if th and td:
                prod_info[await _text_of(th)] = await _text_of(td)

        # The description is the <p> immediately following the heading block.
        description = await _text_of(
            await detail.query_selector("#product_description + p")
        )

        img = await detail.query_selector(".carousel img")
        img_src = await img.get_attribute("src") if img else None
        image_url = urljoin(product_url, img_src) if img_src else None

        return prod_info, description, image_url
    except Exception as exc:
        # Best effort: keep the listing-level data, but say what was lost.
        print(f"   ⚠ Could not scrape details for {product_url}: {exc!r}")
        return {}, "", None
    finally:
        await detail.close()


async def scrape_all_books():
    """Crawl every listing page starting at BASE and scrape each book.

    For each `article.product_pod` card the title, URL, price and rating are
    read from the listing, then the product page is opened for the
    information table, description and image. The stock count is taken from
    the product page's "Availability" row when present (the listing card's
    availability text is used only as a fallback).

    Returns:
        A list of dicts, one per book, with the keys `title`, `product_url`,
        `price`, `price_text`, `rating`, `stock`, `upc`, `description`,
        `image_url`, plus one key per row of the product information table.
        Rows whose detail page failed to load lack the table keys.
    """
    rows = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context()
            page = await ctx.new_page()

            current_url = BASE
            page_num = 1

            print("🔥 Starting scraper… going to homepage...\n")
            await page.goto(BASE, timeout=60000)

            while True:
                await page.wait_for_selector("article.product_pod", timeout=30000)

                print(f"📄 Scraping page {page_num} ...")

                for card in await page.query_selector_all("article.product_pod"):
                    link = await card.query_selector("h3 > a")
                    rel_url = await link.get_attribute("href") if link else None
                    if not rel_url:
                        # Without a link there is nothing to identify the book by.
                        print(f"   ⚠ Skipping a card without a product link on page {page_num}")
                        continue

                    title = await link.get_attribute("title")
                    product_url = urljoin(current_url, rel_url)

                    price_text = await _text_of(await card.query_selector("p.price_color"))
                    price = parse_price(price_text)
                    avail_text = await _text_of(
                        await card.query_selector("p.instock.availability")
                    )

                    star = await card.query_selector(".star-rating")
                    rating_class = (await star.get_attribute("class")) if star else None
                    # The rating is encoded as a word in the class list.
                    rating = next(
                        (RATING_MAP[k] for k in RATING_MAP if k in (rating_class or "")),
                        0,
                    )

                    prod_info, description, image_url = await _scrape_detail(
                        ctx, product_url
                    )

                    # Prefer the detail table's availability text for the count.
                    stock = parse_stock(prod_info.get("Availability") or avail_text)

                    rows.append({
                        "title": title,
                        "product_url": product_url,
                        "price": price,
                        "price_text": price_text,
                        "rating": rating,
                        "stock": stock,
                        "upc": prod_info.get("UPC"),
                        "description": description,
                        "image_url": image_url,
                        **prod_info
                    })

                    # Print every 20 books for reassurance
                    if len(rows) % 20 == 0:
                        print(f"   ➤ Scraped {len(rows)} books so far… still going strong")

                print(f"✅ Completed page {page_num}\n")

                # Follow the pagination link; stop when there is none.
                next_a = await page.query_selector("li.next > a")
                next_href = await next_a.get_attribute("href") if next_a else None
                if not next_href:
                    break
                current_url = urljoin(current_url, next_href)
                page_num += 1

                await page.goto(current_url, timeout=60000)
                await asyncio.sleep(0.4)  # short pause between listing pages
        finally:
            # Release the browser even if a navigation or selector wait fails.
            await browser.close()

    print(f"\n🎉 All done! Total books scraped = {len(rows)}")
    return rows


# Run the scraper on the notebook's event loop (made re-entrant by
# nest_asyncio.apply() at the top of the cell).
data = asyncio.get_event_loop().run_until_complete(scrape_all_books())

print(f"\n📦 Final: collected {len(data)} books!\n")

# Save output
Path("output").mkdir(exist_ok=True)
csv_path = Path("output/books_all.csv")
json_path = Path("output/books_all.json")

if data:
    # Rows can have different keys (the product-table columns are missing when
    # a detail page failed), so build the header from the union of all keys in
    # first-seen order instead of trusting the first row alone.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data)
    print(f"💾 Saved CSV → {csv_path}")
else:
    # Do not claim a file was saved when nothing was written.
    print("⚠ No books collected; CSV not written")

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"💾 Saved JSON → {json_path}")


🔥 Starting scraper… going to homepage...

📄 Scraping page 1 ...
   ➤ Scraped 20 books so far… still going strong
✅ Completed page 1

📄 Scraping page 2 ...
   ➤ Scraped 40 books so far… still going strong
✅ Completed page 2

📄 Scraping page 3 ...
   ➤ Scraped 60 books so far… still going strong
✅ Completed page 3

📄 Scraping page 4 ...
   ➤ Scraped 80 books so far… still going strong
✅ Completed page 4

📄 Scraping page 5 ...
   ➤ Scraped 100 books so far… still going strong
✅ Completed page 5

📄 Scraping page 6 ...
   ➤ Scraped 120 books so far… still going strong
✅ Completed page 6

📄 Scraping page 7 ...
   ➤ Scraped 140 books so far… still going strong
✅ Completed page 7

📄 Scraping page 8 ...
   ➤ Scraped 160 books so far… still going strong
✅ Completed page 8

📄 Scraping page 9 ...
   ➤ Scraped 180 books so far… still going strong
✅ Completed page 9

📄 Scraping page 10 ...
   ➤ Scraped 200 boo