In [1]:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

In [2]:
from concurrent.futures import ThreadPoolExecutor

def extract_product_links(page_url, timeout=30):
    """Collect the absolute product-page URLs listed on one catalogue page.

    Args:
        page_url: URL of a catalogue listing page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block a worker thread forever.

    Returns:
        A list of absolute URLs, one per ``<article class="product_pod">``
        card found on the page, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning zero links.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Each card's <h3><a href="..."> holds a link relative to the listing
    # page, so it is resolved against page_url to get an absolute URL.
    links = [
        urljoin(page_url, card.find("h3").find("a")["href"])
        for card in soup.find_all("article", class_="product_pod")
    ]

    print(f"Scraped {len(links)} products from {page_url}")
    return links


In [3]:
index_page_url = "https://books.toscrape.com/catalogue/page-1.html"

# Step 1: build the catalogue listing URLs (page-1 ... page-50) exactly once.
page_urls = [f"https://books.toscrape.com/catalogue/page-{i}.html" for i in range(1, 51)]

# Step 2: scrape the listing pages in parallel using ThreadPoolExecutor.map
product_links = []

with ThreadPoolExecutor(max_workers=16) as executor:
    # map() yields results in the order of page_urls, even though the
    # worker threads may finish (and print) in a different order.
    results = executor.map(extract_product_links, page_urls)

    for page_links in results:
        product_links.extend(page_links)

print(f"\nTotal product links collected: {len(product_links)}")


Scraped 20 products from https://books.toscrape.com/catalogue/page-10.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-3.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-2.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-9.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-16.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-6.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-12.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-1.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-11.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-15.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-7.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-14.html
Scraped 20 products from https://books.toscrape.com/catalogue/page-5.html
Scraped 20 products from https:/

In [4]:
def get_product_details(product_url, timeout=30):
    """Scrape one product page and return its fields as a dictionary.

    Args:
        product_url: Absolute URL of a product detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``product_link``, ``image_link``, ``category``,
        ``title``, ``price``, ``availability``, ``rating``, ``description``
        and ``product_info``. All scalar values are strings as they appear on
        the page; ``product_info`` maps each header cell of the striped
        information table to the text of its value cell.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        AttributeError, TypeError, IndexError: If an expected element is
            missing from the page, because the lookups are not guarded.
    """
    response = requests.get(product_url, timeout=timeout)
    # Surface HTTP errors directly rather than as an opaque lookup failure
    # on an error page further down.
    response.raise_for_status()
    product_soup = BeautifulSoup(response.content, "html.parser")

    image_link = product_soup.find("div", class_="item active").find("img")["src"]
    # The third breadcrumb entry is used as the category.
    category = product_soup.find("ul", class_="breadcrumb").find_all("li")[2].get_text().strip()
    title = product_soup.find("div", class_="product_main").find("h1").get_text().strip()
    price = product_soup.find("p", class_="price_color").get_text().strip()
    availability = product_soup.find("p", class_="instock availability").get_text().strip()
    # The rating is taken from the second CSS class on the star-rating <p>.
    rating = product_soup.find("p", class_="star-rating")["class"][1]
    # Takes the first <p> that follows the first "sub-header" div.
    # NOTE(review): presumably that div is the description heading; confirm
    # what this picks up on pages that have no description.
    description = product_soup.find("div", class_="sub-header").find_next("p").get_text().strip()

    table = product_soup.find("table", class_="table table-striped")
    product_info = {
        row.find("th").get_text().strip(): row.find("td").get_text().strip()
        for row in table.find_all("tr")
    }

    return {
        "product_link": product_url,
        # The image src is relative to the product page.
        "image_link": urljoin(product_url, image_link),
        "category": category,
        "title": title,
        "price": price,
        "availability": availability,
        "rating": rating,
        "description": description,
        "product_info": product_info,
    }


In [5]:
# Fetch product details in slices of 80 links; each slice gets its own
# 16-thread pool and a progress line once it is finished.
product_details = []

for i in range(0, len(product_links), 80):
    batch_links = product_links[i:i+80]

    with ThreadPoolExecutor(max_workers=16) as executor:
        # map() yields results in input order, so product_details stays
        # aligned with product_links.
        product_details.extend(executor.map(get_product_details, batch_links))

    print(f"Processed batch {i//80 + 1}: Collected details for {len(batch_links)} products. Remaining: {len(product_links) - (i + len(batch_links))}")

Processed batch 1: Collected details for 80 products. Remaining: 920
Processed batch 2: Collected details for 80 products. Remaining: 840
Processed batch 3: Collected details for 80 products. Remaining: 760
Processed batch 4: Collected details for 80 products. Remaining: 680
Processed batch 5: Collected details for 80 products. Remaining: 600
Processed batch 6: Collected details for 80 products. Remaining: 520
Processed batch 7: Collected details for 80 products. Remaining: 440
Processed batch 8: Collected details for 80 products. Remaining: 360
Processed batch 9: Collected details for 80 products. Remaining: 280
Processed batch 10: Collected details for 80 products. Remaining: 200
Processed batch 11: Collected details for 80 products. Remaining: 120
Processed batch 12: Collected details for 80 products. Remaining: 40
Processed batch 13: Collected details for 40 products. Remaining: 0


In [8]:
# Sanity check: print the first scraped record to eyeball its fields.
print(product_details[0])

{'product_link': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html', 'image_link': 'https://books.toscrape.com/media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg', 'category': 'Poetry', 'title': 'A Light in the Attic', 'price': '£51.77', 'availability': 'In stock (22 available)', 'rating': 'Three', 'description': "It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced

In [None]:
import getpass
import os

import psycopg2

# Connection settings are read from libpq-style environment variables, with
# the local development values as fallbacks. The password is deliberately
# NOT given a fallback: it comes from PGPASSWORD or an interactive prompt,
# so no credential is stored in the notebook.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "bs4_books_scraped"),
    user=os.environ.get("PGUSER", "rohitagarwal"),
    password=os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: "),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

cur = conn.cursor()

In [None]:
# DDL for the destination table. `IF NOT EXISTS` means re-running this cell
# does not fail when the table is already there.
# Every scraped field is declared TEXT, so values such as price are stored
# as the strings produced by the scraper rather than as numeric types.
create_table_query = """
CREATE TABLE IF NOT EXISTS books (
    id SERIAL PRIMARY KEY,
    product_link TEXT,
    image_link TEXT,
    category TEXT,
    title TEXT,
    price TEXT,
    availability TEXT,
    rating TEXT,
    description TEXT,
    upc TEXT,
    product_type TEXT,
    price_excl_tax TEXT,
    price_incl_tax TEXT,
    tax TEXT,
    number_of_reviews TEXT
);
"""

# Run the DDL and commit it.
cur.execute(create_table_query)
conn.commit()

In [None]:
# Parameterized INSERT: values are passed separately to cur.execute, never
# formatted into the SQL string.
insert_query = """
INSERT INTO books (
    product_link, image_link, category, title, price, availability, rating, description, upc, product_type, price_excl_tax, price_incl_tax, tax, number_of_reviews
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""

# Keys of each record's "product_info" dict, listed in the order of the
# upc ... number_of_reviews columns in insert_query.
product_info_keys = (
    "UPC",
    "Product Type",
    "Price (excl. tax)",
    "Price (incl. tax)",
    "Tax",
    "Number of reviews",
)

try:
    for book in product_details:
        product_info = book["product_info"]
        cur.execute(insert_query, (
            book["product_link"],
            book["image_link"],
            book["category"],
            book["title"],
            book["price"],
            book["availability"],
            book["rating"],
            book["description"],
            # A missing info-table entry is stored as an empty string.
            *(product_info.get(key, "") for key in product_info_keys),
        ))

        print(f"Inserted book: {book['title']}")
    # All rows are committed together in one transaction.
    conn.commit()
except Exception:
    # A failed statement leaves the transaction aborted; roll back so the
    # connection stays usable for later cells, then re-raise the error.
    conn.rollback()
    raise

In [None]:
# Removes every row from the books table and commits the deletion.
# NOTE(review): this cell sits after the insert cell, so running the notebook
# top to bottom wipes the rows that were just inserted. Presumably it is a
# manual reset helper; confirm, and consider moving it before the insert.
delete_query = "DELETE FROM books;"
cur.execute(delete_query)
conn.commit()

In [40]:
# A browser-like User-Agent header is sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# The timeout keeps a stalled connection from hanging the kernel forever.
snapdeal_page = requests.get(
    "https://www.snapdeal.com/products/electronics-bluetooth-speakers?sort=plrty",
    headers=headers,
    timeout=30,
)
# Stop here on a 4xx/5xx response instead of parsing and saving an error page.
snapdeal_page.raise_for_status()
snapdeal_soup = BeautifulSoup(snapdeal_page.text, "html.parser")

# Save a prettified copy of the page so its markup can be inspected offline.
with open("snapdeal_page.html", "w", encoding="utf-8") as f:
    f.write(snapdeal_soup.prettify())

In [52]:
# Each product tile on the listing page is a div with this exact class string.
products = snapdeal_soup.find_all("div", class_="col-xs-6 favDp product-tuple-listing js-tuple")

print(len(products))

# Take the href of the product-page anchor inside every tile.
products_links = [
    product.find("a", class_="dp-widget-link noUdLine")["href"]
    for product in products
]

len(products_links)

19


19