In [34]:
import os
import re
import time
from difflib import SequenceMatcher

import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- Endpoints -------------------------------------------------------------
BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = BASE_URL + "catalogue/page-{}.html"  # placeholder: 1-based page number
GOOGLE_BOOKS_API = "https://www.googleapis.com/books/v1/volumes"
# SECURITY: credentials should not live in source control. The key is read
# from the BOOKSRUN_API_KEY environment variable; the literal below is kept
# only as a backward-compatible fallback and should be rotated and removed.
BOOKSRUN_API_KEY = os.environ.get("BOOKSRUN_API_KEY", "5xxn82dd5zz7hctfjf90")
BOOKSRUN_API_URL = "https://booksrun.com/api/v3/price/buy/{}?key={}"  # placeholders: ISBN, API key
HEADERS = {"User-Agent": "Mozilla/5.0"}

# --- Tunables --------------------------------------------------------------
SIMILARITY_THRESHOLD = 0.7  # minimum title similarity ratio to accept a Google Books hit
SLEEP_TIME = 1.5  # seconds to pause after processing each book
OUTPUT_FILE = "booksrun_price_comparison.csv"
TOTAL_PAGES = 50  # number of catalogue pages requested by scrape_books()

def clean(text):
    """Normalise *text* for fuzzy title comparison.

    Lower-cases the input, removes every character that is not a-z, 0-9 or
    a space, and trims leading/trailing whitespace.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9 ]", "", lowered)
    return alnum_only.strip()

def similarity(a, b):
    """Return the difflib similarity ratio of *a* and *b* (0.0 to 1.0)."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

def clean_price(price_text):
    """Convert a displayed price to a float.

    Everything except digits and dots (currency symbols, thousands
    separators, stray characters) is removed before conversion.
    """
    numeric_part = re.sub(r"[^\d.]", "", price_text)
    return float(numeric_part)

# ---------- Catalogue scraping ----------
def scrape_books():
    """Scrape the title and price of every book on the catalogue pages.

    Requests pages 1..TOTAL_PAGES of CATALOGUE_URL in order. A page that
    cannot be fetched (network error, timeout or non-200 status) is reported
    and skipped so a single bad page does not abort the whole run.

    Returns:
        list[dict]: one ``{"title": str, "our_price": float}`` entry per
        product found, in page order.
    """
    books = []
    for page in range(1, TOTAL_PAGES + 1):
        print(f"Scraping page {page}...")
        try:
            # A timeout keeps one stalled connection from hanging the run.
            r = requests.get(CATALOGUE_URL.format(page), headers=HEADERS, timeout=10)
        except requests.RequestException as e:
            print(f"Skipping page {page}: {e}")
            continue
        if r.status_code != 200:
            print(f"Skipping page {page}: HTTP {r.status_code}")
            continue

        # Parse the raw bytes so BeautifulSoup detects the encoding from the
        # document itself; r.text depends on the charset requests infers from
        # the response headers, which can garble non-ASCII titles.
        soup = BeautifulSoup(r.content, "html.parser")
        for article in soup.select("article.product_pod"):
            link = article.select_one("h3 a")
            price_tag = article.select_one("p.price_color")
            # Skip malformed product cards instead of crashing on them.
            if link is None or price_tag is None or not link.get("title"):
                continue

            books.append({
                "title": link["title"],
                "our_price": clean_price(price_tag.text),
            })

    print(f"Total books scraped: {len(books)}")
    return books


def google_books_search(title):
    """Search Google Books for volumes whose title matches *title*.

    Sends an ``intitle:`` query limited to 5 results.

    Returns:
        list: the ``items`` array of the response, or an empty list when the
        request fails, returns a non-200 status, or the body is not the
        expected JSON object.
    """
    params = {"q": f"intitle:{title}", "maxResults": 5}
    try:
        # A timeout keeps one stalled lookup from hanging the whole run.
        r = requests.get(GOOGLE_BOOKS_API, params=params, headers=HEADERS, timeout=10)
        if r.status_code != 200:
            return []
        payload = r.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers an invalid JSON body.
        print(f"Google Books error for '{title}': {e}")
        return []

    if not isinstance(payload, dict):
        return []
    return payload.get("items", [])

def extract_isbn13(item):
    """Collect the ISBN-13 identifiers listed on a Google Books volume.

    Reads ``item["volumeInfo"]["industryIdentifiers"]`` (both keys optional)
    and returns the ``identifier`` of every entry whose ``type`` is
    ``"ISBN_13"``, in their original order.
    """
    identifiers = item.get("volumeInfo", {}).get("industryIdentifiers", [])
    return [
        entry.get("identifier")
        for entry in identifiers
        if entry.get("type") == "ISBN_13"
    ]


def _first_offer_dict(value):
    """Return *value* as a dict: the first element of a list, else {} if unusable."""
    if isinstance(value, list):
        value = value[0] if value else {}
    return value if isinstance(value, dict) else {}


def _numeric_price(value):
    """Return a numeric price from a bare number or a ``{"price": number}`` dict, else None."""
    if isinstance(value, dict):
        value = value.get("price")
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return value


def fetch_booksrun_prices(isbn):
    """Look up BooksRun buy prices for *isbn*.

    Reads ``result.offers.booksrun`` (used / new / rent) and the ``used``
    offer of ``result.offers.marketplace``; when either is a list, only its
    first element is considered.

    NOTE(review): every price field is accepted either as a bare number or
    as an object with a numeric ``price`` key. Previously "used" required
    the object form and "new"/"rent" the bare-number form — confirm the
    actual shapes against the BooksRun API reference.

    Returns:
        dict | None: ``isbn13``, ``competitor_price`` (the lowest numeric
        price found), ``used_price``, ``new_price``, ``rental_price`` and
        ``marketplace_price`` (each a number or None). None when the request
        fails, the response lacks ``result.offers``, or no numeric price is
        present.
    """
    url = BOOKSRUN_API_URL.format(isbn, BOOKSRUN_API_KEY)
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"BooksRun error for ISBN {isbn}: {e}")
        return None

    # Validate the shape explicitly instead of relying on a catch-all except.
    result = data.get("result") if isinstance(data, dict) else None
    offers = result.get("offers") if isinstance(result, dict) else None
    if not isinstance(offers, dict):
        return None

    booksrun = _first_offer_dict(offers.get("booksrun"))
    marketplace = _first_offer_dict(offers.get("marketplace"))

    used_price = _numeric_price(booksrun.get("used"))
    new_price = _numeric_price(booksrun.get("new"))
    rental_price = _numeric_price(booksrun.get("rent"))
    marketplace_price = _numeric_price(marketplace.get("used"))

    prices = [
        p
        for p in (used_price, new_price, rental_price, marketplace_price)
        if p is not None
    ]
    if not prices:
        return None

    return {
        "isbn13": isbn,
        "competitor_price": min(prices),
        "used_price": used_price,
        "new_price": new_price,
        "rental_price": rental_price,
        "marketplace_price": marketplace_price,
    }


def _price_recommendation(our_price, competitor_price, percent_change=0.05):
    """Return (action, adjusted_price, profit) for undercutting the competitor by *percent_change*."""
    adjusted_price = round(competitor_price * (1 - percent_change), 2)
    # Label from the price we actually recommend: a competitor only slightly
    # above us still yields an undercut price *below* ours, which is a reduction.
    action = "increase_price" if adjusted_price > our_price else "reduce_price"
    profit = round(adjusted_price - our_price, 2)
    return action, adjusted_price, profit


def _first_priced_match(title, seen_isbns):
    """Return (binding, prices) for the first similar Google Books hit that has BooksRun prices, else None."""
    wanted = clean(title)
    for item in google_books_search(title):
        info = item.get("volumeInfo", {})
        if similarity(wanted, clean(info.get("title", ""))) < SIMILARITY_THRESHOLD:
            continue

        binding = info.get("printType", "UNKNOWN")
        for isbn in extract_isbn13(item):
            if isbn in seen_isbns:
                continue
            prices = fetch_booksrun_prices(isbn)
            if prices:
                # Remember priced ISBNs so another book cannot reuse them.
                seen_isbns.add(isbn)
                return binding, prices
    return None


def main():
    """Scrape the catalogue, price-match each book on BooksRun and write a CSV.

    For every scraped book, the Google Books results are filtered by title
    similarity and their ISBN-13s are looked up on BooksRun. The first ISBN
    that returns prices ends the search for that book, so each book
    contributes at most one row and the final "Total books with prices"
    count is a count of books. The recommended price undercuts the lowest
    competitor price by 5%; ``pricing_action`` says whether that is above or
    not above our current price, and ``profit`` is the recommended price
    minus our current price.

    The rows are written to OUTPUT_FILE.
    """
    books = scrape_books()
    results = []
    seen_isbns = set()

    for idx, book in enumerate(books, 1):
        title = book["title"]
        our_price = book["our_price"]

        print(f"\nProcessing ({idx}/{len(books)}): {title}")

        match = _first_priced_match(title, seen_isbns)
        if match is not None:
            binding, prices = match
            action, adjusted_price, profit = _price_recommendation(
                our_price, prices["competitor_price"]
            )

            results.append({
                "title": title,
                "binding": binding,
                "our_price": our_price,
                **prices,
                "pricing_action": action,
                "adjusted_price": adjusted_price,
                "profit": profit,
            })
            print(f"✔ PRICE FOUND → ISBN {prices['isbn13']}")

        # Pause between books to stay polite to the remote APIs.
        time.sleep(SLEEP_TIME)

    df = pd.DataFrame(results)
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n✅ Finished!")
    print(f"✅ Total books with prices: {len(df)}")
    print(f"✅ CSV saved: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Total boo