In [None]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Try: load the book URLs from the CSV

In [None]:
# Read the list of book page links. Column names are supplied explicitly,
# so the first row of the file is treated as data rather than as a header.
df = pd.read_csv(
    "jarir_bestsellers.csv",
    header=None,
    names=["book_link"],
)

## Try: scrape the details of each book

In [None]:
# Path of the CSV file that lists the book page URLs
INPUT_CSV = "jarir_bestsellers.csv"

# Starts out as an empty frame; scraped rows are added to it later on
all_books_df = pd.DataFrame()

# HTTP headers sent with every plain request: a desktop Firefox User-Agent,
# so the request looks like it comes from a regular browser
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) "
        "Gecko/20100101 Firefox/122.0"
    ),
}

# Chrome session driven through Selenium
driver = webdriver.Chrome()

def _tag_text(tag, default="Null"):
    """Return ``tag.text`` without surrounding whitespace, or ``default`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else default


def _wait_for_text(class_name, timeout=10):
    """Return the text of the first element carrying ``class_name`` on the page loaded in ``driver``.

    Waits up to ``timeout`` seconds for the element to be present. Returns
    "Null" when the wait or the text lookup fails.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name))
        )
        return element.text.strip()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that a keyboard /
        # kernel interrupt still stops a long scraping run.
        return "Null"


def get_book_data(url, request_timeout=30):
    """Scrape the details of one book page from Jarir's website.

    Static fields are parsed from the HTML fetched with ``requests``; rating
    and review count are read through the module-level Selenium ``driver``.

    Args:
        url: Address of the book page.
        request_timeout: Seconds to wait for the plain HTTP request before
            giving up, so a stalled connection cannot block the run forever.

    Returns:
        dict with the keys "Title", "Price", "Rating", "Num Of Reviews",
        "Author", "Book Type", "Genre" and "Cover Image" (in that order).
        A field that cannot be found is the string "Null" ("No Image" for
        the cover).

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """
    book_details = {}

    # Scrape static content using Requests + BeautifulSoup
    response = requests.get(url, headers=HEADERS, timeout=request_timeout)
    response.encoding = "utf-8"  # Force UTF-8 decoding of the body
    soup = BeautifulSoup(response.text, "html.parser")

    # Title
    book_details["Title"] = _tag_text(soup.find("h2", class_="product-title__title"))

    # Price: text of the last <span> nested inside the price container
    price_container = soup.find("span", class_="price_alignment")
    price_spans = price_container.find_all("span") if price_container is not None else []
    book_details["Price"] = _tag_text(price_spans[-1]) if price_spans else "Null"

    # Rating and review count are read from the browser-rendered page
    driver.get(url)
    time.sleep(3)  # Allow time for JavaScript to load
    book_details["Rating"] = _wait_for_text("tf-rating")
    book_details["Num Of Reviews"] = _wait_for_text("tf-count")

    # Author: the first "cl-blue" span following the "Author:" label.
    # The span itself may be missing even when the label exists.
    author_label = soup.find("b", string="Author:")
    author_value = (
        author_label.find_next("span", class_="cl-blue")
        if author_label is not None
        else None
    )
    book_details["Author"] = _tag_text(author_value)

    # Book type: the first span following the "Format:" label
    format_label = soup.find("b", string="Format:")
    format_value = format_label.find_next("span") if format_label is not None else None
    book_details["Book Type"] = _tag_text(format_value)

    # Genre: every non-empty "cl-blue" span inside the span that follows the
    # "Book classification:" label, joined with ", "
    classification_label = soup.find("b", string="Book classification:")
    genre_container = (
        classification_label.find_next("span")
        if classification_label is not None
        else None
    )
    genres = []
    if genre_container is not None:
        genres = [
            span.text.strip()
            for span in genre_container.find_all("span", class_="cl-blue")
            if span.text.strip()
        ]
    book_details["Genre"] = ", ".join(genres) if genres else "Null"

    # Cover image: taken from the second matching <img>
    # (NOTE(review): presumably the first match is not the cover - confirm).
    image_tags = soup.find_all("img", class_="image image--contain")
    raw_image_url = image_tags[1].get("src") if len(image_tags) > 1 else None
    if raw_image_url:
        # Ask for a larger rendition by swapping the width query value
        book_details["Cover Image"] = raw_image_url.replace("width=54", "width=350")
    else:
        book_details["Cover Image"] = "No Image"

    return book_details


# Read the URLs from the CSV and scrape every book page.
# Column names are supplied explicitly, so every row of the file is data.
df = pd.read_csv(INPUT_CSV, names=["book_link"], encoding="utf-8")

# Rows are collected in a plain list and turned into a DataFrame once,
# instead of re-copying the whole frame with pd.concat on every iteration.
book_records = []
try:
    for i, url in enumerate(df["book_link"].dropna(), start=1):  # Skip missing links
        print(i, f"Scraping: {url}")
        book_records.append(get_book_data(url))
        time.sleep(3)  # Delay to prevent request blocking
finally:
    # Runs on success and on failure: keep whatever was scraped so far and
    # always shut the browser down so no Chrome process is left behind.
    all_books_df = pd.DataFrame(book_records)
    driver.quit()

# Drop the URL column if present (not needed in final output)
all_books_df = all_books_df.drop(columns=["URL"], errors="ignore")