In [11]:
import time
import os
import pandas as pd
from datetime import datetime
import json
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [12]:
# Products to scrape: display name -> product page URL (amazon.in / amzn.in).
# Insertion order is the order in which the products are processed.
links = dict(
    [
        (
            "Apple iPhone 15",
            "https://www.amazon.in/dp/B0CHX3TW6X?ref=ods_ucc_kindle_B0CHX2WQLX&th=1",
        ),
        (
            "Apple 2023 MacBook Pro (16-inch, Apple M3 Pro chip with 12‑core CPU and 18‑core GPU, 36GB Unified Memory, 512GB) - Silver",
            "https://amzn.in/d/ib419CQ",
        ),
        (
            "OnePlus Nord 4 5G (Mercurial Silver, 8GB RAM, 256GB Storage)",
            "https://amzn.in/d/2KOJBxa",
        ),
        (
            "Sony WH-1000XM5 Best Active Noise Cancelling Wireless Bluetooth Over Ear Headphones with Mic for Clear Calling, up to 40 Hours Battery -Black",
            "https://amzn.in/d/dP5ATPJ",
        ),
    ]
)

In [13]:
# Directory that receives the page screenshots saved while scraping.
# `os` is already imported in the notebook's import cell, so it is not
# re-imported here. exist_ok=True makes the cell safe to re-run and removes the
# gap between "check that it exists" and "create it".
os.makedirs("screenshots", exist_ok=True)


In [18]:
# XPaths into the page's price widget. They are positional and therefore tied
# to the current page layout; a lookup that fails falls back to a sentinel.
_SELLING_PRICE_XPATH = (
    '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[3]/span[2]/span[2]'
)
_ORIGINAL_PRICE_XPATH = (
    '//*[@id="corePriceDisplay_desktop_feature_div"]/div[2]/span/span[1]/span[2]/span/span[2]'
)
_DISCOUNT_XPATH = '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[2]'


def _parse_price_text(text):
    """Turn a comma-grouped price string such as ``"1,29,900"`` into an int."""
    return int(text.strip().replace(",", ""))


def _read_price(driver, xpath):
    """Return the integer price found at ``xpath``, or ``0`` when unavailable."""
    try:
        return _parse_price_text(driver.find_element(By.XPATH, xpath).text)
    except Exception:
        # Element missing or text not a plain integer: 0 is the "unknown" sentinel.
        return 0


def _read_discount(driver, product_data):
    """Fill ``product_data["discount"]`` (and ``"rating"`` when applicable).

    ``"discount"`` is always set so callers can read it unconditionally; it
    stays ``0`` when the element cannot be read or when the element turns out
    to hold a star rating instead.
    """
    product_data["discount"] = 0
    try:
        badge_text = (
            driver.find_element(By.XPATH, _DISCOUNT_XPATH)
            .get_attribute("innerHTML")
            .strip()
        )
    except Exception:
        return

    lowered = badge_text.lower()
    if " out of 5 stars" in lowered:
        # The slot held a rating such as "4.5 out of 5 stars"; keep the number part.
        product_data["rating"] = lowered.split(" out of")[0].strip()
    else:
        product_data["discount"] = badge_text


def scrape_product_data(link, product_name, max_retries=3, wait_timeout=10):
    """Open ``link`` in headless Chrome and collect price and review data.

    Args:
        link: URL of the product page to load.
        product_name: Product label; used (spaces replaced by underscores) in
            the screenshot file names written under ``screenshots/``.
        max_retries: How many times to wait for the page's price markup,
            reloading the page after each failed wait.
        wait_timeout: Seconds each wait may take before it counts as failed.

    Returns:
        dict with the keys:
            ``"reviews"``        - list of stripped review-body texts (may be empty),
            ``"selling price"``  - int, ``0`` if it could not be read,
            ``"original price"`` - int, ``0`` if it could not be read,
            ``"discount"``       - raw innerHTML of the discount element, or ``0``,
            ``"rating"``         - only present when the discount element held
                                   an "... out of 5 stars" text,
            ``"date"``           - local date of the scrape as ``YYYY-MM-DD``.

    The browser is always shut down, even when scraping raises.
    """
    options = ChromeOptions()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--lang=en",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)

    driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(link)
        product_data = {"reviews": []}

        screenshot_prefix = f"screenshots/screenshot_{product_name.replace(' ', '_')}"

        wait = WebDriverWait(driver, wait_timeout)
        time.sleep(5)
        for attempt in range(max_retries):
            try:
                driver.save_screenshot(f"{screenshot_prefix}_retry_{attempt}.png")
                wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-offscreen")))
                break
            except Exception:
                # Price markup did not show up in time: reload and try again.
                print("retrying")
                driver.get(link)
                time.sleep(5)

        driver.save_screenshot(f"{screenshot_prefix}.png")

        product_data["selling price"] = _read_price(driver, _SELLING_PRICE_XPATH)
        product_data["original price"] = _read_price(driver, _ORIGINAL_PRICE_XPATH)
        _read_discount(driver, product_data)

        try:
            driver.find_element(By.CLASS_NAME, "a-icon-popover").click()
            time.sleep(1)
        except Exception:
            # Best effort: the popover is optional, scraping continues without it.
            pass

        try:
            review_nodes = driver.find_elements(
                By.XPATH, "//span[@data-hook='review-body']"
            )
            product_data["reviews"] = [node.text.strip() for node in review_nodes]
            # NOTE(review): this pause follows the extraction, so it does not
            # affect the data; presumably it throttles requests - confirm.
            time.sleep(3)
        except Exception as e:
            print(f"Error fetching reviews: {e}")
            product_data["reviews"] = []

        product_data["date"] = time.strftime("%Y-%m-%d")
        return product_data
    finally:
        # Always release the browser; otherwise a failed scrape leaks a
        # Chrome/chromedriver process.
        driver.quit()

# Initialize the CSV files if they don't exist
def initialize_csv(file_path, columns):
    """Create ``file_path`` as a header-only CSV unless it already has content.

    Args:
        file_path: Path of the CSV file to create.
        columns: Column names written as the header row.

    An existing non-empty file is left untouched so rows appended earlier are
    kept. A missing file, or a file that exists but is zero bytes long, gets
    just the header row; without this, rows appended later with
    ``header=False`` would end up in a CSV that has no header at all.
    """
    is_empty_file = os.path.isfile(file_path) and os.path.getsize(file_path) == 0
    if os.path.exists(file_path) and not is_empty_file:
        return
    pd.DataFrame(columns=columns).to_csv(file_path, index=False)

# Make sure both output files exist with their header rows before any
# rows are appended to them.
for csv_path, csv_columns in (
    ("review_data.csv", ["product_name", "reviews", "date"]),
    ("price_data.csv", ["product_name", "Price", "Discount", "Date"]),
):
    initialize_csv(csv_path, csv_columns)

# Scrape every product and append the results to the two CSV files.
for product_name, link in links.items():
    product_data = scrape_product_data(link, product_name)

    # One price row per product per run.
    price_data = {
        "product_name": product_name,
        "Price": product_data["selling price"],
        # The scraper may store a "rating" instead of a "discount"; fall back
        # to 0, the same sentinel it uses when the discount cannot be read,
        # rather than raising KeyError.
        "Discount": product_data.get("discount", 0),
        # Use the scrape date for both files so price and review rows from
        # the same run always carry the same date.
        "Date": product_data["date"],
    }
    pd.DataFrame([price_data]).to_csv("price_data.csv", mode="a", header=False, index=False)

    # One row per review, written with a single append instead of one
    # file open per review.
    review_rows = [
        {
            "product_name": product_name,
            "reviews": review,
            "date": product_data["date"],
        }
        for review in product_data["reviews"]
    ]
    if review_rows:
        pd.DataFrame(review_rows).to_csv("review_data.csv", mode="a", header=False, index=False)
