In [1]:
import asyncio
import json
import os
import random
import time
from urllib.parse import quote_plus

import dotenv
from playwright.async_api import async_playwright

In [2]:
# Configuration: values read by the later cells of this notebook.
# Page opened right after the browser is started.
amazon_home = "https://www.amazon.com/"
# Search keywords; the scraping cell handles each one separately and writes
# "<keyword>_reviews.json" for it.
products_to_search = ['hello bello']
# Other brands that can be appended to the list above:
# 'MyPura', 'Rascals', 'believe baby', 'parasol', 'millie moon', 'ecoiginals', 'everylife'
max_items = 20 # maximum number of products to scrape for each search keyword

In [3]:
# Load environment variables via python-dotenv. login() later reads
# AMAZON_EMAIL and AMAZON_PASSWORD with os.getenv.
# `env` keeps load_dotenv()'s return value.
env = dotenv.load_dotenv()

In [4]:
# Launch Browser
async def start_browser(headless=False):
    """Start Playwright and open a Chromium browser with one context and one page.

    Args:
        headless: Forwarded to ``chromium.launch``. Defaults to ``False`` so the
            browser window is visible, as before.

    Returns:
        The tuple ``(p, browser, context, page)``: the started Playwright
        object, the launched browser, a new browser context and a new page.

    Raises:
        Exception: Any error raised while launching the browser or creating the
            context/page is re-raised after the Playwright driver is stopped.
    """
    p = await async_playwright().start()
    try:
        browser = await p.chromium.launch(headless=headless)
        context = await browser.new_context()
        page = await context.new_page()
    except Exception:
        # The driver was started explicitly (no `async with`), so it must be
        # stopped explicitly or it stays behind when start-up fails.
        await p.stop()
        raise
    return p, browser, context, page

# Execute this block to start the browser.
# p, browser, context and page become notebook-level globals: login(),
# get_reviews(), the scraping loop and the final close cell all use them.
p, browser, context, page = await start_browser()
# Open the landing page configured in `amazon_home`; as the cell's last
# expression, the result of goto() is what the notebook displays.
await page.goto(amazon_home)

<Response url='https://www.amazon.com/' request=<Request url='https://www.amazon.com/' method='GET'>>

In [8]:
# Login Manually
async def login():
    """Sign in through the sign-in form of the notebook-global ``page``.

    Credentials are read from the ``AMAZON_EMAIL`` and ``AMAZON_PASSWORD``
    environment variables.

    Raises:
        RuntimeError: If either variable is unset or empty. The check runs
            before the page is touched, so a missing credential no longer
            surfaces as an obscure error from ``fill(None)`` halfway through
            the sign-in flow.
    """
    credentials = {
        "AMAZON_EMAIL": os.getenv("AMAZON_EMAIL"),
        "AMAZON_PASSWORD": os.getenv("AMAZON_PASSWORD"),
    }
    missing = [name for name, value in credentials.items() if not value]
    if missing:
        raise RuntimeError(
            "Missing environment variable(s): " + ", ".join(missing)
        )

    # click on sign in button
    sign_in_button = await page.wait_for_selector("//a[@data-nav-role='signin']")
    await sign_in_button.click()

    # enter email
    email_input = await page.wait_for_selector("input[type='email']")
    await email_input.fill(credentials["AMAZON_EMAIL"])

    # click on continue
    continue_button = await page.wait_for_selector("//input[@id='continue']")
    await continue_button.click()

    # enter password
    password_input = await page.wait_for_selector("input[type='password']")
    await password_input.fill(credentials["AMAZON_PASSWORD"])

    # click on sign in
    sign_in_button = await page.wait_for_selector("//input[@id='signInSubmit']")
    await sign_in_button.click()

# Execute this block to log in: login() drives the sign-in form on the global
# `page` using the AMAZON_EMAIL / AMAZON_PASSWORD environment variables.
await login()

CancelledError: 

In [6]:
# create output json file
def save_reviews(reviews, filename="reviews.json"):
    """Merge ``reviews`` into the JSON object stored in ``filename`` and rewrite it.

    The existing file content is loaded first (when the file exists) and
    ``reviews`` is applied on top with ``dict.update``, so top-level keys in
    ``reviews`` replace keys of the same name already in the file.

    Args:
        reviews: Mapping of top-level keys to JSON-serialisable values.
        filename: Path of the JSON file to create or update.

    Notes:
        * A file that is empty, is not valid JSON, or whose top-level value is
          not a JSON object is treated as having no previous data.
        * The result is written to ``<filename>.tmp`` and then moved over
          ``filename`` with ``os.replace``, so an interrupted run cannot leave
          a half-written output file behind.
    """
    existing_data = {}
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as file:
            try:
                loaded = json.load(file)
            except (json.JSONDecodeError, UnicodeDecodeError):
                loaded = None
        # Only a JSON object can be merged with update(); anything else
        # (list, string, number, null) is discarded.
        if isinstance(loaded, dict):
            existing_data = loaded

    existing_data.update(reviews)

    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, "w", encoding="utf-8") as file:
        json.dump(existing_data, file, indent=4)
    os.replace(tmp_filename, filename)
        
# XPath prefix matching everything below the N-th item of the review list.
_REVIEW_ITEM_XPATH = "(//div[@id='cm_cr-review_list']//ul//li)[{index}]//*"


async def _review_text(item_xpath, relative_xpath):
    """Return the inner text of ``item_xpath + relative_xpath`` or "" if nothing matches."""
    element = await page.query_selector(item_xpath + relative_xpath)
    return await element.inner_text() if element else ""


async def _select_star_filter(star_count):
    """Pick ``star_count`` in the star-rating dropdown; return True when it was clicked."""
    for _ in range(20):
        await page.evaluate("window.scrollBy(0, 1000)")
        await asyncio.sleep(0.5)

        stars_filter_select = await page.query_selector("//div[@class='star-rating-select']")
        if not stars_filter_select:
            continue
        await stars_filter_select.click()

        current_star = await page.query_selector(
            f"//li[@aria-labelledby='star-count-dropdown_{star_count}']"
        )
        if current_star:
            await current_star.click()
            return True
        print(f"Star rating {star_count} not found.")
        return False

    print(f"Star rating filter not found; skipping {star_count}-star reviews.")
    return False


async def get_reviews(url, product_keyword='', max_reviews_per_star=float("inf")):
    """Open a product page on the global ``page`` and collect its reviews per star rating.

    Args:
        url: Product page URL to open.
        product_keyword: When non-empty, the product is only scraped if this
            keyword occurs (case-insensitively) in the product title.
        max_reviews_per_star: Upper bound on the number of reviews collected
            for each star rating. Defaults to no limit.

    Returns:
        A dict with ``"product_name"`` and ``"url"`` (the URL without its query
        string). When the keyword check passes it also contains ``"reviews"``,
        a list of dicts with the keys ``name``, ``title``, ``date``, ``stars``,
        ``product_specs``, ``verified_purchase``, ``helpful_statement`` and
        ``review``. When the keyword check fails the ``"reviews"`` key is
        absent, which callers use to detect a skipped product.

    Notes:
        * All pauses use ``asyncio.sleep`` so the event loop keeps running
          while waiting; ``time.sleep`` would block it.
        * If the filter for a star rating cannot be selected, that rating is
          skipped instead of re-scraping whatever list is currently shown,
          which would add duplicate reviews.
    """
    review_dictionary = {}

    await page.goto(url)

    # Get product name
    title_element = await page.query_selector("span#productTitle")
    product_name = await title_element.inner_text() if title_element else ""

    review_dictionary["product_name"] = product_name
    review_dictionary["url"] = url.split("?")[0]

    if product_keyword != '' and product_keyword.lower() not in product_name.lower():
        return review_dictionary

    print(f"Product Name = {product_name}")

    # Scroll down until the "see all reviews" link shows up, then follow it.
    for _ in range(20):
        await page.evaluate("window.scrollBy(0, 1000)")
        await asyncio.sleep(0.5)

        review_more_link = await page.query_selector("//a[@data-hook='see-all-reviews-link-foot']")
        if review_more_link:
            await review_more_link.scroll_into_view_if_needed()
            await review_more_link.click()
            break

    reviews = []

    for stars_to_scrape in range(1, 6):  # Iterate through 1-star to 5-star reviews
        print(f"Scraping {stars_to_scrape}-star reviews")

        if not await _select_star_filter(stars_to_scrape):
            continue
        # Give the filtered list a moment to replace the previous one before
        # reading it (same pause as after a pagination click).
        await asyncio.sleep(random.randint(1, 3))

        review_index = 1
        current_page = 1
        scraped_for_star = 0

        while scraped_for_star < max_reviews_per_star:
            print(f'Getting review {review_index} on page {current_page}')

            item_xpath = _REVIEW_ITEM_XPATH.format(index=review_index)

            # A missing reviewer name marks the end of the current page.
            name_element = await page.query_selector(
                f"{item_xpath}//a[@class='a-profile']/div[2]/span"
            )
            if not name_element:
                next_page_button = await page.query_selector(
                    "//ul[@class='a-pagination'][1]//li[@class='a-last']"
                )
                if next_page_button:
                    await next_page_button.click()
                    await asyncio.sleep(random.randint(1, 3))
                    review_index = 1
                    current_page += 1
                    print(f"Moving to next page")
                    continue
                print(f"Finished scraping {stars_to_scrape}-star reviews.")
                break

            current_review = {"name": await name_element.inner_text()}
            current_review["title"] = await _review_text(
                item_xpath, "//a[@data-hook='review-title']//span[2]"
            )
            current_review["date"] = await _review_text(
                item_xpath, "//span[@data-hook='review-date']"
            )

            # Keep only the first space-separated token of the rating text.
            stars_text = await _review_text(item_xpath, "//i[@data-hook='review-star-rating']")
            current_review["stars"] = str(stars_text).split(" ")[0]

            current_review["product_specs"] = await _review_text(
                item_xpath, "//a[@data-hook='format-strip']"
            )

            verified_purchase = await page.query_selector(
                f"{item_xpath}//span[@data-hook='avp-badge']"
            )
            current_review["verified_purchase"] = bool(verified_purchase)

            # Expand truncated reviews before reading the body.
            read_more_button = await page.query_selector(
                f"{item_xpath}//a[@aria-label='Read more of this review']"
            )
            if read_more_button:
                await read_more_button.click()
                await asyncio.sleep(1)  # Ensure content loads

            current_review["helpful_statement"] = await _review_text(
                item_xpath, "//span[@data-hook='helpful-vote-statement']"
            )
            current_review["review"] = await _review_text(
                item_xpath, "//span[@data-hook='review-body']"
            )

            reviews.append(current_review)
            review_index += 1
            scraped_for_star += 1
            await asyncio.sleep(random.randint(0, 1))

    review_dictionary["reviews"] = reviews
    return review_dictionary


In [7]:
# Scrape products with given name
for product_to_search in products_to_search:

    # product_to_search = product_to_search + ' diapers'

    product_reviews = {
        "products": []
    }

    product_id = 1

    while product_id <= max_items:

        # go to product search page
        await page.goto(f"https://www.amazon.com/s?k={product_to_search.replace(' ', '+')}+baby+diaper")   

        # select sort by best sellers
        if True:
            try:
                sort_select_xpath = "//select[@id='s-result-sort-select']//..//..//..//form"
                sort_select = await page.query_selector(sort_select_xpath)
                await sort_select.click()

                # best sellers xpath
                best_sellers_xpath = "//a[@id='s-result-sort-select_3']"
                best_sellers = await page.query_selector(best_sellers_xpath)
                await best_sellers.click()
            except:
                print("Could not sort by best sellers")
            time.sleep(random.randint(2, 3))

        # click on product
        product_listing = f"(//div[@role='listitem'][{product_id}])[1]//a"
        product = await page.query_selector(product_listing)
        if product:
            await product.click()
        else:
            print(f"Product not found: id = {product_id}")
            product_id += 1
            continue

        # get page url
        url = page.url
        time.sleep(random.randint(1, 3))

        current_product_reviews = await get_reviews(url, product_to_search)
        if 'reviews' not in current_product_reviews:
            max_items += 1
            product_id += 1
            continue
        
        product_reviews['products'].append(current_product_reviews)

        # update in json file
        save_reviews(product_reviews, filename=f"{product_to_search}_reviews.json")

        product_id += 1

Product Name = Hello Bello Premium Swim Diapers I Affordable and Eco-Friendly Disposable Swim Dipes for Babies and Kids I Size Medium - Diaper Size 4-5
Scraping 1-star reviews
Getting review 1 on page 1
Getting review 2 on page 1
Getting review 3 on page 1
Getting review 4 on page 1
Getting review 5 on page 1
Getting review 6 on page 1
Getting review 7 on page 1
Getting review 8 on page 1
Getting review 9 on page 1
Getting review 10 on page 1
Getting review 11 on page 1
Moving to next page
Getting review 1 on page 2
Getting review 2 on page 2
Getting review 3 on page 2
Getting review 4 on page 2
Getting review 5 on page 2
Getting review 6 on page 2
Moving to next page
Getting review 1 on page 3
Getting review 2 on page 3
Getting review 3 on page 3
Getting review 4 on page 3
Getting review 5 on page 3
Getting review 6 on page 3
Getting review 7 on page 3
Getting review 8 on page 3
Getting review 9 on page 3
Getting review 10 on page 3
Getting review 11 on page 3
Moving to next page
Gett

In [10]:
# close
await browser.close()

Exception: Browser.close: Connection closed while reading from the driver