In [108]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import time
from datetime import date, datetime
import numpy as np
import random
import decimal
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import warnings
import dask
from dask import delayed

warnings.filterwarnings('ignore')
from selenium.common.exceptions import TimeoutException

In [109]:
# --- Initial configuration (listing table kept as a DataFrame) ---
# One row per Amazon listing to crawl: a human-readable category and its search URL.
amazon_listing_df = pd.DataFrame(
    [('Air Conditioner', 'https://www.amazon.in/s?rh=n%3A3474656031&fs=true')],
    columns=['category', 'url'],
)
# Number of result pages requested for every listing URL.
total_pages = 1
# De-duplicated listing URLs, in first-seen order.
amazon_listing_url = amazon_listing_df['url'].drop_duplicates().tolist()

In [110]:
# Browser helpers shared by the listing-page and product-page scrapers
def init_browser():
    """Create a headless, incognito Chrome driver.

    The driver is configured with the "AutomationControlled" blink feature
    disabled and the "enable-automation" switch excluded.

    Returns:
        A new selenium ``Chrome`` instance. This function never quits it;
        that is left to the caller.
    """
    chrome_opts = ChromeOptions()
    # Command-line switches, applied in the order listed.
    for switch in (
        "--headless",
        "--incognito",
        "start-maximized",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_opts.add_argument(switch)
    chrome_opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    return Chrome(options=chrome_opts)

def delete_cache(driver):
    """Clear the browser cache from a throwaway tab, then return to the caller's tab.

    The cache is cleared with the DevTools command ``Network.clearBrowserCache``,
    which needs no UI interaction. Earlier revisions drove the
    ``chrome://settings/clearBrowserData`` dialog with a TAB/DOWN/ENTER key
    sequence; that page is no longer opened, so the keystrokes only hit a blank
    tab and are intentionally not sent any more.

    Args:
        driver: A Chromium-based selenium driver (must support ``execute_cdp_cmd``).

    Raises:
        Whatever the driver raises. The scratch tab is still closed and focus
        is still restored when the DevTools command fails.
    """
    # Remember where the caller was so focus goes back to that exact tab,
    # not merely to whichever handle happens to be first in the list.
    original_handle = driver.current_window_handle

    driver.execute_script("window.open('')")  # scratch tab
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.execute_cdp_cmd("Network.clearBrowserCache", {})
    finally:
        # Always tidy up, otherwise a failed command leaves the driver
        # focused on a stray blank tab.
        driver.close()
        driver.switch_to.window(original_handle)

def perform_actions(driver, keys):
    """Queue ``keys`` on an ActionChains for ``driver``, wait one second, then send them.

    Args:
        driver: The selenium driver the key presses are sent through.
        keys: The key sequence passed to ``ActionChains.send_keys``.
    """
    key_chain = ActionChains(driver)
    key_chain.send_keys(keys)  # only queued here; nothing is sent yet
    time.sleep(1)
    print('Performing Actions!')
    key_chain.perform()

# ASIN Scraping functions 
def scrape_asin_urls(amazon_listing_url, total_pages):
    """Scrape ASINs from every result page of every listing URL.

    One dask task is created per (listing URL, page number) pair and all tasks
    are computed together.

    Args:
        amazon_listing_url: Iterable of listing base URLs.
        total_pages: Number of pages to request per base URL (pages 1..total_pages).

    Returns:
        A DataFrame of the per-page results concatenated, de-duplicated on
        ``asin``, with a ``Product URL`` column built from the ASIN.
    """
    # Flat list of page URLs: outer order by listing, inner order by page number.
    listing_pages = [
        f"{listing}&page={number}"
        for listing in amazon_listing_url
        for number in range(1, total_pages + 1)
    ]

    lazy_frames = [delayed(process_page)(page_url) for page_url in listing_pages]
    per_page_frames = dask.compute(*lazy_frames)

    unique_asins = pd.concat(per_page_frames).drop_duplicates('asin')
    unique_asins['Product URL'] = (
        'https://www.amazon.in/dp/' + unique_asins['asin'].astype(str)
    )
    return unique_asins

def process_page(url):
    """Fetch one listing page and return the ASINs found on it.

    Args:
        url: Full listing-page URL, including the ``&page=N`` suffix.

    Returns:
        The DataFrame produced by ``extract_asin_from_page`` with an extra
        ``listing_url`` column holding ``url`` without its ``&page=`` suffix.
        NOTE(review): ``extract_asin_from_page`` is not defined in the visible
        part of this notebook; confirm it exists in a fresh kernel.

    Raises:
        Any exception from the browser or the parser. The browser is quit
        before the exception propagates.
    """
    print(url)
    browser = init_browser()
    try:
        browser.get(url)
        html = browser.page_source
        delete_cache(browser)
    finally:
        # Quit even when the page load or cache clear fails, otherwise a
        # headless Chrome process is orphaned for every failing page.
        browser.quit()
    time.sleep(random.uniform(0.2, 0.8))

    soup = BeautifulSoup(html, 'html.parser')
    page_asin_df = extract_asin_from_page(soup)
    page_asin_df['listing_url'] = url.split("&page=")[0]  # base URL without the page suffix
    return page_asin_df

def process_base_url(base_url, total_pages):
    """Build a lazy task that scrapes all pages of one listing URL.

    Args:
        base_url: Listing URL without a ``&page=`` suffix.
        total_pages: Number of pages to request (pages 1..total_pages).

    Returns:
        A dask ``Delayed`` which, when computed, concatenates the per-page
        DataFrames returned by ``process_page``.
    """
    # process_page takes a single, fully-formed page URL, so the page number
    # is folded into the URL here rather than passed as a second argument.
    page_tasks = [
        delayed(process_page)(f"{base_url}&page={page}")
        for page in range(1, total_pages + 1)
    ]

    # Concatenation is itself delayed so nothing runs until the caller computes.
    return delayed(pd.concat)(page_tasks)

# Product Detail Scraping functions 
def scrape_product_details(final_collated_urls):
    """Scrape every product page listed in ``final_collated_urls`` via dask.

    Args:
        final_collated_urls: DataFrame with a ``Product URL`` column.

    Returns:
        The per-product DataFrames concatenated, with a constant ``Retailer``
        column set to ``'Amazon'``.
    """
    product_links = final_collated_urls['Product URL'].tolist()
    lazy_products = [delayed(scrape_single_product)(link) for link in product_links]
    scraped_frames = dask.compute(*lazy_products)
    return pd.concat(scraped_frames).assign(Retailer='Amazon')
    
def scrape_single_product(product_url):
    """Scrape one product page into a DataFrame.

    Failures are best-effort: any exception is printed and whatever was
    collected so far is returned.

    Args:
        product_url: URL of the product detail page.

    Returns:
        A DataFrame that always contains ``Product URL`` plus the columns
        added by each ``extract_*`` step that ran before any failure.
    """
    product_df = pd.DataFrame({'Product URL': [product_url]})
    try:
        browser = init_browser()
        try:
            browser.get(product_url)
            html = browser.page_source
            delete_cache(browser)
        finally:
            # The outer handler swallows errors, so without this a failed
            # load would silently leave a Chrome process running per product.
            browser.quit()
        time.sleep(random.uniform(0.5, 1.05))

        soup = BeautifulSoup(html, 'html.parser')
        product_df = extract_basic_info(soup, product_df)
        product_df = extract_pricing_info(soup, product_df)
        product_df = extract_ratings_info(soup, product_df)
        product_df = extract_additional_info(soup, product_df)
        product_df = extract_technical_details(soup, product_df)

    except Exception as e:
        print(f"Error scraping {product_url}: {str(e)}")

    return product_df

def extract_basic_info(soup, df):
    """Add title, ASIN and scrape timestamp columns to ``df``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: DataFrame to add the columns to (modified in place and returned).

    Returns:
        ``df`` with ``Title``, ``SKU Product``, ``Scraping Date`` and
        ``Scraping Time`` set. A field that cannot be read is set to ``''``.
    """
    # `except Exception` keeps the best-effort fallback for missing elements
    # while letting KeyboardInterrupt / SystemExit stop the run.
    try:
        df['Title'] = soup.find("span", {'id': 'productTitle'}).text.strip()
    except Exception:
        df['Title'] = ''

    try:
        df['SKU Product'] = soup.find('input', {'id': 'ASIN'}).get('value')
    except Exception:
        df['SKU Product'] = ''

    # One clock read for both columns so date and time can never disagree
    # when the scrape straddles midnight.
    scraped_at = datetime.now()
    df['Scraping Date'] = scraped_at.date()
    df['Scraping Time'] = scraped_at
    return df

def extract_pricing_info(soup, df):
    """Add selling price, MRP and discount columns to ``df``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: DataFrame to add the columns to (modified in place and returned).

    Returns:
        ``df`` with ``Selling Price`` and ``MRP`` as floats and ``Discount``
        as text with ``-`` removed. A field that cannot be read or parsed is
        set to ``''``.
    """
    # `except Exception` keeps the best-effort fallback (missing element,
    # unparsable number) while letting KeyboardInterrupt / SystemExit through.
    try:
        price = soup.find('span', {'class': 'a-price-whole'}).text.strip()
        # Drop everything except digits and dots (currency symbol, commas).
        df['Selling Price'] = float(re.sub(r'[^\d.]', '', price))
    except Exception:
        df['Selling Price'] = ''

    try:
        mrp = soup.find("span", class_="a-price a-text-price").find("span", class_="a-offscreen").text
        df['MRP'] = float(re.sub(r'[^\d.]', '', mrp))
    except Exception:
        df['MRP'] = ''

    try:
        df['Discount'] = soup.find("span", class_="a-size-large a-color-price").text.strip().replace("-", "")
    except Exception:
        df['Discount'] = ''

    return df

def extract_ratings_info(soup, df):
    """Add rating count and average rating columns to ``df``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: DataFrame to add the columns to (modified in place and returned).

    Returns:
        ``df`` with ``no_ratings`` (stripped text of the review-count span)
        and ``avg_rating`` (float parsed from the first whitespace-separated
        token of the histogram span). A field that cannot be read or parsed
        is set to ``''``.
    """
    # `except Exception` keeps the best-effort fallback while letting
    # KeyboardInterrupt / SystemExit stop the run.
    try:
        df['no_ratings'] = soup.find("span", {'id': 'acrCustomerReviewText'}).text.strip()
    except Exception:
        df['no_ratings'] = ''

    try:
        df['avg_rating'] = float(soup.find("span", {'class': 'reviewCountTextLinkedHistogram'}).text.split()[0])
    except Exception:
        df['avg_rating'] = ''

    return df

def extract_additional_info(soup, df):
    """Add stock status and seller name columns to ``df``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: DataFrame to add the columns to (modified in place and returned).

    Returns:
        ``df`` with ``Stock Status`` and ``Seller`` set to the stripped text
        of their elements. A field that cannot be read is set to ``''``.
    """
    # `except Exception` keeps the best-effort fallback while letting
    # KeyboardInterrupt / SystemExit stop the run.
    try:
        df['Stock Status'] = soup.find("div", {'id': 'availabilityInsideBuyBox_feature_div'}).text.strip()
    except Exception:
        df['Stock Status'] = ''

    try:
        df['Seller'] = soup.find("a", {'id': 'sellerProfileTriggerId'}).text.strip()
    except Exception:
        df['Seller'] = ''

    return df

def _ffill_product_columns(frame):
    """Forward-fill every column except the two attribute columns."""
    product_cols = [
        col for col in frame.columns
        if col not in ("Attribute_Name", "Attribute_Value")
    ]
    if product_cols:
        # .ffill() replaces fillna(method="ffill"), deprecated since pandas 2.1.
        frame[product_cols] = frame[product_cols].ffill()
    return frame


def _append_spec_table(soup, df, table_id):
    """Append the th/td rows of the table with id ``table_id`` to ``df`` as attribute rows."""
    table = soup.find("table", {"id": table_id})
    # Collect plain records first and build one DataFrame, instead of
    # concatenating a one-row frame per table row.
    records = [
        {
            "Attribute_Name": tr.find("th").text.strip(),
            "Attribute_Value": tr.find("td").text.strip(),
        }
        for tr in table.find_all("tr")
    ]
    if not records:
        return df
    combined = pd.concat([df, pd.DataFrame(records)], ignore_index=True)
    return _ffill_product_columns(combined)


def extract_technical_details(soup, df):
    """Expand ``df`` into one row per product attribute found on the page.

    Three sources are read in order: the overview table inside
    ``productOverview_feature_div`` (td/td rows, joined column-wise), then the
    ``productDetails_detailBullets_sections1`` and
    ``productDetails_techSpec_section_1`` tables (th/td rows, appended as new
    rows). After each source the non-attribute columns are forward-filled so
    every attribute row carries the product-level values.

    Each source is best-effort: a failure is printed and ``df`` is left
    exactly as it was before that source was attempted.

    Args:
        soup: Parsed product page exposing BeautifulSoup-style ``find`` / ``find_all``.
        df: Product-level DataFrame built by the earlier ``extract_*`` steps.

    Returns:
        A DataFrame with ``Attribute_Name`` / ``Attribute_Value`` columns added
        when at least one source was read, otherwise ``df`` unchanged.
    """
    try:
        overview = soup.find("div", {"id": "productOverview_feature_div"}).find(
            "table", class_="a-normal a-spacing-micro"
        )
        # Strip while collecting; this replaces the DataFrame.applymap pass,
        # which is deprecated since pandas 2.1.
        rows = [
            [td.text.strip() for td in tr.find_all("td")]
            for tr in overview.find_all("tr")
        ]
        table_df = pd.DataFrame(rows)
        table_df.columns = ["Attribute_Name", "Attribute_Value"]
        # Only rebind df once the whole step succeeded, so a failure cannot
        # leave it with unfilled NaN product columns.
        df = _ffill_product_columns(pd.concat([df, table_df], axis=1))
    except Exception as e:
        print(f"Error extracting from productOverview_feature_div: {e}")

    try:
        df = _append_spec_table(soup, df, "productDetails_detailBullets_sections1")
    except Exception as e:
        print(f"Error extracting from productDetails_detailBullets_sections1: {e}")

    try:
        df = _append_spec_table(soup, df, "productDetails_techSpec_section_1")
    except Exception as e:
        print(f"Error extracting from productDetails_techSpec_section_1: {e}")

    return df

In [111]:
# Collect the ASINs from every configured listing page, then preview the first rows.
final_collated_urls = scrape_asin_urls(
    amazon_listing_url=amazon_listing_url,
    total_pages=total_pages,
)
final_collated_urls.head(5)

Unnamed: 0,asin,product default order number,listing_url,Product URL
0,B0BK1KS6ZD,2,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0BK1KS6ZD
1,B0DS2DX5ZP,3,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0DS2DX5ZP
2,B0CWVDXYX1,4,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0CWVDXYX1
3,B0DQQ4XDBB,6,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0DQQ4XDBB
4,B09R4RYCJ4,7,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B09R4RYCJ4


In [112]:
# Product Detail Scraping
final_scrapped_df = scrape_product_details(final_collated_urls)
final_scrapped_df.head()

Unnamed: 0,Product URL,Title,SKU Product,Scraping Date,Scraping Time,Selling Price,MRP,Discount,no_ratings,avg_rating,Stock Status,Seller,Attribute_Name,Attribute_Value,Retailer
0,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-03-01,2025-03-01 19:30:11.442388,36990.0,58400.0,,"3,954 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,Brand,Daikin,Amazon
1,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-03-01,2025-03-01 19:30:11.442388,36990.0,58400.0,,"3,954 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,Capacity,1.5 Tons,Amazon
2,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-03-01,2025-03-01 19:30:11.442388,36990.0,58400.0,,"3,954 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,Cooling Power,17100 British Thermal Units,Amazon
3,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-03-01,2025-03-01 19:30:11.442388,36990.0,58400.0,,"3,954 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,Special Feature,"High Ambient Operation upto 52°C, 3D Airflow, ...",Amazon
4,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-03-01,2025-03-01 19:30:11.442388,36990.0,58400.0,,"3,954 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,Product Dimensions,22.9D x 88.5W x 29.8H Centimeters,Amazon
