In [5]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import time
from datetime import date, datetime
import numpy as np
import random
import decimal
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


In [None]:
# Config: the listing pages to scrape (one row per category) and the number
# of result pages to walk for each listing.
amazon_listing_url = pd.DataFrame(
    {
        'category': ['Air Conditioner'],
        'url': ['https://www.amazon.in/s?rh=n%3A3474656031&fs=true'],
    }
)
total_pages = 2

In [51]:

def init_browser():
    """Create a Chrome driver configured for unattended scraping.

    The driver is started headless and incognito, with the
    ``AutomationControlled`` blink feature disabled and the
    ``enable-automation`` switch excluded.

    Returns:
        The ``Chrome`` instance built from those options.
    """
    chrome_options = ChromeOptions()
    for flag in (
        "--headless",
        "--incognito",
        "start-maximized",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    return Chrome(options=chrome_options)

def delete_cache(driver):
    """Clear the browser cache from a throw-away tab, then return to the caller's tab.

    A blank tab is opened and focused, ``Network.clearBrowserCache`` is sent
    through the DevTools protocol, and the tab is closed again.

    Args:
        driver: A Chromium-based Selenium driver (must support
            ``execute_cdp_cmd``).

    Raises:
        Whatever ``execute_cdp_cmd`` raises. The helper tab is still closed
        and focus is still restored before the exception propagates.
    """
    # Remember where the caller was instead of assuming it was the first tab.
    original_handle = driver.current_window_handle
    driver.execute_script("window.open('')")  # Helper tab, separate from the main one
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.execute_cdp_cmd("Network.clearBrowserCache", {})
    finally:
        # Always tidy up, otherwise a failed CDP call leaves the driver
        # focused on a stray blank tab.
        driver.close()
        driver.switch_to.window(original_handle)

def perform_actions(driver, keys):
    """Queue ``keys`` on an action chain, wait one second, then send them.

    Args:
        driver: Selenium driver the action chain is bound to.
        keys: Key sequence handed to ``ActionChains.send_keys``.
    """
    key_chain = ActionChains(driver)
    key_chain.send_keys(keys)
    time.sleep(1)  # pause before the queued keys are dispatched
    print('Performing Actions!')
    key_chain.perform()

# ASIN Scraping functions
def scrape_asin_urls(amazon_listing_url, total_pages):
    """Collect the unique ASINs found on the given listing pages.

    Args:
        amazon_listing_url: Iterable of listing base URLs (strings). Pass a
            list, not a DataFrame: iterating a DataFrame yields its column
            labels.
        total_pages: Number of result pages to walk per base URL.

    Returns:
        DataFrame with one row per distinct ASIN (first occurrence wins) and
        a ``'Product URL'`` column pointing at the product detail page. When
        nothing was found, an empty frame carrying the expected columns is
        returned so callers can still select ``'Product URL'``.
    """
    listing_frames = [
        process_base_url(base_url, total_pages) for base_url in amazon_listing_url
    ]
    # Empty frames may lack the 'asin' column, which would make
    # drop_duplicates('asin') raise KeyError; drop them up front.
    listing_frames = [frame for frame in listing_frames if not frame.empty]
    if not listing_frames:
        return pd.DataFrame(
            columns=['asin', 'product default order number', 'listing_url', 'Product URL']
        )

    # Concatenate once, then de-duplicate once (keeps the first occurrence,
    # same result as de-duplicating after every listing).
    final_collated_urls = pd.concat(listing_frames).drop_duplicates('asin')
    final_collated_urls['Product URL'] = (
        'https://www.amazon.in/dp/' + final_collated_urls['asin'].astype(str)
    )
    return final_collated_urls

def process_base_url(base_url, total_pages):
    """Scrape the ASINs from every result page of one listing URL.

    A fresh browser is started for each page and is always shut down again,
    even when loading the page fails.

    Args:
        base_url: Listing URL without the ``page`` parameter.
        total_pages: Number of result pages to fetch, starting at page 1.

    Returns:
        DataFrame with columns ``'asin'``, ``'product default order number'``
        and ``'listing_url'`` and a fresh 0..n-1 index. It is empty, but
        still carries those columns, when no ASIN was found.

    Raises:
        Any exception raised while starting the browser or loading a page.
    """
    # Works for base URLs with and without an existing query string.
    separator = "&" if "?" in base_url else "?"
    page_frames = []

    for page in range(1, total_pages + 1):
        url = f"{base_url}{separator}page={page}"
        print(url)
        browser = init_browser()
        try:
            browser.get(url)
            html = browser.page_source
            delete_cache(browser)
        finally:
            # Never leave a Chrome process behind if the page load blows up.
            browser.quit()
        time.sleep(random.uniform(0.2, 0.8))

        page_asin_df = extract_asin_from_page(BeautifulSoup(html, 'html.parser'))
        if page_asin_df.empty:
            continue
        page_asin_df['listing_url'] = base_url
        page_frames.append(page_asin_df)

    if not page_frames:
        return pd.DataFrame(columns=['asin', 'product default order number', 'listing_url'])
    # Single concat instead of growing a frame page by page.
    return pd.concat(page_frames).reset_index(drop=True)

def extract_asin_from_page(soup):
    """Collect ASIN / position pairs from the ``div`` elements of a listing page.

    Only ``div`` elements whose ``data-asin`` and ``data-index`` attributes
    are both present and non-empty are kept.

    Args:
        soup: Parsed page; must provide ``find_all("div")`` returning
            elements with a ``get(attribute)`` method.

    Returns:
        DataFrame with columns ``'asin'`` and
        ``'product default order number'``. The columns are present even when
        the page yields no match, so callers can always select ``'asin'``.
    """
    product_rows = []
    for div in soup.find_all("div"):
        if (data_asin := div.get("data-asin")) and (data_index := div.get("data-index")):
            product_rows.append({'asin': data_asin, 'product default order number': data_index})
    # Pin the columns so an empty page does not produce a column-less frame.
    return pd.DataFrame(product_rows, columns=['asin', 'product default order number'])

# Product Detail Scraping functions
def scrape_product_details(final_collated_urls):
    """Scrape the detail page of every product listed in ``final_collated_urls``.

    Args:
        final_collated_urls: DataFrame with a ``'Product URL'`` column.

    Returns:
        DataFrame stacking the per-product frames, each tagged with
        ``Retailer == 'Amazon'``. The per-product row labels are kept as they
        are, so index values repeat across products. An empty DataFrame is
        returned when there are no URLs.
    """
    product_frames = []
    for product_url in final_collated_urls['Product URL'].tolist():
        product_df = scrape_single_product(product_url)
        product_df['Retailer'] = 'Amazon'
        product_frames.append(product_df)

    if not product_frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame for
    # every product.
    return pd.concat(product_frames)

def scrape_single_product(product_url):
    """Scrape one product page into a DataFrame.

    Scraping is best-effort: any exception is reported on stdout and the
    frame built so far is returned (at minimum the ``'Product URL'`` column).

    Args:
        product_url: URL of the product detail page.

    Returns:
        DataFrame starting with a ``'Product URL'`` column, extended by the
        ``extract_*`` helpers when the page could be loaded.
    """
    product_df = pd.DataFrame({'Product URL': [product_url]})
    try:
        browser = init_browser()
        try:
            browser.get(product_url)
            html = browser.page_source
            delete_cache(browser)
        finally:
            # The outer handler swallows errors and the caller moves on to
            # the next product, so an unclosed browser here would pile up
            # one Chrome process per failed page.
            browser.quit()
        time.sleep(random.uniform(0.5, 1.05))

        soup = BeautifulSoup(html, 'html.parser')
        product_df = extract_basic_info(soup, product_df)
        product_df = extract_pricing_info(soup, product_df)
        product_df = extract_ratings_info(soup, product_df)
        product_df = extract_additional_info(soup, product_df)
        product_df = extract_technical_details(soup, product_df)

    except Exception as e:
        print(f"Error scraping {product_url}: {str(e)}")

    return product_df

def extract_basic_info(soup, df):
    """Add title, ASIN and scrape timestamps to ``df``.

    Args:
        soup: Parsed product page.
        df: Frame to extend; it is modified in place and returned.

    Returns:
        ``df`` with the columns ``'Title'``, ``'SKU Product'``,
        ``'Scraping Date'`` and ``'Scraping Time'``. A missing element
        yields an empty string.
    """
    # soup.find returns None for an absent element, so the attribute access
    # raises AttributeError. Catch only that, so Ctrl-C and unrelated errors
    # are not silently swallowed.
    try:
        df['Title'] = soup.find("span", {'id': 'productTitle'}).text.strip()
    except AttributeError:
        df['Title'] = ''

    try:
        df['SKU Product'] = soup.find('input', {'id': 'ASIN'}).get('value')
    except AttributeError:
        df['SKU Product'] = ''

    # Read the clock once so date and time can never disagree around midnight.
    scraped_at = datetime.now()
    df['Scraping Date'] = scraped_at.date()
    df['Scraping Time'] = scraped_at
    return df

def extract_pricing_info(soup, df):
    """Add selling price, MRP and discount to ``df``.

    Args:
        soup: Parsed product page.
        df: Frame to extend; it is modified in place and returned.

    Returns:
        ``df`` with the columns ``'Selling Price'`` and ``'MRP'`` (floats
        parsed from the digits and dots of the element text) and
        ``'Discount'`` (text with ``-`` removed). A field that is missing or
        cannot be parsed is set to an empty string.
    """
    # AttributeError: element not on the page (soup.find returned None).
    # ValueError: the text left nothing float() can parse.
    # Anything else (including KeyboardInterrupt) is allowed to propagate.
    try:
        price = soup.find('span', {'class': 'a-price-whole'}).text.strip()
        df['Selling Price'] = float(re.sub(r'[^\d.]', '', price))
    except (AttributeError, ValueError):
        df['Selling Price'] = ''

    try:
        mrp = soup.find("span", class_="a-price a-text-price").find("span", class_="a-offscreen").text
        df['MRP'] = float(re.sub(r'[^\d.]', '', mrp))
    except (AttributeError, ValueError):
        df['MRP'] = ''

    try:
        df['Discount'] = soup.find("span", class_="a-size-large a-color-price").text.strip().replace("-", "")
    except AttributeError:
        df['Discount'] = ''

    return df

def extract_ratings_info(soup, df):
    """Add the ratings count text and the average rating to ``df``.

    Args:
        soup: Parsed product page.
        df: Frame to extend; it is modified in place and returned.

    Returns:
        ``df`` with ``'no_ratings'`` (stripped element text) and
        ``'avg_rating'`` (float parsed from the first whitespace-separated
        token). A field that is missing or cannot be parsed is set to an
        empty string.
    """
    try:
        df['no_ratings'] = soup.find("span", {'id': 'acrCustomerReviewText'}).text.strip()
    except AttributeError:  # element not on the page
        df['no_ratings'] = ''

    # IndexError: blank text has no first token.
    # ValueError: the first token is not a number.
    try:
        df['avg_rating'] = float(soup.find("span", {'class': 'reviewCountTextLinkedHistogram'}).text.split()[0])
    except (AttributeError, IndexError, ValueError):
        df['avg_rating'] = ''

    return df

def extract_additional_info(soup, df):
    """Add stock status and seller name to ``df``.

    Args:
        soup: Parsed product page.
        df: Frame to extend; it is modified in place and returned.

    Returns:
        ``df`` with the columns ``'Stock Status'`` and ``'Seller'`` holding
        the stripped element text, or an empty string when the element is
        not on the page.
    """
    # Only a missing element (soup.find -> None -> AttributeError) counts as
    # "no value"; other exceptions, including KeyboardInterrupt, propagate.
    try:
        df['Stock Status'] = soup.find("div", {'id': 'availabilityInsideBuyBox_feature_div'}).text.strip()
    except AttributeError:
        df['Stock Status'] = ''

    try:
        df['Seller'] = soup.find("a", {'id': 'sellerProfileTriggerId'}).text.strip()
    except AttributeError:
        df['Seller'] = ''

    return df

def extract_technical_details(soup, df):
    """Attach attribute name/value pairs from the product detail tables.

    Three containers are probed by ``div`` id. Every ``<tr>`` of a matching
    table contributes its ``<th>``/``<td>`` cells as name/value pairs. The
    pairs of all matching tables are stacked into a single
    ``'Attribute_Name'`` / ``'Attribute_Value'`` column pair, so the result
    never has duplicate column labels, and the remaining columns of ``df``
    are forward-filled onto the additional rows.

    Args:
        soup: Parsed product page.
        df: Frame holding the product-level columns.

    Returns:
        ``df`` unchanged when none of the tables is present; otherwise a new
        frame with the two attribute columns appended.
    """
    tables = [
        ('productOverview_feature_div', 'a-normal a-spacing-micro'),
        ('productDetails_detailBullets_sections1', None),
        ('productDetails_techSpec_section_1', None)
    ]

    attribute_rows = []
    found_table = False
    for table_id, table_class in tables:
        try:
            table = soup.find('div', {'id': table_id}).find('table', class_=table_class)
            rows = table.find_all('tr')
        except AttributeError:
            # Container div or its table is not on this page.
            continue
        found_table = True
        # NOTE(review): rows without <th> cells contribute nothing because
        # zip() stops at the shorter list - confirm the targeted tables
        # really use th/td pairs.
        attribute_rows.extend(
            [th.text.strip(), td.text.strip()]
            for tr in rows
            for th, td in zip(tr.find_all('th'), tr.find_all('td'))
        )

    if not found_table:
        return df

    attributes_df = pd.DataFrame(attribute_rows, columns=['Attribute_Name', 'Attribute_Value'])
    # A single column-wise concat keeps the column labels unique; ffill
    # repeats the product-level values on the extra attribute rows.
    return pd.concat([df, attributes_df], axis=1).ffill()



In [None]:
# Full pipeline: collect ASINs, then scrape every product page.
# `amazon_listing_url` and `total_pages` come from the config cell.

# ASIN Scraping
# scrape_asin_urls iterates over its first argument, so hand it the list of
# URLs. Iterating the config DataFrame itself would yield its column labels
# ('category', 'url') instead of the listing URLs.
listing_urls = amazon_listing_url['url'].unique().tolist()
final_collated_urls = scrape_asin_urls(listing_urls, total_pages)
final_collated_urls.to_csv('product_urls_amazon.csv', index=False)

# Product Detail Scraping
final_scrapped_df = scrape_product_details(final_collated_urls)
final_scrapped_df.to_csv('amazon_scrapped_data.csv', index=False)


In [38]:
# Inspect the listing configuration (columns: category, url).
amazon_listing_url

Unnamed: 0,category,url
0,Air Conditioner,https://www.amazon.in/s?rh=n%3A3474656031&fs=true


In [47]:
# ASIN Scraping
# Bind the URL list to its own name instead of overwriting the config
# DataFrame: the cell can then be re-run, and `amazon_listing_url` keeps
# meaning the same thing everywhere in the notebook.
listing_urls = amazon_listing_url['url'].unique().tolist()
final_collated_urls = scrape_asin_urls(listing_urls, total_pages)
#final_collated_urls.to_csv('product_urls_amazon.csv', index=False)

https://www.amazon.in/s?rh=n%3A3474656031&fs=true&&page=1
https://www.amazon.in/s?rh=n%3A3474656031&fs=true&&page=2


In [49]:
# Preview the first rows of the collected ASIN / product URL table.
final_collated_urls.head()

Unnamed: 0,asin,product default order number,listing_url,Product URL
0,B0BK1KS6ZD,2,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0BK1KS6ZD
1,B0DS2DX5ZP,3,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0DS2DX5ZP
2,B0DQQ4XDBB,4,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0DQQ4XDBB
3,B0CWVDXYX1,6,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0CWVDXYX1
4,B0DRG7M72Z,7,https://www.amazon.in/s?rh=n%3A3474656031&fs=true,https://www.amazon.in/dp/B0DRG7M72Z


In [55]:
# Product Detail Scraping: visit every collected product URL and gather the
# per-product frames into one DataFrame (CSV export left disabled below).
product_details_scrapped_df = scrape_product_details(final_collated_urls)
#product_details_scrapped_df.to_csv('amazon_scrapped_data.csv', index=False)


In [54]:
# Show up to the last 100 scraped product rows.
product_details_scrapped_df.tail(n=100)

Unnamed: 0,Product URL,Title,SKU Product,Scraping Date,Scraping Time,Selling Price,MRP,Discount,no_ratings,avg_rating,Stock Status,Seller,Attribute_Name,Attribute_Value,Retailer
0,https://www.amazon.in/dp/B0BK1KS6ZD,Daikin 1.5 Ton 3 Star Inverter Split AC (Coppe...,B0BK1KS6ZD,2025-02-28,2025-02-28 23:51:11.993404,36990.0,58400.0,,"3,948 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DS2DX5ZP,VICARI-Portable-Air-Conditioner-Small-Ac-Quait...,B0DS2DX5ZP,2025-02-28,2025-02-28 23:51:57.451626,699.0,1299.0,,22 ratings,3.2,In stock,MI enterprise's,,,Amazon
0,https://www.amazon.in/dp/B0DQQ4XDBB,LG 1.5 Ton 3 Star DUAL Inverter Split AC (Copp...,B0DQQ4XDBB,2025-02-28,2025-02-28 23:52:54.681830,37690.0,78990.0,,960 ratings,4.0,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0CWVDXYX1,"Voltas 1.5 ton 3 Star, Inverter Split AC (Copp...",B0CWVDXYX1,2025-02-28,2025-02-28 23:53:43.147620,33990.0,,,"3,140 ratings",3.8,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DRG7M72Z,LG 1.5 Ton 5 Star DUAL Inverter Split AC (Copp...,B0DRG7M72Z,2025-02-28,2025-02-28 23:54:37.725124,46990.0,85990.0,,"1,316 ratings",4.0,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DSWR3DJ3,Carrier 1.5 Ton 3 Star Wi-Fi Smart Flexicool I...,B0DSWR3DJ3,2025-02-28,2025-02-28 23:55:34.247990,35490.0,68790.0,,"1,563 ratings",4.1,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B09R4RYCJ4,Daikin 1.5 Ton 5 Star Inverter Split AC (Coppe...,B09R4RYCJ4,2025-02-28,2025-02-28 23:56:25.091835,45490.0,67200.0,,"2,659 ratings",3.9,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DP5N2VTR,"Blue Star 1.5 Ton 5 Star, 60 Months Warranty, ...",B0DP5N2VTR,2025-02-28,2025-02-28 23:57:09.825210,43990.0,75000.0,,558 ratings,4.0,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DSVRY811,Panasonic 1.5 Ton 5 Star Premium Wi-Fi Inverte...,B0DSVRY811,2025-02-28,2025-02-28 23:57:55.610196,44990.0,64400.0,,"3,902 ratings",4.0,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
0,https://www.amazon.in/dp/B0DSXJFF5H,Carrier 1.5 Ton 5 Star Wi-Fi Smart Flexicool I...,B0DSXJFF5H,2025-02-28,2025-02-28 23:58:44.448813,42990.0,76090.0,,"1,661 ratings",4.1,In stock,DAWNTECH ELECTRONICS PRIVATE LIMITED,,,Amazon
