# In [None]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import time
from datetime import date, datetime
import numpy as np
import random
import decimal
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Configuration and utility functions
def setup_config(work_dir="D:/projects/scraping_code_send_adhoc/", total_pages=1):
    """Switch to the working directory and return the page budget.

    Args:
        work_dir: Directory to make current. The CSV files read and written
            elsewhere in this file use relative names, so they resolve
            against it. Defaults to the original hard-coded project folder.
        total_pages: Number of listing pages to scrape per landing URL.

    Returns:
        ``total_pages`` unchanged.

    Raises:
        OSError: If ``work_dir`` does not exist or cannot be entered.
    """
    os.chdir(work_dir)
    return total_pages

def load_amazon_meta_urls():
    """Read ``amazon_meta_urls.csv`` from the current directory into a DataFrame."""
    csv_name = 'amazon_meta_urls.csv'
    listing_frame = pd.read_csv(csv_name, sep=',')
    return listing_frame

def init_browser():
    """Create a Chrome driver from a fixed set of launch options.

    The options add the headless, incognito and start-maximized arguments,
    disable the ``AutomationControlled`` blink feature and exclude the
    ``enable-automation`` switch.
    """
    chrome_options = ChromeOptions()
    launch_arguments = (
        "--headless",
        "--incognito",
        "start-maximized",
        "--disable-blink-features=AutomationControlled",
    )
    for argument in launch_arguments:
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

    driver = Chrome(options=chrome_options)
    return driver

def delete_cache(driver):
    """Clear browsing data through Chrome's settings page in a temporary tab.

    Opens a new tab, loads ``chrome://settings/clearBrowserData``, sends a
    fixed keyboard sequence through ``perform_actions``, closes that tab and
    switches back to the first window handle.

    Args:
        driver: Selenium WebDriver instance to operate on.
    """
    # Open a blank tab and switch to the last window handle
    # (presumably the tab that was just opened).
    driver.execute_script("window.open('')")
    driver.switch_to.window(driver.window_handles[-1])
    driver.get('chrome://settings/clearBrowserData')
    # NOTE(review): this TAB/DOWN/TAB/ENTER sequence presumably walks the
    # dialog to its confirm button; it depends on the dialog layout of the
    # installed Chrome version -- confirm it still lands on the right control.
    perform_actions(driver, Keys.TAB * 2 + Keys.DOWN * 4 + Keys.TAB * 5 + Keys.ENTER)
    # Close the temporary tab and return to the first window handle.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

def perform_actions(driver, keys):
    """Queue *keys* on an action chain, wait 1.5 seconds, then perform it."""
    # send_keys returns the chain itself, so the queued chain can be kept
    # in a single name and performed after the pause.
    queued_chain = ActionChains(driver).send_keys(keys)
    time.sleep(1.5)
    queued_chain.perform()

# ASIN Scraping functions
def scrape_asin_urls(amazon_listing_url, total_pages):
    """Collect unique ASINs for every landing page URL and attach listing metadata.

    Args:
        amazon_listing_url: DataFrame with a ``'landing page url'`` column;
            its other columns are merged onto the result.
        total_pages: Number of result pages to scrape per landing URL.

    Returns:
        DataFrame with one row per unique ASIN (first occurrence kept),
        a ``'Product URL'`` column built from the ASIN, and the columns of
        ``amazon_listing_url`` left-joined on the landing URL. If no ASIN
        was found at all, the frame has the same columns and zero rows.
    """
    landing_urls = amazon_listing_url['landing page url'].unique().tolist()
    per_url_frames = [process_base_url(base_url, total_pages) for base_url in landing_urls]

    # Frames with no rows carry no 'asin' column, which used to make
    # drop_duplicates('asin') raise KeyError; leave them out of the concat.
    populated = [frame for frame in per_url_frames if not frame.empty]
    if populated:
        final_collated_urls = pd.concat(populated, ignore_index=True).drop_duplicates('asin')
    else:
        final_collated_urls = pd.DataFrame(
            columns=['asin', 'product default order number', 'listing_url']
        )

    final_collated_urls['Product URL'] = (
        'https://www.amazon.in/dp/' + final_collated_urls['asin'].astype(str)
    )
    final_collated_urls = pd.merge(
        final_collated_urls,
        amazon_listing_url,
        left_on=['listing_url'],
        right_on=['landing page url'],
        how='left',
    )
    return final_collated_urls

def process_base_url(base_url, total_pages):
    """Scrape ASINs from pages ``1..total_pages`` of one listing URL.

    A new browser is started for every page and is always quit afterwards,
    including when navigation or cache clearing raises.

    Args:
        base_url: Listing URL; the page number is appended as a ``page``
            query parameter.
        total_pages: Number of pages to fetch.

    Returns:
        DataFrame of the ASIN rows from all pages with a ``'listing_url'``
        column set to ``base_url`` and a fresh 0..n-1 index. Empty when
        ``total_pages`` is less than 1.
    """
    # Use '?' when the URL has no query string yet; '&' would not start one.
    separator = '&' if '?' in base_url else '?'
    page_frames = []

    for page in range(1, total_pages + 1):
        url = f"{base_url}{separator}page={page}"
        browser = init_browser()
        try:
            browser.get(url)
            html = browser.page_source
            delete_cache(browser)
        finally:
            # Without this, a failed page load leaves a Chrome process behind.
            browser.quit()
        time.sleep(random.uniform(0.5, 1.05))

        soup = BeautifulSoup(html, 'html.parser')
        page_asin_df = extract_asin_from_page(soup)
        page_asin_df['listing_url'] = base_url
        page_frames.append(page_asin_df)

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames).reset_index(drop=True)

def extract_asin_from_page(soup):
    """Build a DataFrame of ASIN / result position from the page's ``div`` tags.

    Only ``div`` elements whose ``data-asin`` and ``data-index`` attributes
    are both present and non-empty contribute a row.
    """
    rows = []
    for tag in soup.find_all("div"):
        asin = tag.get("data-asin")
        if not asin:
            continue
        position = tag.get("data-index")
        if not position:
            continue
        rows.append({'asin': asin, 'product default order number': position})
    return pd.DataFrame(rows)

# Product Detail Scraping functions
def scrape_product_details(final_collated_urls):
    """Scrape every product URL and stack the per-product frames.

    Args:
        final_collated_urls: DataFrame with ``'Product URL'`` and
            ``'Category'`` columns.

    Returns:
        Concatenation of the frames returned by ``scrape_single_product``,
        each tagged with its ``'Category'`` and ``'Retailer'`` = ``'Amazon'``.
        An empty DataFrame when there are no URLs.
    """
    product_urls = final_collated_urls['Product URL'].tolist()
    if not product_urls:
        return pd.DataFrame()

    # Category of the first row for each URL -- the same value a per-URL
    # boolean-mask lookup returns, but computed once instead of per product.
    first_rows = final_collated_urls.drop_duplicates('Product URL')
    category_by_url = dict(zip(first_rows['Product URL'], first_rows['Category']))

    product_frames = []
    for product_url in product_urls:
        product_df = scrape_single_product(product_url)
        product_df['Category'] = category_by_url[product_url]
        product_df['Retailer'] = 'Amazon'
        product_frames.append(product_df)

    # One concat at the end avoids re-copying the accumulated rows each time.
    return pd.concat(product_frames)

def scrape_single_product(product_url):
    """Scrape one product page into a DataFrame.

    Args:
        product_url: URL of the product page.

    Returns:
        DataFrame starting with a ``'Product URL'`` column, extended by the
        ``extract_*`` helpers. Any exception is printed and whatever was
        collected so far is returned (at minimum the URL row).
    """
    product_df = pd.DataFrame({'Product URL': [product_url]})
    try:
        browser = init_browser()
        try:
            browser.get(product_url)
            html = browser.page_source
            delete_cache(browser)
        finally:
            # Always release the Chrome process, even if the page load or
            # cache clearing failed.
            browser.quit()
        time.sleep(random.uniform(0.5, 1.05))

        soup = BeautifulSoup(html, 'html.parser')
        product_df = extract_basic_info(soup, product_df)
        product_df = extract_pricing_info(soup, product_df)
        product_df = extract_ratings_info(soup, product_df)
        product_df = extract_additional_info(soup, product_df)
        product_df = extract_technical_details(soup, product_df)

    except Exception as e:
        # Best-effort boundary: one bad product must not stop the whole run.
        print(f"Error scraping {product_url}: {str(e)}")

    return product_df

def extract_basic_info(soup, df):
    """Add title, ASIN and scrape timestamps to *df*.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: Product DataFrame; it is modified in place.

    Returns:
        The same ``df`` with ``'Title'``, ``'SKU Product'``,
        ``'Scraping Date'`` and ``'Scraping Time'`` set. A page element
        that is not found yields ``''`` for its column.
    """
    # A missing element makes ``find`` return None, so the attribute access
    # raises AttributeError; only that is treated as "not on the page".
    try:
        df['Title'] = soup.find("span", {'id': 'productTitle'}).text.strip()
    except AttributeError:
        df['Title'] = ''

    try:
        df['SKU Product'] = soup.find('input', {'id': 'ASIN'}).get('value')
    except AttributeError:
        df['SKU Product'] = ''

    df['Scraping Date'] = date.today()
    df['Scraping Time'] = datetime.now()
    return df

def extract_pricing_info(soup, df):
    """Add selling price, MRP and discount to *df*.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: Product DataFrame; it is modified in place.

    Returns:
        The same ``df`` with ``'Selling Price'`` and ``'MRP'`` as floats and
        ``'Discount'`` as text with ``-`` removed. A value whose element is
        missing, or whose text does not parse as a number, is ``''``.
    """
    # AttributeError: element not found (``find`` returned None).
    # ValueError: nothing numeric left after stripping non-digit characters.
    try:
        price = soup.find('span', {'class': 'a-price-whole'}).text.strip()
        df['Selling Price'] = float(re.sub(r'[^\d.]', '', price))
    except (AttributeError, ValueError):
        df['Selling Price'] = ''

    try:
        mrp = soup.find("span", class_="a-price a-text-price").find("span", class_="a-offscreen").text
        df['MRP'] = float(re.sub(r'[^\d.]', '', mrp))
    except (AttributeError, ValueError):
        df['MRP'] = ''

    try:
        df['Discount'] = soup.find("span", class_="a-size-large a-color-price").text.strip().replace("-", "")
    except AttributeError:
        df['Discount'] = ''

    return df

def extract_ratings_info(soup, df):
    """Add the ratings count text and the average rating to *df*.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: Product DataFrame; it is modified in place.

    Returns:
        The same ``df`` with ``'no_ratings'`` (stripped text) and
        ``'avg_rating'`` (float parsed from the first whitespace-separated
        token). A missing element or an unparseable rating yields ``''``.
    """
    try:
        df['no_ratings'] = soup.find("span", {'id': 'acrCustomerReviewText'}).text.strip()
    except AttributeError:
        df['no_ratings'] = ''

    # AttributeError: element not found; IndexError: element text is blank;
    # ValueError: first token is not a number.
    try:
        rating_text = soup.find("span", {'class': 'reviewCountTextLinkedHistogram'}).text
        df['avg_rating'] = float(rating_text.split()[0])
    except (AttributeError, IndexError, ValueError):
        df['avg_rating'] = ''

    return df

def extract_additional_info(soup, df):
    """Add stock status and seller name to *df*.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: Product DataFrame; it is modified in place.

    Returns:
        The same ``df`` with ``'Stock Status'`` and ``'Seller'`` set to the
        stripped element text, or ``''`` when the element is not found.
    """
    # ``find`` returns None for an absent element, so ``.text`` raises
    # AttributeError; only that case falls back to the empty string.
    try:
        df['Stock Status'] = soup.find("div", {'id': 'availabilityInsideBuyBox_feature_div'}).text.strip()
    except AttributeError:
        df['Stock Status'] = ''

    try:
        df['Seller'] = soup.find("a", {'id': 'sellerProfileTriggerId'}).text.strip()
    except AttributeError:
        df['Seller'] = ''

    return df

def extract_technical_details(soup, df):
    """Append attribute name/value pairs from the product detail tables.

    Rows from every matching table are stacked into a single
    ``'Attribute_Name'`` / ``'Attribute_Value'`` column pair, so the result
    has one row per attribute with the product columns of *df* repeated.
    Adding one column pair per table would create duplicate column labels
    whenever more than one table matched.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.
        df: Product DataFrame (one row of product-level columns).

    Returns:
        ``df`` unchanged when none of the tables is present; otherwise a new
        DataFrame of ``df`` joined column-wise with the attribute rows and
        forward-filled so each attribute row carries the product columns.
    """
    tables = [
        ('productOverview_feature_div', 'a-normal a-spacing-micro'),
        ('productDetails_detailBullets_sections1', None),
        ('productDetails_techSpec_section_1', None)
    ]

    attribute_rows = []
    table_found = False
    for table_id, table_class in tables:
        container = soup.find('div', {'id': table_id})
        if container is None:
            continue
        table = container.find('table', class_=table_class)
        if table is None:
            continue
        table_found = True
        # Pair each header cell with the data cell at the same position in its row.
        attribute_rows.extend(
            [th.text.strip(), td.text.strip()]
            for tr in table.find_all('tr')
            for th, td in zip(tr.find_all('th'), tr.find_all('td'))
        )

    if not table_found:
        return df

    attributes = pd.DataFrame(attribute_rows, columns=['Attribute_Name', 'Attribute_Value'])
    # Product-level values sit only on the first row after the join; ffill
    # copies them down onto every attribute row.
    return pd.concat([df, attributes], axis=1).ffill()

# Main execution flow
def main():
    """Run the two-stage scrape and write both result CSVs.

    Stage 1 collects ASINs for every landing URL and writes
    ``product_urls_amazon.csv``; stage 2 scrapes each product page and
    writes ``amazon_scrapped_data.csv``. Both files are written relative
    to the working directory selected by ``setup_config``.
    """
    total_pages = setup_config()
    amazon_listing_url = load_amazon_meta_urls()

    # ASIN Scraping
    final_collated_urls = scrape_asin_urls(amazon_listing_url, total_pages)
    final_collated_urls.to_csv('product_urls_amazon.csv', index=False)

    # Product Detail Scraping
    final_scrapped_df = scrape_product_details(final_collated_urls)
    final_scrapped_df.to_csv('amazon_scrapped_data.csv', index=False)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()