In [None]:
# Required imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
import time
import pandas as pd
import random

# Browser configuration: Chrome switches that hide the most obvious
# automation fingerprints and keep the session lightweight.

# Set this to a real browser user-agent string. When left empty Chrome keeps
# its default user agent instead of receiving a stray positional argument.
USER_AGENT = ""

options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
options.add_argument("--disable-plugins")
# NOTE(review): "--disable-images" does not look like a recognised Chrome
# switch; "--blink-settings=imagesEnabled=false" is the commonly used form.
# Left unchanged pending confirmation.
options.add_argument("--disable-images")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
if USER_AGENT:
    # Chrome only honours the override when it is passed as a named switch.
    options.add_argument(f"--user-agent={USER_AGENT}")

driver = webdriver.Chrome(options=options)
# execute_script() would only patch the document that is loaded right now,
# and the override would vanish on the first navigation. Registering the
# script through the DevTools protocol re-applies it to every new document.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)

# Result storage: one list per output column; index i across all lists
# describes the same product.
titles, prices, mrps, discounts, ratings, review_counts = [], [], [], [], [], []
brands, urls, asin_list, prime_flags, deliveries, availabilities = [], [], [], [], [], []

def _first_value(element, locators, attribute=None):
    """Return the first non-empty value yielded by ``locators``, else None.

    Each locator is a ``(by, selector)`` pair tried in order against
    ``element``. When ``attribute`` is given that attribute/property is read,
    otherwise the element's visible text is used. A missing element or an
    empty value moves on to the next locator.
    """
    for by, selector in locators:
        try:
            found = element.find_element(by, selector)
            raw = found.get_attribute(attribute) if attribute else found.text
        except (NoSuchElementException, StaleElementReferenceException):
            continue
        value = raw.strip() if raw else ""
        if value:
            return value
    return None


def _price_block_text(element):
    """Return the visible text of the price container, or '' if absent."""
    try:
        return element.find_element(By.CSS_SELECTOR, '[data-cy="price-recipe"]').text
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def _parse_mrp(price_block_text):
    """Pull the M.R.P. amount out of the price container text, if present."""
    lines = price_block_text.split('\n')
    for index, line in enumerate(lines):
        if "M.R.P:" not in line and "MRP:" not in line:
            continue
        remainder = line.replace("M.R.P:", "").replace("MRP:", "").strip()
        if remainder and '₹' in remainder:
            # Drop a trailing "(xx% off)" that may share the line.
            return remainder.split('(')[0].strip() or None
        # The amount may be rendered on the line after the label.
        if index + 1 < len(lines) and '₹' in lines[index + 1]:
            return lines[index + 1].split('(')[0].strip() or None
    return None


def _parse_discount(price_block_text):
    """Return the first 'NN% off'/'discount' line of the price text, if any."""
    for line in price_block_text.split('\n'):
        lowered = line.lower()
        if "%" in line and ("off" in lowered or "discount" in lowered):
            return line.strip()
    return None


def extract_product_data(product_element):
    """Extract one search-result card into a flat dict of fields.

    Every field is looked up best-effort through a chain of fallback
    selectors; a field that cannot be found is ``None`` (``prime`` is
    ``False``).

    Args:
        product_element: Selenium element for a single search-result
            container (the node carrying ``data-asin``).

    Returns:
        dict with keys ``title``, ``price``, ``mrp``, ``discount``,
        ``rating``, ``reviews``, ``brand``, ``url``, ``asin``, ``prime``,
        ``delivery`` and ``availability``; or ``None`` when extraction fails
        with an unexpected error (the error is printed).
    """
    try:
        # Reading the ASIN first doubles as a cheap staleness probe: a stale
        # container fails here instead of after dozens of futile lookups.
        asin = product_element.get_attribute("data-asin")

        title = _first_value(product_element, (
            (By.CSS_SELECTOR, "h2 a span"),
            (By.CSS_SELECTOR, "h2 span"),
        ))

        price_text = _first_value(product_element, (
            (By.CSS_SELECTOR, '.a-price'),
            (By.CSS_SELECTOR, '[data-cy="price-recipe"] .a-price'),
        ))
        # The first visible line of the price element is the selling price.
        price = price_text.split('\n')[0].strip() if price_text else None

        price_block = _price_block_text(product_element)

        # Fall back to the struck-through price whenever text parsing finds
        # nothing, not only when the price container itself is missing.
        # ".a-offscreen" is visually hidden, so .text would be empty; read
        # textContent instead.
        mrp = _parse_mrp(price_block) or _first_value(
            product_element,
            (
                (By.CSS_SELECTOR, '.a-price.a-text-price .a-offscreen'),
                (By.CSS_SELECTOR, '[data-a-strike="true"] .a-offscreen'),
            ),
            attribute="textContent",
        )

        discount = _parse_discount(price_block) or _first_value(product_element, (
            (By.XPATH, ".//span[contains(text(), '%') and contains(text(), 'off')]"),
        ))

        rating = (
            _first_value(
                product_element,
                ((By.CSS_SELECTOR, "[aria-label*='out of 5 stars']"),),
                attribute="aria-label",
            )
            # The icon's alt text is hidden from view, hence textContent.
            or _first_value(
                product_element,
                ((By.CSS_SELECTOR, "span.a-icon-alt"),),
                attribute="textContent",
            )
            or _first_value(
                product_element,
                ((By.XPATH, ".//span[contains(@aria-label, 'out of')]"),),
                attribute="aria-label",
            )
        )

        reviews = _first_value(product_element, (
            (By.CSS_SELECTOR, "a[href*='#customerReviews'] span"),
            (By.XPATH, ".//a[contains(@href, 'customerReviews')]//span"),
            (By.XPATH, ".//span[contains(text(), ',') and (contains(text(), 'rating') or contains(text(), 'review'))]"),
        ))

        url = _first_value(
            product_element,
            (
                (By.CSS_SELECTOR, "h2 a"),
                (By.CSS_SELECTOR, "[data-cy='title-recipe'] a"),
                (By.CSS_SELECTOR, "a[href*='/dp/']"),
            ),
            attribute="href",
        )
        if url is None:
            # Last resort: accept the card's first link only if it looks
            # like a product URL.
            candidate = _first_value(
                product_element, ((By.CSS_SELECTOR, "a"),), attribute="href"
            )
            if candidate and ('/dp/' in candidate or '/gp/' in candidate):
                url = candidate
        if url and not url.startswith("http"):
            url = "https://www.amazon.in" + url

        try:
            product_element.find_element(By.CSS_SELECTOR, ".a-icon-prime, [aria-label*='Prime']")
            prime = True
        except (NoSuchElementException, StaleElementReferenceException):
            prime = False

        delivery = _first_value(product_element, (
            (By.XPATH, ".//span[contains(text(), 'Get it by') or contains(text(), 'FREE delivery') or contains(text(), 'delivery')]"),
            (By.CSS_SELECTOR, "[data-cy='delivery-recipe'] span"),
            (By.XPATH, ".//span[contains(text(), 'Tomorrow') or contains(text(), 'Today') or contains(text(), 'Ships')]"),
            (By.XPATH, ".//span[contains(text(), 'delivery') or contains(text(), 'Delivery')]"),
        ))

        availability = _first_value(product_element, (
            (By.XPATH, ".//span[contains(text(), 'In stock') or contains(text(), 'left in stock') or contains(text(), 'Only ') or contains(text(), 'Available')]"),
            (By.XPATH, ".//span[contains(text(), 'Ships in') or contains(text(), 'Usually ships') or contains(text(), 'Temporarily out')]"),
            (By.CSS_SELECTOR, "[data-cy='availability-recipe'] span"),
            (By.XPATH, ".//span[contains(text(), 'stock') or contains(text(), 'available') or contains(text(), 'unavailable')]"),
        ))
        if availability is None and price:
            # No explicit stock message: a listed price is treated as
            # "available".
            availability = "In stock"

        # Brand is approximated by the first word of the title.
        title_words = title.split() if title else []
        brand = title_words[0] if title_words else None

        return {
            'title': title,
            'price': price,
            'mrp': mrp,
            'discount': discount,
            'rating': rating,
            'reviews': reviews,
            'brand': brand,
            'url': url,
            'asin': asin,
            'prime': prime,
            'delivery': delivery,
            'availability': availability
        }

    except Exception as e:
        # Per-product boundary: one broken card must not abort the page.
        print(f"Error extracting product data: {e}")
        return None

# Scraping driver: walk the result pages, collect every product card into the
# module-level result lists, checkpoint periodically, then save a final CSV.
import datetime

RESULT_XPATH = '//div[@data-component-type="s-search-result"]'


def _results_frame():
    """Assemble the module-level result lists into a DataFrame."""
    return pd.DataFrame({
        "Title": titles,
        "Brand": brands,
        "Price": prices,
        "MRP": mrps,
        "Discount": discounts,
        "Rating": ratings,
        "Review Count": review_counts,
        "Prime": prime_flags,
        "Delivery Info": deliveries,
        "Availability": availabilities,
        "ASIN": asin_list,
        "URL": urls
    })


base_url = "https://www.amazon.in/s?i=electronics&rh=n%3A1388921031&s=popularity-rank&fs=true"

page_count = 0  # number of pages fully scraped so far
max_pages = 150  # Target 150 pages for ~4,000 products

print(f"Starting scraping process. Target: {max_pages} pages")
print(f"Expected products: ~{max_pages * 27} products")
print("Estimated time: 2-3 hours")

try:
    driver.get(base_url)
    time.sleep(random.uniform(3, 5))

    while page_count < max_pages:
        current_page = page_count + 1
        print(f"Scraping page {current_page}/{max_pages} ({(current_page/max_pages)*100:.1f}% complete)")

        # Wait for products to load
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, RESULT_XPATH))
            )
        except TimeoutException:
            print("Products not found on page, breaking...")
            break

        # Random delay between page loads
        time.sleep(random.uniform(2, 4))

        products = driver.find_elements(By.XPATH, RESULT_XPATH)
        print(f"Found {len(products)} products on page {current_page}")

        for i, product in enumerate(products):
            try:
                # Re-find elements to avoid stale reference
                products_fresh = driver.find_elements(By.XPATH, RESULT_XPATH)
                if i >= len(products_fresh):
                    continue
                product_data = extract_product_data(products_fresh[i])
            except StaleElementReferenceException:
                print(f"Stale element on product {i}, skipping...")
                continue
            except Exception as e:
                print(f"Error processing product {i}: {e}")
                continue

            if not product_data:
                continue

            titles.append(product_data['title'])
            prices.append(product_data['price'])
            mrps.append(product_data['mrp'])
            discounts.append(product_data['discount'])
            ratings.append(product_data['rating'])
            review_counts.append(product_data['reviews'])
            brands.append(product_data['brand'])
            urls.append(product_data['url'])
            asin_list.append(product_data['asin'])
            prime_flags.append(product_data['prime'])
            deliveries.append(product_data['delivery'])
            availabilities.append(product_data['availability'])

        # Only count the page once it has actually been scraped, so the
        # final report is not off by one after a timeout.
        page_count = current_page

        # Progress update every 10 pages
        if page_count % 10 == 0:
            print(f"Progress Update: {page_count} pages completed, {len(titles)} products scraped")

        # Save intermediate results every 25 pages. This check must not be
        # nested under the 10-page check, or it only fires every 50 pages.
        if page_count % 25 == 0:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            temp_filename = f"amazon_electronics_backup_{page_count}pages_{timestamp}.csv"
            _results_frame().to_csv(temp_filename, index=False)
            print(f"Backup saved: {temp_filename}")

        # Target reached: skip the pointless extra navigation.
        if page_count >= max_pages:
            break

        # Try to go to next page
        try:
            next_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.s-pagination-next, a[aria-label='Go to next page']"))
            )

            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(random.uniform(3, 5))

        except (TimeoutException, NoSuchElementException):
            print("No more pages found.")
            break
        except Exception as e:
            print(f"Error navigating to next page: {e}")
            break

except KeyboardInterrupt:
    # A manual stop still falls through to the final save below.
    print("Interrupted by user; saving what was collected so far.")
finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()

# Create final DataFrame and save with a timestamped name
df = _results_frame()
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"amazon_electronics_final_{timestamp}.csv"
df.to_csv(filename, index=False)

print("\n=== SCRAPING COMPLETE ===")
print(f"Pages scraped: {page_count}")
print(f"Total products: {len(df)}")
print(f"Final file saved: {filename}")
print("\nData summary:")
print(f"- Titles found: {sum(1 for x in titles if x)}")
print(f"- Prices found: {sum(1 for x in prices if x)}")
print(f"- MRPs found: {sum(1 for x in mrps if x)}")
print(f"- Discounts found: {sum(1 for x in discounts if x)}")
print(f"- Ratings found: {sum(1 for x in ratings if x)}")
print(f"- URLs found: {sum(1 for x in urls if x)}")
print(f"- Delivery info found: {sum(1 for x in deliveries if x)}")
print(f"- Availability found: {sum(1 for x in availabilities if x)}")

# Show first few entries for debugging
print("\nFirst 5 entries:")
for i in range(min(5, len(titles))):
    print(f"  {i+1}: Price={prices[i]}, MRP={mrps[i]}, Delivery={deliveries[i]}, Availability={availabilities[i]}")


Starting scraping process. Target: 150 pages
Expected products: ~4050 products
Estimated time: 2-3 hours
Scraping page 1/150 (0.7% complete)
Found 27 products on page 1
Scraping page 2/150 (1.3% complete)
Found 27 products on page 2
Scraping page 3/150 (2.0% complete)
Found 27 products on page 3
Scraping page 4/150 (2.7% complete)
Found 27 products on page 4
Scraping page 5/150 (3.3% complete)
Found 27 products on page 5
Scraping page 6/150 (4.0% complete)
Found 27 products on page 6
Scraping page 7/150 (4.7% complete)
Found 27 products on page 7
Scraping page 8/150 (5.3% complete)
Found 27 products on page 8
Scraping page 9/150 (6.0% complete)
Found 27 products on page 9
Scraping page 10/150 (6.7% complete)
Found 27 products on page 10
Progress Update: 10 pages completed, 270 products scraped
Scraping page 11/150 (7.3% complete)
Found 27 products on page 11
Scraping page 12/150 (8.0% complete)
Found 27 products on page 12
Scraping page 13/150 (8.7% complete)
Found 27 products on page 