In [2]:
pip install selenium

Collecting selenium
  Downloading selenium-4.20.0-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.25.0-py3-none-any.whl.metadata (8.7 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.20.0-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB 991.0 kB/s eta 0:00:1

In [55]:
import time
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import random
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

In [66]:
class Amazon_Scrapper():
    def __init__(self):
        self.url = 'https://www.amazon.in'
        self.driver = self.init_driver(False)


    ## seting-up chrome webdriver
    def init_driver(self,headless, proxy=None):
        """Initialize an instance of Chrome Driver"""
        chrome_options = webdriver.ChromeOptions()
        if headless is True:
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--headless')
        if proxy is not None:
            chrome_options.add_argument(f'--proxy-server={proxy}')

        chrome_options.add_argument("--start-maximized")
        chrome_options.arguments.extend(["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "javascript.enabled"])
   
        driver = webdriver.Chrome(options=chrome_options)
        #driver.set_page_load_timeout(50)
        driver.get(self.url)
        return driver


    def search_keyword(self, keyword):
        search_box = self.driver.find_element(By.XPATH, "//input[@id='twotabsearchtextbox']")
        search_box.clear()
        search_box.send_keys(keyword)
        search_box.send_keys(Keys.RETURN)
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".s-result-list")))
        time.sleep(2) 


    def send_keys(self, element, keys):
        try:
            element.send_keys(keys)
        except NoSuchElementException:
            print("Element not found. Please check if the page structure has changed!!")


    def extract_product_links(self):
        product_links = self.driver.find_elements(By.XPATH, "//h2[@class='a-size-mini a-spacing-none a-color-base s-line-clamp-3']//a[contains(@class, 'a-link-normal')]")
        links = [link.get_attribute("href") for link in product_links]
        print(links)
        return links



    def scrape_search_results(self, max_pages=3):
        all_product_links = []
        for _ in range(max_pages):
            product_links = self.extract_product_links()
            all_product_links.extend(product_links)
            try:
                next_page = self.driver.find_element(By.XPATH , '//a[@class="s-pagination-item s-pagination-button"]')
            except NoSuchElementException:
                break
            next_page.click()
            time.sleep(6)
        print(all_product_links)

        for product_link in product_links:
                product_info = self.scrape_product_info(product_link)
                print(product_info)
                self.write_to_csv(product_info)

    def write_to_csv(self, product_info):
        fieldnames = ['Product Title', 'Listing Type (Sponsored/ Organic)', 'Avg. Star Rating', 'Quantities bought in the last month', 'Special deal running', 'MRP of the product', 'Selling Price', 'Presence of a Bestseller/Amazon’s Choice Tag', 'Date of Delivery']

        with open('products.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)

            # Check if the file is empty and write header only if needed
            if file.tell() == 0:
                writer.writeheader()

            writer.writerow(product_info)

    def scrape_product_info(self, product_url):
        self.driver.get(product_url)
        print("Product scrapping started!")
        print(product_url)
        
        # Extract product information
        product_info = {}
        try:
            product_info['Product Title'] = self.driver.find_element(By.ID, 'productTitle').text
            print("Product Title:", product_info['Product Title'])
        except Exception as e:
            print("Product title error 'cuz of : ", e)
            product_info['Product Title'] = 'N/A'
        try:
            product_info['Listing Type (Sponsored/ Organic)'] = self.driver.find_element(By.XPATH, "//span[@class ='sp_detail_thematic-prime_theme_for_non_prime_members_sponsored_label']")
            print("Listing Type: ", product_info['Listing Type (Sponsored/ Organic)'] )
        except Exception as e:
            product_info['Listing Type (Sponsored/ Organic)'] = 'N/A'
        # try:
        #     product_info['Page Number'] = self.driver.find_elements(By.XPATH, "//span[@class='a-carousel-page-current']")
        #     if product_info['Page Number']:
        #         current_page_number = product_info['Page Number'][0].text
        #         print("Page Number: ", current_page_number)
        #     else:
        #         print("No current page number found.")
        # except Exception as e:
        #     print("Page number error 'cuz of : ", e)   


        try:
            star_rating_element = self.driver.find_elements(By.XPATH, "//a//span[@class='a-size-base a-color-base']")[0]
            product_info['Avg. Star Rating'] = star_rating_element.text
            print("Avg star rating: ", product_info['Avg. Star Rating'])
        except Exception as e:
            print("Avg star rating not found:", e)
        try:
        #     product_info['Total Rating'] = self.driver.find_elements(By.XPATH, "//span[@id='acrCustomerReviewText']")
        #     if product_info['Total Rating']:
        #         Total_Rating = product_info['Total Rating'][1]
        #         print("Total rating: ", Avg_Star_Rating)
        #     else:
        #         print("Total rating not found!")
        # except exception as e:
        #     print("Total rating error 'cuz of : ", e)
            review_element = self.driver.find_element(By.ID, "acrCustomerReviewText")
            review_text = review_element.text
            # Split the text to extract only the numeric part
            rating = review_text.split()[0]
            print("Total Ratings:", rating)
        except NoSuchElementException:
            print("Rating element not found")
        try:
            product_info['Quantities bought in the last month'] = self.driver.find_element(By.XPATH, "//span[@class='a-size-small social-proofing-faceout-title-text']//span").text
            print("Last month quantities bought: ", product_info['Quantities bought in the last month'])
        except Exception as e:
            product_info['Quantities bought in the last month'] = 'N/A'

        try:
            product_info['Special deal running']= self.driver.find_element(By.XPATH, "//span[@class='a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage']").text
            print("Special deal running: ", product_info['Special deal running'])
        except NoSuchElementException:
            product_info['Special deal running'] = 'N/A'   

        try:
            product_info['MRP of the product'] = self.driver.find_element(By.XPATH, "//span[@class='a-size-small a-color-secondary aok-align-center basisPrice']//span[@class='a-price a-text-price']//span[@class='a-offscreen']").text
            print("Product MRP: ", product_info['MRP of the product'])
        except Exception as e:
            print("MRP not found error 'cuz of : ", e)

        try:
            product_info['Selling Price'] = self.driver.find_element(By.XPATH, "//span[@class='a-price aok-align-center reinventPricePriceToPayMargin priceToPay']//span[@class='a-price-whole']").text
            print("Selling Price: ", product_info['Selling Price'])
        except Exception as e:
            print("Selling Price not found error 'cuz of : ", e)  

        try:
            product_info['Presence of a Bestseller/Amazon’s Choice Tag'] = self.driver.find_element(By.XPATH, "//span[@class='a-size-small aok-float-left ac-badge-rectangle']").text
            print("Bestseller Presence: ", product_info['Presence of a Bestseller/Amazon’s Choice Tag'])
        except Exception as e:
            product_info['Presence of a Bestseller/Amazon’s Choice Tag'] = 'N/A'


        try:
            delivery_date_element = self.driver.find_element(By.XPATH, "(//div[@class='a-spacing-base']//span[@class='a-text-bold'])[1]")
            product_info['Date of Delivery'] = delivery_date_element.text
            print("Delivery Date: ", product_info['Date of Delivery'])
        except Exception as e:
            print("Date of delivery not found:", e)

        return product_info if product_info else None
            
if __name__ == "__main__":
    start_time = time.time()
    class_caller = Amazon_Scrapper()
    print("start time", start_time)
    keywords = ["Lipstick for women", "Shampoo for women", "Face wash", "Baby Shampoo", "Face Serum"]
    for keyword in keywords:
        print("######################################################################################")
        print(f"Searching for: {keyword}") 
        print("######################################################################################")
        class_caller.search_keyword(keyword)
        # class_caller.click_all_products()
        class_caller.scrape_search_results()
        # print(product_links)
        # class_caller.driver.quit()

start time 1715523007.1909091
######################################################################################
Searching for: Lipstick for women
######################################################################################
['https://www.amazon.in/sspa/click?ie=UTF8&spc=MToxMTg0MjY4NDY2NjY0MTMyOjE3MTU1MjMwMTM6c3BfYXRmOjMwMDEzMTYzNzc3NzMzMjo6MDo6&url=%2FMyGlamm-Plumping-Lipstick-Buildable-Moisturising%2Fdp%2FB0CSNN6SMP%2Fref%3Dsr_1_1_sspa%3Fdib%3DeyJ2IjoiMSJ9.rXsZ-lPBP44U0LQW9jm1UEmEfq7z92o0UxRgsjkI9qx75DwFuXWrXD0cA6j1hKPHHO1Ez8skxK0EjaZ-zP40Egtz3LnEjN9xl0RaGyZhSrhPBwA3FrnXWE027TLic4v7rXvfUZtCj26Xgyv8WGr-8yziFlWTL5F8fof6VoG2roU-iY03023NPDDNpf1Dq7PnXgoa3LQXJ_vpCcmwYUc1r6tv2boRdSNm07lnqzrEyreO-Ynpm5DjzRECYDwLUavnY3j7Aji5bEi1_diX-KoJfVBDGU9Fi65pqwxC9oylQhU._fDj7G-ASdwqpY1C8aoAkoLFK_esZOpIUyTm8xabLGQ%26dib_tag%3Dse%26keywords%3DLipstick%2Bfor%2Bwomen%26qid%3D1715523013%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1', 'https://www.amazon.in/sspa/click?ie=UTF8&spc