# Amazon Grocery Web Scraper (Selenium + Python)

### Setting up the environment
Drivers and services
##### Import the Selenium WebDriver and Service classes
##### Import ChromeDriverManager
Libraries
##### Import time, NumPy, and pandas




In [1]:
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

## create empty lists for extracting required data
# Created before the browser starts so they exist for the DataFrame cell even if
# the scrape stops early. Each product appends exactly one entry to every list,
# which keeps the four lists the same length.
Product_Name = []
Price = []
Rating = []
Raters = []

# One entry per scraped field: (label used in the progress print,
# XPath relative to a single result card, list that receives the value).
PRODUCT_FIELDS = [
    ("Name", ".//a[@class='a-link-normal s-line-clamp-3 s-link-style a-text-normal']", Product_Name),
    ("Price", ".//span[@class='a-price-whole']", Price),
    ("Rating", ".//span[@class='a-size-small a-color-base']", Rating),
    ("Raters", ".//span[@class='a-size-mini puis-normal-weight-text s-underline-text']", Raters),
]


def _collect_field(card, label, xpath, sink):
    """Append one field of a result card to ``sink``, or NaN when it is absent.

    Parameters
    ----------
    card : WebElement
        The product container to search inside.
    label : str
        Prefix for the progress line printed when the field is found.
    xpath : str
        XPath, relative to ``card``, of the element holding the text.
    sink : list
        List that receives the stripped text, or ``np.nan`` if the element
        is missing or has gone stale.
    """
    try:
        value = card.find_element(By.XPATH, xpath).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        # Only "field not there" is treated as missing data; anything else
        # (including a kernel interrupt) is allowed to propagate.
        sink.append(np.nan)
        return
    sink.append(value)
    print(f"{label}:", value)


driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.amazon.in")
    time.sleep(2)

    # send keyword (groceries) to search bar
    enter_bar = driver.find_element(By.XPATH, "//input[@id='twotabsearchtextbox']")
    enter_bar.send_keys("groceries")
    # click on search button (.click() returns None, so nothing is kept)
    driver.find_element(By.XPATH, "//input[@id='nav-search-submit-button']").click()
    time.sleep(2)

    # Sort the results: open the sort dropdown and pick the option with id
    # 's-result-sort-select_5' (described by the notebook author as "bestseller").
    driver.find_element(By.XPATH, "//span[@class='a-button a-button-dropdown a-button-small']").click()
    driver.find_element(By.XPATH, "//a[@id='s-result-sort-select_5']").click()
    time.sleep(5)

    # Walk the result pages until there is no enabled "next" link.
    while True:
        products = driver.find_elements(By.XPATH, "//div[@class='a-section a-spacing-small puis-padding-left-small puis-padding-right-small']")
        time.sleep(2)

        for card in products:
            for label, xpath, sink in PRODUCT_FIELDS:
                _collect_field(card, label, xpath, sink)

        # After every page, click the next-page link until it is missing or disabled.
        try:
            # Document-wide lookup: the original called an absolute '//' XPath on a
            # scoped element, which searches the whole page anyway.
            next_btn = driver.find_element(By.XPATH, "//a[contains(@class,'s-pagination-next')]")
            if "disabled" in (next_btn.get_attribute("class") or ""):
                print("Reached last page")
                break
            # JavaScript click instead of a native WebElement.click().
            driver.execute_script("arguments[0].click();", next_btn)
            print("➡️ Moving to next page...")
            time.sleep(3)
        except NoSuchElementException:
            print(" No next page found — stopping.")
            break
        except WebDriverException as exc:
            # Keep the best-effort behaviour for browser-side failures, but say
            # what actually went wrong instead of reporting a missing page.
            print(f" Pagination failed ({type(exc).__name__}) — stopping.")
            break
finally:
    # Always release the browser, even when a lookup above raises.
    driver.quit()



Name: Fresh Onion, 1kg
Price: 28
Name: Fresh Potato, 1kg
Name: Fresh Chilli - Green, 100g
Price: 7
Name: Fresh Mushroom, (Approx.180 -200g)
Price: 48
Name: Fresh Carrot - Orange, 500g Pack
Price: 32
Name: Fortune Premium Kachi Ghani Pure Mustard Oil, 910G PET Bottle (Weight May Vary)
Price: 299
Rating: 4.4
Raters: (34.9K)
Name: Fresh Ginger (Adrak), 100g
Price: 12
Name: Amul Salted Butter Pasteurised, 100 Grams
Price: 57
Rating: 4.6
Raters: (23.1K)
Name: MAGGI 2-Minute Instant Noodles, Masala Noodles With Goodness Of Iron, Made With Choicest Quality Spices, Favourite Masala Taste, 840/900g Pouch (Pack of 12, 70/75g each) (weight may vary)
Price: 143
Rating: 4.4
Raters: (71.8K)
Name: Fresh Tomato - Local, 1kg
Price: 40
Name: Fortune Premium Kachi Ghani Pure Mustard Oil, 1 ltr pouch
Price: 185
Rating: 4.4
Raters: (36.3K)
Name: Fresh Curry Leaves, 100 grams Bunch
Price: 8
Name: Parry's White Label Sugar, 1kg
Rating: 4.3
Raters: (21.6K)
Name: Fresh Bhendi (Lady Finger), 500g
Price: 36
Name

### Create a DataFrame to start working on the extracted data

In [17]:
# Build the raw results table: one column per scraped list, in display order.
df = pd.DataFrame(
    dict(
        Product=Product_Name,
        Price=Price,
        Rating=Rating,
        Raters=Raters,
    )
)

# Plain-text preview of the first rows.
print(df.head())


                              Product Price Rating Raters
0                    Fresh Onion, 1kg    28    NaN    NaN
1                   Fresh Potato, 1kg   NaN    NaN    NaN
2          Fresh Chilli - Green, 100g     7    NaN    NaN
3  Fresh Mushroom, (Approx.180 -200g)    48    NaN    NaN
4    Fresh Carrot - Orange, 500g Pack    32    NaN    NaN


### Copy the raw DataFrame to a new DataFrame for further work

In [18]:
# Work on a separate copy so the raw scrape held in `df` stays untouched.
df1 = df.copy(deep=True)

# Sanity check on the copy: preview it, then compare the element counts
# (`.size` is rows x columns) of the copy and the original.
print(df1.head())
print(
    "New Data Frame size :", df1.size,
    "\nRaw or Original Data Frame size :", df.size,
)

                              Product Price Rating Raters
0                    Fresh Onion, 1kg    28    NaN    NaN
1                   Fresh Potato, 1kg   NaN    NaN    NaN
2          Fresh Chilli - Green, 100g     7    NaN    NaN
3  Fresh Mushroom, (Approx.180 -200g)    48    NaN    NaN
4    Fresh Carrot - Orange, 500g Pack    32    NaN    NaN
New Data Frame size : 468 
Raw or Original Data Frame size : 468


### Make sure the DataFrame was copied successfully into the new DataFrame df1
You can also use other available attributes, such as `shape`.


In [19]:
# Show the first five rows of the working copy as plain text.
print(df1.head(n=5))
# Report the element count (rows x columns) of the copy next to the original.
print(
    "New Data Frame size :",
    df1.size,
    "\nRaw or Original Data Frame size :",
    df.size,
)

                              Product Price Rating Raters
0                    Fresh Onion, 1kg    28    NaN    NaN
1                   Fresh Potato, 1kg   NaN    NaN    NaN
2          Fresh Chilli - Green, 100g     7    NaN    NaN
3  Fresh Mushroom, (Approx.180 -200g)    48    NaN    NaN
4    Fresh Carrot - Orange, 500g Pack    32    NaN    NaN
New Data Frame size : 468 
Raw or Original Data Frame size : 468


### A small glimpse of data cleaning
#### In this section, only the raw Raters column is cleaned: values like 2.5K and 1.5K are converted to 2500.0 and 1500.0


In [20]:

import regex as re

# A rater count as it appears in the scraped text, e.g. "(34.9K)", "1,234" or "523":
# group 1 is the number (digits, optional comma grouping, optional decimal part),
# group 2 is an optional K/M magnitude suffix directly after it.
_RATERS_PATTERN = re.compile(r"([0-9][0-9,]*(?:\.[0-9]+)?|\.[0-9]+)([KkMm]?)")
_RATERS_SCALE = {"": 1.0, "k": 1000.0, "m": 1000000.0}


def _parse_raters(raw):
    """Convert one raw Raters value to a float count.

    Parameters
    ----------
    raw : str or float
        Scraped text such as ``"(34.9K)"``, or a value that is already numeric
        (``np.nan`` for products without ratings, or a float when this cell is
        re-run on an already cleaned column).

    Returns
    -------
    float
        The count with the K/M suffix expanded, or ``np.nan`` when the value is
        missing or contains no number.
    """
    if isinstance(raw, (int, float)):
        # NaN placeholders and already-cleaned numbers pass straight through,
        # which also makes re-running the cell safe.
        return float(raw)
    match = _RATERS_PATTERN.search(str(raw))
    if match is None:
        # Empty or symbol-only text: treat as missing instead of raising.
        return np.nan
    number, suffix = match.groups()
    # Drop thousands separators so "1,234" is read as 1234 rather than 1.
    return float(number.replace(",", "")) * _RATERS_SCALE[suffix.lower()]


df1["Raters"] = df1["Raters"].apply(_parse_raters)

df1.tail(10)

Unnamed: 0,Product,Price,Rating,Raters
107,Happilo Premium Natural Arabian Dates 500g Pou...,126.0,3.5,2100.0
108,Daawat Rozana Gold Basmati Rice 5Kg| For Every...,409.0,4.2,9400.0
109,"Britannia Gobbles Bar Cake Fruity Fun, 110 g",25.0,4.3,4700.0
110,Sunfeast Farmlite 5 Seed Digestive Biscuit | H...,102.0,4.2,22300.0
111,"Cumin Powder | Catch Jeera Powder, 100g",69.0,4.4,15100.0
112,Daawat Rozana Super Basmati Rice 1Kg| For Ever...,85.0,4.0,28400.0
113,"Tata Sampann Chilli Powder with Natural Oils, ...",,4.4,9900.0
114,"Sunfeast Dark Fantasy Choco Fills, 460g Origin...",158.0,4.4,24500.0
115,Catch Kashmiri Chilli Powder | Kashmiri Lal Mi...,70.0,4.4,3500.0
116,D'lecta Processed Cheese Slices 200g – 10 Slices,80.0,4.3,4600.0
