In [95]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import time
import pandas as pd
import re
from datetime import datetime

In [96]:
# Nykaa search-results page for "Diapers"; the literal is split across
# adjacent strings purely for readability and concatenates to one URL.
url = (
    'https://www.nykaa.com/search/result/'
    '?q=Diapers&root=search&searchType=Misc&suggestionType=category'
    '&ssp=2&tst=diapers&searchItem=Diapers&sourcepage=Category%20Page&'
)

# Start a Chrome session and load the search page in it.
driver = webdriver.Chrome()
driver.get(url)

In [97]:
# Column-oriented store for the scraped records: every key gets its own
# independent empty list, in the order the columns should appear.
data_dict = {
    column: []
    for column in (
        'Product_Names',
        'Product_URLs',
        'Current_Price',
        'Original_Price',
        'Discount_Percentage',
        'Units_Sold',
        'Reviews',
        'Featured',
        'Best_Seller',
    )
}
    
# Helpers that retrieve the information we need from a results page.
def _optional_text(card, class_name, default="N/A"):
    """Return the text of the first element inside ``card`` with ``class_name``.

    Returns ``default`` when ``card`` has no such element.
    """
    try:
        return card.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return default


def get_page_data(driver, data_dict):
    """Scrape every product card on the current page into ``data_dict``.

    Parameters
    ----------
    driver : selenium WebDriver
        Session already showing a search-results page.
    data_dict : dict[str, list]
        Column store with the keys 'Product_Names', 'Product_URLs',
        'Current_Price', 'Original_Price', 'Discount_Percentage',
        'Units_Sold', 'Reviews', 'Featured' and 'Best_Seller'. One value is
        appended to each list per scraped product, so the lists always stay
        the same length.

    Returns
    -------
    None
        Results are written into ``data_dict`` in place.

    Notes
    -----
    The function is best-effort and does not raise: a card that lacks a
    required element (name, link or current price) is reported and skipped
    without affecting the remaining cards; any other error is printed and
    ends processing of the page.
    """
    # Explicit wait: block (up to 30 s) until at least one product card exists.
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'css-d5z3ro'))
        )
        search_results = driver.find_elements(By.CLASS_NAME, 'css-d5z3ro')
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    for result in search_results:
        try:
            # Required fields: a missing one raises NoSuchElementException,
            # which skips only this card.
            product_name = result.find_element(By.CLASS_NAME, 'css-xrzmfa').text
            product_url = result.find_element(By.TAG_NAME, 'a').get_attribute('href')
            current_price = result.find_element(By.CLASS_NAME, 'css-111z9ua').text
            current_price = current_price.replace('₹', '')

            # Optional fields fall back to "N/A"; the replace calls leave
            # that placeholder untouched.
            original_price = _optional_text(result, 'css-17x46n5')
            original_price = original_price.replace('MRP:', '').replace('₹', '')
            discount_percentage = _optional_text(result, 'css-cjd9an').replace(' Off', '')
            units_sold = _optional_text(result, '_1cEkb')
            reviews = _optional_text(result, 'css-1qbvrhp')
            featured = _optional_text(result, 'css-1jnild6')
            best_seller = _optional_text(result, 'css-1bse542')
        except NoSuchElementException:
            print("Some elements not found for this product.")
            continue
        except Exception as e:
            print(f"An error occurred: {e}")
            return

        # All fields were read successfully: append them together so every
        # column keeps the same number of rows.
        data_dict['Product_Names'].append(product_name)
        data_dict['Product_URLs'].append(product_url)
        data_dict['Current_Price'].append(current_price)
        data_dict['Original_Price'].append(original_price)
        data_dict['Discount_Percentage'].append(discount_percentage)
        data_dict['Units_Sold'].append(units_sold)
        data_dict['Reviews'].append(reviews)
        data_dict['Featured'].append(featured)
        data_dict['Best_Seller'].append(best_seller)

# Walk through all result pages, scraping each one before moving to the next.
# The browser session is always closed afterwards (even if scraping fails) so
# that no Chrome / chromedriver process is left running.
current_page = 1
try:
    while True:
        # Get data from the current page
        print(f"Processing page {current_page}...")
        try:
            get_page_data(driver, data_dict)
        except StaleElementReferenceException:
            # Safety net: wait for the cards to be present again and retry once.
            print("StaleElementReferenceException occurred. Waiting for the page to load again...")
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'css-d5z3ro')))
            get_page_data(driver, data_dict)

        # Check if there is a next page link
        next_page_link = driver.find_elements(By.CLASS_NAME, 'css-1zi560')
        if len(next_page_link) == 0:
            break
        next_button = next_page_link[0]
        # get_attribute may return None; fall back to '' so the `in` test is safe.
        # NOTE(review): this compares against the product-card class name; the
        # intent of the check is unclear from here - confirm.
        if 'css-d5z3ro' in (next_button.get_attribute('class') or ''):
            break

        # Move to the next-page link and click it to navigate to the next page.
        webdriver.ActionChains(driver).move_to_element(next_button).click(next_button).perform()
        # Wait for product cards to be present, then pause for a fixed time.
        # NOTE(review): presumably the fixed pause is there because cards from
        # the previous page can satisfy the presence check - confirm.
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CLASS_NAME, 'css-d5z3ro')))
        time.sleep(20)
        current_page += 1
finally:
    driver.quit()

# Create the DataFrame from the collected data
df = pd.DataFrame(data_dict)

Processing page 1...


In [98]:
# In[2]:


# Now let us manipulate our data.
# Let us start by creating a function to extract the product ID from the URL.
def extract_product_id(product_url):
    """Return the digits following ``productId=`` in ``product_url``.

    Parameters
    ----------
    product_url : str or None
        Product link. Non-string values (e.g. ``None`` or NaN for a missing
        link) are accepted and treated as having no ID.

    Returns
    -------
    str or None
        The product ID as a string of digits, or ``None`` when the input is
        not a string or contains no ``productId=<digits>`` part.
    """
    # re.search raises TypeError on non-string input, so guard against it.
    if not isinstance(product_url, str):
        return None
    match = re.search(r'productId=(\d+)', product_url)
    return match.group(1) if match else None

# Build the 'Product_ID' column by passing each product URL through
# extract_product_id.
df['Product_ID'] = df['Product_URLs'].map(extract_product_id)

# Display the frame with a fresh 0..n-1 index (the result is shown as the
# cell output; df itself is not reassigned).
df.reset_index(drop=True)


Unnamed: 0,Product_Names,Product_URLs,Current_Price,Original_Price,Discount_Percentage,Units_Sold,Reviews,Featured,Best_Seller,Product_ID
0,Himalaya Total Care Baby Pants Diapers,https://www.nykaa.com/himalaya-total-care-baby...,1020,1275.0,20%,,( 321 ),FEATURED,BESTSELLER,779466
1,Pampers New Baby Diapers - 24 Pack,https://www.nykaa.com/pampers-new-baby-diapers...,360,,,,( 53 ),FEATURED,BESTSELLER,9961
2,Huggies Wonder Pants Extra Small Size Diaper P...,https://www.nykaa.com/huggies-wonder-pants-ext...,212,249.0,15%,,( 49 ),,BESTSELLER,5886453
3,"Pampers New Diapers Pants, Small",https://www.nykaa.com/pampers-new-diapers-pant...,849,,,,( 58 ),,BESTSELLER,878128
4,"Pampers New Diapers Pants, XL",https://www.nykaa.com/pampers-new-diapers-pant...,1199,,,,( 17 ),,BESTSELLER,878131
5,"Pampers New Diapers Pants, Large",https://www.nykaa.com/pampers-new-diapers-pant...,899,,,,( 16 ),,BESTSELLER,878119
6,"Bambo Nature Premium Baby Diapers - XS Size, 2...",https://www.nykaa.com/bambo-nature-eco-friendl...,899,,,,( 7 ),,,781565
7,"Pampers Premium Care Pants Diapers, Small",https://www.nykaa.com/pampers-premium-care-pan...,1199,,,,( 1 ),,,363921
8,Himalaya Total Care Baby Pants Extra Large,https://www.nykaa.com/himalaya-total-care-baby...,820,1025.0,20%,,( 22 ),,,393973
9,Himalaya Total Care Baby Pants Large,https://www.nykaa.com/himalaya-total-care-baby...,760,950.0,20%,,( 24 ),,,393971


In [99]:
# In[3]:


# Write the scraped data to an Excel workbook whose name carries the
# current timestamp, without the index column.
current_datetime = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
filename = "nykaa_scrape_" + current_datetime + ".xlsx"
df.to_excel(filename, index=False)

print("Process Ended Successfully :)")

Process Ended Successfully :)
