In [1]:
# Import necessary libraries
import re
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Get all product links

In [3]:
"""
This script uses Selenium WebDriver to automate searching for "sports shoes for women" on Flipkart,
collects pagination links for the first 25 pages, extracts product detail page links,
and saves them into a CSV file.

Key Steps:
- Start Chrome browser session.
- Search for a query.
- Navigate through paginated result pages.
- Scrape product detail page URLs.
- Save all collected links to a CSV.
"""


# Search phrase and the site it is submitted to
search_box_text = 'sports shoes for women'
website_link = 'https://www.flipkart.com/'

# ------------------------------------------------------------
# 1. Record the session start time and open the browser
# ------------------------------------------------------------
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} ---------------------------> ")

# Launch Chrome, load the site's home page, then maximize the window
driver = webdriver.Chrome()
driver.get(website_link)
driver.maximize_window()

# ------------------------------------------------------------
# 2. Type the query into the search box and submit it
# ------------------------------------------------------------
# One waiter (120 second timeout) serves both waits of this stage
search_wait = WebDriverWait(driver, 120)
search_box_locator = (By.CSS_SELECTOR, '[autocomplete="off"]')
result_link_locator = (By.CSS_SELECTOR, '[target="_blank"]')

print('Waiting for search input...')
search_input = search_wait.until(EC.presence_of_element_located(search_box_locator))

print('Typing in search input...')
search_input.send_keys(search_box_text)

print('Submitting search form...')
search_input.send_keys(Keys.RETURN)  # pressing Enter submits the search

# ------------------------------------------------------------
# 3. Block until the results page shows at least one product link
# ------------------------------------------------------------
print('Waiting for search results...')

# Readiness signal: presence of an element carrying target="_blank"
search_wait.until(EC.presence_of_element_located(result_link_locator))

# ------------------------------------------------------------
# 4. Collect pagination links for first 25 pages
# ------------------------------------------------------------
print('Collecting pagination links...')

# Logic:
# - Take the first link of the pagination bar unchanged.
# - Derive the links for pages 2..total_pages from it by replacing the value of
#   its "page" query parameter. Substituting the parameter (instead of cutting
#   off the last character of the URL) stays correct when the page number has
#   more than one digit or is not the final parameter.
total_pages = 25

pagination_anchors = driver.find_elements(By.CSS_SELECTOR, 'nav a')
if not pagination_anchors:
    raise RuntimeError("No pagination links ('nav a') found on the search results page")

first_page = pagination_anchors[0]
first_page_link = first_page.get_attribute('href')
if not first_page_link:
    raise RuntimeError("The first pagination link has no 'href' attribute")

# Matches only the digits that follow '?page=' or '&page='
page_number_pattern = re.compile(r'(?<=[?&]page=)\d+')
if not page_number_pattern.search(first_page_link):
    raise ValueError(f"Cannot find a 'page' query parameter in {first_page_link!r}")

all_pagination_links = [first_page_link] + [
    page_number_pattern.sub(str(page_number), first_page_link, count=1)
    for page_number in range(2, total_pages + 1)
]

print('Pagination Links Count:', len(all_pagination_links))
print('All Pagination Links: ', all_pagination_links)

# ------------------------------------------------------------
# 5. Visit each pagination link and collect product detail page links
# ------------------------------------------------------------
print('Collecting Product Detail Page Links')
all_product_links = []

# Loop through each pagination link
for link in all_pagination_links:
    try:
        driver.get(link)

        page_wait = WebDriverWait(driver, 120)

        # Wait until the document reports that it has finished loading
        page_wait.until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )

        # Wait until at least one element with class 'rPDeLR' is present
        page_wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'rPDeLR'))
        )
    except TimeoutException:
        # A page that never shows products must not abort the whole crawl and
        # throw away the links gathered so far; report it and move on.
        print(f"{link} Skipped (timed out waiting for products) ------>")
        continue

    # Find all product link elements on the page
    all_products = driver.find_elements(By.CLASS_NAME, 'rPDeLR')

    # Extract the 'href' attributes, dropping elements that have none so that
    # empty values never reach the CSV
    all_links = [
        href
        for href in (element.get_attribute('href') for element in all_products)
        if href
    ]

    print(f"{link} Done ------>")

    # Add collected links to the master list
    all_product_links.extend(all_links)

print('All Product Detail Page Links Captured: ', len(all_product_links))

# ------------------------------------------------------------
# 6. Build the de-duplicated link table and write it to CSV
# ------------------------------------------------------------
# One row per collected URL; repeated URLs are dropped so each appears once
df_product_links = (
    pd.DataFrame(all_product_links, columns=['product_links'])
    .drop_duplicates(subset=['product_links'])
)

print('Total Unique Product Detail Page Links', len(df_product_links))

# Write the table to disk without the index column
df_product_links.to_csv('flipkart_product_links.csv', index=False)

# ------------------------------------------------------------
# 7. End the browser session and record session end time
# ------------------------------------------------------------
# quit() is used instead of close(): close() only closes the current window,
# while quit() ends the whole WebDriver session so no driver process is left
# running after the cell finishes.
driver.quit()
session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} ---------------------------> ")

Session Start Time: 15:45:08.686772 ---------------------------> 
Waiting for search input...
Typing in search input...
Submitting search form...
Waiting for search results...
Collecting pagination links...
Pagination Links Count: 25
All Pagination Links:  ['https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=1', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=2', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=3', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=4', 'https://www.flipkart.com/search?q=sports+shoes+for+women&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=5', 'https://www.flipkart.com/search?q=sports+

## Get individual product information

In [4]:
"""
This script reads a CSV containing Flipkart product detail page links,
visits each product page using Selenium WebDriver, extracts product information
(brand, title, price, discount, ratings), handles unavailable products,
removes duplicates, and saves the results to CSV files.
"""


# ------------------------------------------------------------
# 1. Log when the scraping session starts
# ------------------------------------------------------------
session_start_time = datetime.now().time()
print(f"Session Start Time: {session_start_time} ---------------------------> ")


# ------------------------------------------------------------
# 2. Load the product links collected earlier
# ------------------------------------------------------------
# Only the first 10 rows are kept, for demonstration purposes.
# Drop the `.head(10)` call to scrape all products.
df_product_links = pd.read_csv("flipkart_product_links.csv").head(10)

# Plain Python list of the URLs to visit
all_product_links = df_product_links['product_links'].tolist()
print("Collecting Individual Product Detail Information")


# ------------------------------------------------------------
# 3. Launch the Selenium WebDriver
# ------------------------------------------------------------
driver = webdriver.Chrome()

# Result containers and progress counters used by the scraping loop
complete_product_details, unavailable_products = [], []
successful_parsed_urls_count = complete_failed_urls_count = 0

# ------------------------------------------------------------
# 4. Loop through each product page link and scrape details
# ------------------------------------------------------------
def _find_optional_text(driver, class_name):
    """Return the text of the first element with ``class_name``.

    Returns None when the page has no such element. Only the "element not
    found" case is absorbed; any other error propagates to the caller.
    """
    try:
        return driver.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return None


for product_page_link in all_product_links:
    try:
        # Navigate to the product page
        driver.get(product_page_link)

        page_wait = WebDriverWait(driver, 120)

        # Wait until the document reports that it has finished loading
        page_wait.until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )

        # Wait until at least one element carrying target="_blank" is present
        page_wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[target="_blank"]'))
        )

        # ----------------------------------------------------
        # 4a. Check if the product is marked as unavailable
        # ----------------------------------------------------
        # A missing status element yields None, which is treated as available
        product_status = _find_optional_text(driver, 'Z8JjpR')
        if product_status in ('Currently Unavailable', 'Sold Out'):
            unavailable_products.append(product_page_link)
            successful_parsed_urls_count += 1
            print(f"URL {successful_parsed_urls_count} completed --->")
            continue

        # ----------------------------------------------------
        # 4b. Extract product details
        # ----------------------------------------------------

        # Brand (required: a missing element fails this URL via the outer handler)
        brand = driver.find_element(By.CLASS_NAME, 'mEh187').text

        # Title - strip every parenthesized segment (e.g. color info)
        title = driver.find_element(By.CLASS_NAME, 'VU-ZEz').text
        title = re.sub(r'\s*\([^)]*\)', '', title)

        # Price - keep the digits only (stays a string)
        price = driver.find_element(By.CLASS_NAME, 'Nx9bqj').text
        price = ''.join(re.findall(r'\d+', price))

        # Discount - optional field; '' when the element or its digits are missing
        discount_text = _find_optional_text(driver, 'UkUFwK')
        discount_digits = ''.join(re.findall(r'\d+', discount_text or ''))
        discount = int(discount_digits) / 100 if discount_digits else ''

        # Ratings and Reviews - optional fields.
        # Both values are reset for every product so that a page without
        # readable ratings can never inherit the previous product's values.
        avg_rating, total_ratings = '', ''
        product_review_status = _find_optional_text(driver, 'E3XX7J')
        if product_review_status != 'Be the first to Review this product':
            rating_text = _find_optional_text(driver, 'XQDdHH')
            if rating_text is not None:
                ratings_count_text = _find_optional_text(driver, 'Wphh3N')
                if ratings_count_text is not None:
                    try:
                        # First word is the count; drop thousands separators
                        ratings_count = int(ratings_count_text.split(' ')[0].replace(',', ''))
                    except ValueError:
                        # Unparseable count: leave both rating fields empty
                        pass
                    else:
                        avg_rating, total_ratings = rating_text, ratings_count

        # Append the collected data to the results list
        complete_product_details.append([
            product_page_link, title, brand, price, discount, avg_rating, total_ratings
        ])

        # Increment and log success counter
        successful_parsed_urls_count += 1
        print(f"URL {successful_parsed_urls_count} completed *******")

    except Exception as e:
        # Any failure while loading or parsing the page marks the URL as failed;
        # failed URLs are stored together with the unavailable ones.
        print(f"Failed to establish a connection for URL {product_page_link}:  {e}")
        unavailable_products.append(product_page_link)
        complete_failed_urls_count += 1
        print(f"Failed URL Count {complete_failed_urls_count}")


# ------------------------------------------------------------
# 5. Assemble the result tables
# ------------------------------------------------------------
result_columns = ['product_link', 'title', 'brand', 'price', 'discount', 'avg_rating', 'total_ratings']
# Rows that agree on all of these columns count as the same product
duplicate_key_columns = ['brand', 'price', 'discount', 'avg_rating', 'total_ratings']

# Every successfully scraped product, one row each
df = pd.DataFrame(complete_product_details, columns=result_columns)

# Rows repeating an earlier row on the key columns are set aside for inspection
repeated_rows = df.duplicated(subset=duplicate_key_columns)
df_duplicate_products = df[repeated_rows]

# Only the first occurrence of each key combination is kept
df = df.drop_duplicates(subset=duplicate_key_columns)

# Unavailable products and failed URLs share one table
df_unavailable_products = pd.DataFrame(unavailable_products, columns=['link'])


# ------------------------------------------------------------
# 6. Report summary counts
# ------------------------------------------------------------
summary_lines = [
    ("Total product pages scrapped: ", all_product_links),
    ("Final Total Products: ", df),
    ("Total Unavailable Products : ", df_unavailable_products),
    ("Total Duplicate Products: ", df_duplicate_products),
]
for summary_label, counted in summary_lines:
    print(summary_label, len(counted))


# ------------------------------------------------------------
# 7. Write each table to its CSV file (index column omitted)
# ------------------------------------------------------------
csv_outputs = {
    'flipkart_product_data.csv': df,
    'unavailable_products.csv': df_unavailable_products,
    'duplicate_products.csv': df_duplicate_products,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)


# ------------------------------------------------------------
# 8. Close the browser session and log end time
# ------------------------------------------------------------
driver.close()
session_end_time = datetime.now().time()
print(f"Session End Time: {session_end_time} ---------------------------> ")

Session Start Time: 15:57:06.678946 ---------------------------> 
Collecting Individual Product Detail Information
URL 1 completed *******
URL 2 completed *******
URL 3 completed *******
URL 4 completed *******
URL 5 completed *******
URL 6 completed *******
URL 7 completed *******
URL 8 completed *******
URL 9 completed *******
URL 10 completed *******
Total product pages scrapped:  10
Final Total Products:  10
Total Unavailable Products :  0
Total Duplicate Products:  0
Session End Time: 15:57:24.174974 ---------------------------> 
