In [17]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import time
from datetime import date, datetime
import random
import decimal
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys



In [None]:
# Utility Functions
def configure_browser_options():
    """Build the ChromeOptions object shared by every Selenium session here.

    Returns:
        ChromeOptions carrying the command-line flags listed below plus the
        ``excludeSwitches`` experimental option.
    """
    chrome_opts = ChromeOptions()
    # Flags are applied in this exact order.
    for flag in (
        "--headless",
        "--incognito",
        "start-maximized",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_opts.add_argument(flag)
    chrome_opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    return chrome_opts

def delete_cache(driver):
    """Clear the browser's HTTP cache for the given Selenium driver.

    The cache is cleared with the DevTools ``Network.clearBrowserCache``
    command, which needs neither a helper tab nor any UI interaction, so the
    driver stays focused on whatever window it was already using.

    Args:
        driver: Selenium driver exposing ``execute_cdp_cmd`` (Chromium-based).

    Returns:
        None.
    """
    # The previous version opened a scratch tab and typed a TAB/DOWN/ENTER
    # sequence written for the chrome://settings/clearBrowserData dialog. That
    # page was never loaded, so the keystrokes hit a blank tab, and a failure
    # part-way through left the driver focused on the scratch tab.
    driver.execute_cdp_cmd("Network.clearBrowserCache", {})

def perform_actions(driver, keys):
    """Queue a key sequence on an ActionChains for ``driver`` and send it.

    Args:
        driver: Selenium driver the ActionChains is bound to.
        keys: Key sequence handed to ``ActionChains.send_keys``.
    """
    # send_keys only queues the keys; nothing is dispatched until perform().
    queued = ActionChains(driver).send_keys(keys)
    time.sleep(0.1)
    print('Performing Actions!')
    queued.perform()

def extract_product_data(product_listings, base_url, page):
    """Extract product data from listings and return DataFrame.

    Args:
        product_listings: Iterable of parsed listing elements; each must
            expose ``find(tag, class_=...)`` (e.g. BeautifulSoup tags).
        base_url: Listing URL the elements came from; written to the
            ``listing URL`` column of every row.
        page: Page number the elements came from; written to ``page``.

    Returns:
        DataFrame with columns ``Title``, ``Rating``, ``Price``,
        ``Product URL``, ``page`` and ``listing URL``, one row per distinct
        product URL (first occurrence kept). When nothing can be extracted the
        frame is empty but still carries those columns.
    """
    columns = ["Title", "Rating", "Price", "Product URL", "page", "listing URL"]
    records = []
    for product in product_listings:
        try:
            title = product.find("div", class_="KzDlHZ").text.strip()
            href = product.find("a", class_="CGtC98").get('href')
            # Rating and price are optional; look each one up only once.
            rating_tag = product.find("span", class_="Wphh3N")
            price_tag = product.find("div", class_="Nx9bqj _4b5DiR")
            records.append({
                "Title": title,
                "Rating": rating_tag.get_text(strip=True) if rating_tag else '',
                "Price": price_tag.text.strip() if price_tag else '',
                "Product URL": "https://www.flipkart.com" + href,
                "page": page,
            })
        except (AttributeError, TypeError):
            # A missing title/link tag (find() returned None) or a link without
            # an href: skip this listing, but let anything else propagate.
            continue

    if not records:
        # Previously this path raised KeyError('Product URL') because the
        # de-duplication step indexed a column that did not exist.
        return pd.DataFrame(columns=columns)

    listings = pd.DataFrame(records).drop_duplicates(subset="Product URL")
    return listings.assign(**{"listing URL": base_url})

def process_base_url(base_url, total_pages):
    """Scrape ``total_pages`` listing pages of one base URL.

    For every page number from 1 to ``total_pages`` a new Chrome session
    (configured by ``configure_browser_options``) loads the paged URL, the
    rendered HTML is parsed and the product cards are handed to
    ``extract_product_data``.

    Args:
        base_url: Listing URL without the page parameter.
        total_pages: Number of pages to fetch, starting at page 1.

    Returns:
        DataFrame with the rows of all pages concatenated under a fresh index;
        an empty DataFrame when ``total_pages`` is less than 1.
    """
    # Append the page parameter correctly whether or not a query string exists.
    separator = "&" if "?" in base_url else "?"
    page_frames = []
    for page in range(1, total_pages + 1):
        url = base_url + separator + "page=" + str(page)
        print(url)
        # The old extra requests.get(url) was removed: its response was never
        # used and it had no timeout.
        browser = Chrome(options=configure_browser_options())
        try:
            # Load the paged URL; loading base_url here re-scraped page 1 on
            # every iteration.
            browser.get(url)
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            delete_cache(browser)
        finally:
            # Always release the Chrome process, even when the page load fails.
            browser.quit()
        time.sleep(random.randrange(5, 10) / 100)  # 0.05-0.09 s pause between pages
        # Positional selector for the product cards; presumably tied to the
        # current Flipkart layout - verify if it starts returning nothing.
        product_listings = soup.select('div#container > div:first-child > div:nth-child(3) > div:first-child > div:nth-child(2) div[data-id]')
        page_frames.append(extract_product_data(product_listings, base_url, page))

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)

def scrape_product_details(product_url):
    """Scrape detailed product information from a product URL.

    Args:
        product_url: URL of the product page to load.

    Returns:
        DataFrame whose first row holds the product-level fields
        (``Product URL``, ``Retailer``, ``Scraping Date``, ``Scraping Time``,
        ``Product Title``). When specification rows are found, one extra row
        per specification follows, with ``Attribute_Name`` /
        ``Attribute_Value`` filled and the product-level fields forward-filled.
        An empty DataFrame is returned when scraping fails; the failure is
        printed instead of being silently discarded.
    """
    product_df = pd.DataFrame()
    browser = None
    try:
        browser = Chrome(options=configure_browser_options())
        browser.get(product_url)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        delete_cache(browser)
        time.sleep(random.randrange(5, 10) / 100)  # 0.05-0.09 s pause

        # Extract product details
        title_tag = soup.find("span", class_="VU-ZEz")
        product_data = {
            "Product URL": [product_url],
            "Retailer": ['Flipkart'],
            "Scraping Date": [date.today()],
            "Scraping Time": [datetime.now()],
            "Product Title": [title_tag.get_text(strip=True) if title_tag else ''],
        }

        # Extract price, MRP, discount, ratings, etc. (similar blocks)

        # Extract specifications
        spec_rows = []
        for section in soup.find_all('div', class_='_3Fm-hO'):
            for tr in section.find_all('tr'):
                cells = [td.text for td in tr.find_all('td')]
                # Keep only name/value pairs; a row with another cell count
                # used to raise ValueError and discard the whole product.
                if len(cells) == 2:
                    spec_rows.append(cells)

        product_df = pd.DataFrame(product_data)
        if spec_rows:
            product_spec_df = pd.DataFrame(spec_rows, columns=['Attribute_Name', 'Attribute_Value'])
            product_df = pd.concat([product_df, product_spec_df], axis=0)
            # Copy the product-level fields down onto every specification row.
            cols = product_df.columns.drop(['Attribute_Name', 'Attribute_Value'])
            product_df[cols] = product_df[cols].ffill()
    except Exception as exc:
        # Best effort: one bad page must not stop the batch, but report it.
        print(f"Failed to scrape {product_url}: {exc!r}")
    finally:
        # Release the Chrome process on success and on failure alike.
        if browser is not None:
            browser.quit()
    return product_df

In [8]:
# Execution Logic for Testing
total_pages = 1  # number of listing pages passed to process_base_url per URL

# Flipkart listing URLs and the category label that belongs to each of them
flipkart_urls = [
    'http://www.flipkart.com/air-conditioners/pr?sid=j9e,abm,c54',
    'http://www.flipkart.com/air-purifiers/pr?sid=j9e,abm,3o4',
]
categories = ['AC', 'Air Purifier']

# Pair the two lists position by position into a URL/Category table.
flipkart_listing_urls = pd.DataFrame(
    list(zip(flipkart_urls, categories)),
    columns=['URL', 'Category'],
)
flipkart_listing_urls

Unnamed: 0,URL,Category
0,http://www.flipkart.com/air-conditioners/pr?si...,AC
1,http://www.flipkart.com/air-purifiers/pr?sid=j...,Air Purifier


In [19]:
# Scrape product listings
# Collect one frame per listing URL and concatenate once at the end; the
# leading empty frame keeps the result a DataFrame when there are no URLs.
listing_frames = [pd.DataFrame()]
for base_url in flipkart_listing_urls['URL'].unique().tolist():
    all_product_urls = process_base_url(base_url, total_pages)
    listing_frames.append(all_product_urls)
final_collated_urls = pd.concat(listing_frames)

final_collated_urls.head()

http://www.flipkart.com/air-conditioners/pr?sid=j9e,abm,c54&page=1
Performing Actions!
http://www.flipkart.com/air-purifiers/pr?sid=j9e,abm,3o4&page=1
Performing Actions!


Unnamed: 0,Title,Rating,Price,Product URL,page,listing URL
0,Panasonic 2025 Model 1.5 Ton 3 Star Split Inve...,"26,121 Ratings&2,623 Reviews","₹37,490",https://www.flipkart.com/panasonic-2025-model-...,1,http://www.flipkart.com/air-conditioners/pr?si...
1,MarQ by Flipkart 2025 1 Ton 3 Star Split Inver...,563 Ratings&61 Reviews,"₹23,990",https://www.flipkart.com/marq-flipkart-2025-1-...,1,http://www.flipkart.com/air-conditioners/pr?si...
2,MarQ by Flipkart 2025 0.7 Ton 3 Star Split Inv...,563 Ratings&61 Reviews,"₹19,990",https://www.flipkart.com/marq-flipkart-2025-0-...,1,http://www.flipkart.com/air-conditioners/pr?si...
3,LG 2025 Mode AI Convertible 6-in-1 1.5 Ton 3 S...,"41,857 Ratings&3,701 Reviews","₹37,690",https://www.flipkart.com/lg-2025-mode-ai-conve...,1,http://www.flipkart.com/air-conditioners/pr?si...
4,Voltas 1 Ton 5 Star Split Inverter AC - White,4 Ratings&1 Reviews,"₹35,990",https://www.flipkart.com/voltas-1-ton-5-star-s...,1,http://www.flipkart.com/air-conditioners/pr?si...


In [None]:
# Scrape product details
product_url_list = final_collated_urls['Product URL'].unique().tolist()

# One frame per product page, concatenated once after the loop; the leading
# empty frame keeps the result a DataFrame when there are no product URLs.
detail_frames = [pd.DataFrame()]
for product_url in product_url_list:
    product_df = scrape_product_details(product_url)
    detail_frames.append(product_df)
final_scrapped_df = pd.concat(detail_frames)

final_scrapped_df.head()