In [None]:
# imports

# --- standard library ---

# for multithreading (processing URL chunks)
import concurrent.futures

# for recording scrape failures without interrupting the run
import logging

# for creating the output directory
import os

# for random sleep intervals (avoid rate-limiting)
import random

# for parsing the combined ratings/reviews label
import re

# for timing execution and sleep delays
import time

# --- third-party ---

# for loading/saving URL lists and scraped results
import pandas as pd

# for sending HTTP requests to Flipkart pages
import requests

# for parsing HTML content
from bs4 import BeautifulSoup

# for progress bars during scraping
from tqdm import tqdm

In [None]:
# HTTP headers attached to every page request made by the scraper.
HEADERS = {
    # present the request with a desktop Chrome user-agent string
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    ),
    # prefer English responses from the server
    "Accept-Language": "en-US,en;q=0.9",
}

In [3]:
def extract_product_details(soup):
    """Collect the product specification table into one readable string.

    Parameters
    ----------
    soup : parsed page object
        Must support ``find`` / ``find_all`` the way a BeautifulSoup
        document does.

    Returns
    -------
    str or None
        ``"Key: Value; Key: Value; ..."`` built from every specification
        row that has exactly one key cell and one value cell, or ``None``
        when the specification container is absent or yields no pairs.
        If the same key appears twice, the later value wins.
    """
    # container that wraps the whole specification table
    spec_container = soup.find("div", class_="Cnl9Jt")
    if not spec_container:
        return None

    specs = {}
    for spec_row in spec_container.find_all("div", class_="row"):
        # key cell carries class 'col-3-12', value cell carries 'col-9-12'
        cells = spec_row.find_all("div", class_=["col-3-12", "col-9-12"])

        # rows without exactly one key and one value are skipped
        if len(cells) != 2:
            continue

        label = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        specs[label] = value

    if not specs:
        return None

    # flatten the key-value pairs into a single "; "-separated line
    return "; ".join(f"{label}: {value}" for label, value in specs.items())

In [None]:
def _text_or_none(element):
    """Return the whitespace-stripped text of a parsed element, or None if it is missing."""
    return element.get_text(strip=True) if element else None


def _parse_rating_review_counts(text):
    """Split a combined label such as '1,234 Ratings & 56 Reviews' into two counts.

    The separator may be '&' or 'and', and the words 'rating(s)' /
    'review(s)' are removed regardless of letter case. Returns
    ``(rating_counts, review_counts)`` as strings, or ``(None, None)`` when
    the label does not split into exactly two parts.
    """
    parts = re.split(r"&|and", text, flags=re.IGNORECASE)
    if len(parts) != 2:
        return None, None
    rating_counts = re.sub(r"ratings?", "", parts[0], flags=re.IGNORECASE).strip()
    review_counts = re.sub(r"reviews?", "", parts[1], flags=re.IGNORECASE).strip()
    return rating_counts, review_counts


def scrape_product(url):
    """Scrape a single Flipkart product page.

    Parameters
    ----------
    url : str
        Address of the product page to fetch.

    Returns
    -------
    dict or None
        A record with the keys ``url``, ``title``, ``price``, ``discount``,
        ``discounted_price``, ``rating_value``, ``rating_counts``,
        ``review_counts``, ``hierarchy`` and ``description``. Any field that
        cannot be located on the page is ``None``. The function returns
        ``None`` when the response status is not 200 or when any exception
        is raised while fetching or parsing; it never raises, so callers can
        treat ``None`` as "skip this URL".
    """
    try:
        # send GET request with browser-like headers and a 10 second timeout
        res = requests.get(url, headers=HEADERS, timeout=10)

        # if request fails or page not found, return None
        if res.status_code != 200:
            return None

        soup = BeautifulSoup(res.text, "html.parser")

        # TITLE: primary heading first, then the alternative span location.
        # The fallback is used only when the heading element itself is absent.
        title = _text_or_none(soup.find("h1", class_="_6EBuvT"))
        if title is None:
            title = _text_or_none(soup.find("span", class_="yhB1No"))

        # PRICE (MRP)
        price = _text_or_none(soup.find("div", class_="yRaY8j"))

        # DISCOUNT %
        discount = _text_or_none(soup.find("div", class_="UkUFwK"))

        # DISCOUNTED PRICE (selling price)
        discounted_price = _text_or_none(soup.find("div", class_="Nx9bqj"))

        # RATING VALUE
        rating_value = _text_or_none(soup.find("div", class_="XQDdHH"))

        # RATING & REVIEW COUNTS (both come from one combined label)
        rating_counts, review_counts = None, None
        rating_review_text = _text_or_none(soup.find("span", class_="Wphh3N"))
        if rating_review_text:
            rating_counts, review_counts = _parse_rating_review_counts(rating_review_text)

        # DESCRIPTION
        # try custom detail extractor; fallback to a generic text block
        description = extract_product_details(soup)
        if not description:
            description = _text_or_none(soup.find("div", class_="yN+eNk"))

        # HIERARCHY
        hierarchy = None
        hierarchy_section = soup.find("div", class_="DOjaWF")
        if hierarchy_section:
            # breadcrumb links, followed by the last (non-clickable) element
            hierarchy_parts = [
                crumb.get_text(strip=True)
                for crumb in hierarchy_section.find_all("a", class_="R0cyWM")
            ]
            final_text = hierarchy_section.find("div", class_="KalC6f")
            if final_text:
                hierarchy_parts.append(final_text.get_text(strip=True))

            # joining the collected parts avoids a leading " > " when there
            # are no link crumbs; an empty breadcrumb is reported as None
            # like every other missing field
            hierarchy = " > ".join(hierarchy_parts) if hierarchy_parts else None

        # RESULT
        return {
            "url": url,
            "title": title,
            "price": price,
            "discount": discount,
            "discounted_price": discounted_price,
            "rating_value": rating_value,
            "rating_counts": rating_counts,
            "review_counts": review_counts,
            "hierarchy": hierarchy,
            "description": description,
        }

    except Exception:
        # deliberate best-effort: network or parsing problems must not stop
        # the batch, but leave a debug-level trace so they can be diagnosed
        logging.getLogger(__name__).debug("Failed to scrape %s", url, exc_info=True)
        return None

In [None]:
def process_url(url):
    """Scrape one URL, pause briefly, and hand back the scraped record.

    Returns the value produced by ``scrape_product`` when it is truthy.
    Returns ``None`` when scraping produced nothing or when an exception
    was raised along the way. After each scrape that returns (successfully
    or not) the function sleeps for a random 0.5-1.5 seconds so requests
    are spaced out.
    """
    try:
        scraped = scrape_product(url)

        # random pause between requests to avoid hitting the server too often
        time.sleep(random.uniform(0.5, 1.5))
    except Exception:
        # any scraping or network error means "no result" for this URL
        return None

    # empty / falsy results are normalised to None
    return scraped if scraped else None

In [6]:
def process_chunk(url_chunk):
    """Scrape every URL in a chunk and return the successful records.

    Parameters
    ----------
    url_chunk : iterable
        URLs to scrape, processed one after another in the calling thread.

    Returns
    -------
    list
        The truthy results of ``process_url`` in input order. URLs that
        produce no data, or whose processing raises, are left out.
    """
    collected = []

    for link in url_chunk:
        try:
            record = process_url(link)
        except Exception:
            # skip URLs that error out (timeouts, parsing failures)
            continue

        # keep only URLs that actually produced data
        if record:
            collected.append(record)

    return collected

In [None]:
# --- run configuration ---

# input CSV of URLs for this part, and the part label used in the output
# file name and progress messages (keep the two in sync when switching parts)
file = './data/new-sample/urls_part_2.csv'
file_index = '2'

# directory that receives the scraped CSV
OUTPUT_DIR = "./data/sample-scraped"

# number of URLs handed to a single task, and size of the thread pool
CHUNK_SIZE = 10
MAX_WORKERS = 100

In [None]:
# Driver cell: scrape every URL of the configured part in a thread pool and
# write the collected records to one CSV, with periodic checkpoints.

# load URLs for this part; rows without a URL are dropped up front because
# they could never be fetched
df = pd.read_csv(file)
urls = df['url'].dropna().tolist()

results = []
start = time.time()

# split URL list into smaller chunks for parallel processing
chunks = [urls[i:i + CHUNK_SIZE] for i in range(0, len(urls), CHUNK_SIZE)]

# create the output directory before scraping starts, so a missing folder
# cannot make the first save fail after work has already been done
os.makedirs(OUTPUT_DIR, exist_ok=True)
out_file = f"{OUTPUT_DIR}/flipkart_products_scraped_part_{file_index}.csv"

# number of records contained in the most recent checkpoint
saved_count = 0

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

    # submit each chunk as a separate task to the thread pool
    futures = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}

    # iterate over completed tasks
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):

        try:
            # collect chunk result
            data = future.result()
        except Exception:
            # a failed chunk is skipped; the remaining chunks still run
            continue

        if data:
            # append scraped records
            results.extend(data)

        # periodic saving: checkpoint once at least CHUNK_SIZE new records
        # have arrived since the last save. Comparing against the last saved
        # count (rather than testing for an exact multiple of CHUNK_SIZE)
        # keeps checkpoints coming even when some URLs in a chunk fail.
        if len(results) - saved_count >= CHUNK_SIZE:
            pd.DataFrame(results).to_csv(out_file, index=False)
            saved_count = len(results)
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f"Saved {len(results)} records for part {file_index} so far...")

end = time.time()

# final save for entire part
pd.DataFrame(results).to_csv(out_file, index=False)

print(f"\nCompleted Part {file_index}: Scraped {len(results)} products")
print(f"Time: {round((end - start) / 60, 2)} minutes\n")