In [None]:
import csv
import gzip
import io
import os
import xml.etree.ElementTree as ET

import requests

In [None]:
# XML namespace shared by sitemap index and urlset documents (sitemaps.org protocol 0.9)
_SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}


def _gunzip_if_needed(payload: bytes) -> bytes:
    """Return `payload` decompressed if it is a gzip stream, otherwise unchanged."""
    # requests transparently decodes bodies sent with `Content-Encoding: gzip`,
    # so the content of a `.xml.gz` URL may already be plain XML. Only gunzip
    # when the gzip magic number is actually present.
    if payload[:2] == b"\x1f\x8b":
        return gzip.decompress(payload)
    return payload


def _collect_locs(xml_bytes: bytes, entry_tag: str) -> list:
    """Return the stripped, non-empty <loc> text of every <entry_tag> child of the XML root."""
    root = ET.fromstring(xml_bytes)
    locs = []
    for entry in root.findall(f"sm:{entry_tag}", _SITEMAP_NS):
        # findtext yields "" for a missing or empty <loc>, so .strip() is always safe;
        # stripping also removes the whitespace that pretty-printed sitemaps put around URLs
        loc_text = entry.findtext("sm:loc", default="", namespaces=_SITEMAP_NS).strip()
        if loc_text:
            locs.append(loc_text)
    return locs


def extract_flipkart_urls(index_url: str, output_path: str) -> None:
    """
    Extract all product URLs from a given Flipkart sitemap index XML and save them as CSV.

    The index is downloaded and parsed for <sitemap><loc> entries; each referenced
    sitemap is downloaded, gunzipped when necessary, and parsed for <url><loc> entries.
    Progress and failures are reported with print(). A sitemap that fails to download
    or parse is skipped; a failure on the index itself aborts without writing any file.

    Args:
        index_url (str): Flipkart sitemap index URL (e.g. 'https://www.flipkart.com/sitemap_p_product_index_1.xml')
        output_path (str): Path to save the extracted URLs (e.g. './data/urls_index_1.csv').
            Missing parent directories are created.

    Returns:
        None. The result is the CSV file at `output_path`: a "URL" header row followed
        by one extracted URL per row.
    """
    print(f"Processing sitemap index: {index_url}")

    all_product_urls = []

    # one Session for the whole run so the connection to the host is reused
    with requests.Session() as session:
        try:
            index_resp = session.get(index_url, timeout=15)
            index_resp.raise_for_status()
            sitemap_urls = _collect_locs(index_resp.content, "sitemap")
        except Exception as e:
            # without the index there is nothing to crawl; report and give up
            print(f"Failed to process index file {index_url}: {e}")
            return

        for gz_url in sitemap_urls:
            print(f"Found sitemap: {gz_url}")
            try:
                gz_resp = session.get(gz_url, timeout=30)
                gz_resp.raise_for_status()
                xml_content = _gunzip_if_needed(gz_resp.content)
                all_product_urls.extend(_collect_locs(xml_content, "url"))
            except Exception as inner_e:
                # deliberate best-effort: one bad sitemap must not discard the others
                print(f"Failed to parse {gz_url}: {inner_e}")
                continue

    print(f"\nTotal URLs extracted: {len(all_product_urls)}")

    # create the target directory first so a missing folder cannot throw away
    # the results of the whole crawl when the file is opened
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        writer.writerows([url] for url in all_product_urls)

    print(f"Saved URLs to {output_path}")

In [None]:
# Flipkart product sitemap indexes are numbered 1 through 32; crawl each one
# and store its URLs in a matching CSV under ./data
for index_number in range(1, 33):
    sitemap_index_url = f"https://www.flipkart.com/sitemap_p_product_index_{index_number}.xml"
    csv_destination = f"./data/flipkart_urls_index_{index_number}.csv"
    extract_flipkart_urls(index_url=sitemap_index_url, output_path=csv_destination)