In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
from tqdm import tqdm
import re
import configparser
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Load scraper settings from 'config.ini' (resolved against the current working directory).
# RawConfigParser is used, so option values are returned without '%' interpolation.
cofig = configparser.RawConfigParser()

# read() silently skips files it cannot open and returns the list of files it
# actually parsed; check it so a missing config fails with a clear message
# instead of an opaque KeyError on the section lookups below.
if not cofig.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory")

url = cofig['scrap_url']['url']  # page opened by the scraper
product_table_path = cofig['intermediate_path']['product_table']  # where the product table CSV is written

In [3]:
def open_browser(path):
    '''
    Create a Chrome driver, open the page and dismiss the "Ask Me Later" popup.

    Parameter:
    - path: url of the website

    Return:
    - driver: Chrome driver with the page loaded and the popup dismissed

    Raises:
    - Any exception raised while loading the page or dismissing the popup is
      re-raised after the browser has been closed, so a failed start does not
      leave an orphaned Chrome process behind.
    '''
    driver = webdriver.Chrome()
    try:
        driver.get(path)

        # Wait up to 10 seconds for the popup button instead of always sleeping
        # for the full 10 seconds; the wait returns as soon as it is clickable.
        ask_later_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[text() ="Ask Me Later"]'))
        )
        ask_later_button.click()
    except Exception:
        driver.quit()  # do not leak the browser when start-up fails
        raise  # bare raise keeps the original traceback

    print('Driver created successfully')
    return driver
    
def webpage_scroll(driver, scroll_value=1000):
    '''
    Scroll the web page by sending PAGE_DOWN key presses to its body.

    After every key press the function pauses for 3 seconds and then clicks
    the "Show More Products" button if one is found and displayed.

    Parameters:
    - driver: Chrome driver with the page already open
    - scroll_value: number of times to scroll the web page

    Return:
    - driver: the same Chrome driver after scrolling

    Raises:
    - Errors from locating the body or sending keys propagate unchanged.
      Errors from the "Show More Products" handling are ignored.
    '''
    for _ in tqdm(range(scroll_value), desc='Scrolling Products'):
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
        time.sleep(3)  # Allow images to load

        try:
            show_more_button = driver.find_element(By.XPATH, '//button[text()="Show More Products"]')
            if show_more_button.is_displayed():
                show_more_button.click()
        except Exception:
            # Best effort: the button is usually absent, so keep scrolling.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop this long-running loop.
            pass

    return driver
    
def extract_product_href(product_element):
    '''
    Extract the product name and link from a product anchor element.

    Parameter:
    - product_element: element exposing get_attribute(); its 'title' and
      'href' attributes are read

    Return:
    - (product_name, product_href): product_name is the 'title' attribute.
      product_href is the 'href' attribute, prefixed with 'https:' when it is
      protocol-relative (starts with '//'). It is 'No Link Found' when the
      attribute is missing, empty, or reading it raised an error.
    '''
    product_name = product_element.get_attribute('title')  # Extract product title

    try:
        product_href = product_element.get_attribute('href')  # Extract href
    except Exception:
        # Best effort: treat an unreadable href as a missing link.
        product_href = None

    # A missing attribute yields None (or an empty string); map both to the
    # sentinel so callers that filter on 'No Link Found' actually drop them.
    if not product_href:
        return product_name, 'No Link Found'

    # Turn a protocol-relative URL into an absolute one
    if product_href.startswith('//'):
        product_href = 'https:' + product_href

    return product_name, product_href

def collect_data(driver):
    '''
    Extract (name, link) pairs for the product cards currently on the page.

    The function presses PAGE_DOWN once, waits up to 10 seconds for elements
    with class "ProductModule__aTag" to be present, and runs
    extract_product_href on each of them in a thread pool. It does not scroll
    the page any further; scrolling is expected to have happened beforehand.

    Parameter:
    - driver: Chrome driver with the product page open. The driver is ALWAYS
      quit before this function returns, on success and on failure.

    Return:
    - list of unique (product_name, product_href) tuples in page order, with
      entries lacking a link removed. If an error occurs it is printed and
      whatever was collected so far (possibly an empty list) is returned.
    '''
    # A dict is used as an ordered set: it removes duplicates while keeping
    # the order in which products appear on the page, so repeated runs give
    # a stable row order.
    unique_products = {}
    wait = WebDriverWait(driver, 10)

    try:
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

        # until() returns the located elements, so no second lookup is needed.
        product_cards = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "ProductModule__aTag"))
        )

        # Extract product data in parallel. max_workers must be at least 1.
        # NOTE(review): all workers share one WebDriver session -- confirm the
        # driver tolerates concurrent commands and that this is actually faster.
        worker_count = max(1, min(10, len(product_cards)))
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            results = list(tqdm(
                executor.map(extract_product_href, product_cards),
                desc="Extracting Product Links",
                total=len(product_cards),
                unit="product",
                leave=False
            ))

        # Drop cards without a usable link (sentinel, None or empty string).
        for name, link in results:
            if link and link != 'No Link Found':
                unique_products[(name, link)] = None

        final_product_data = list(unique_products)
        print(f'\nTotal Products Extracted: {len(final_product_data)}')
        return final_product_data

    except Exception as e:
        print(f"Error occurred: {e}")
        return list(unique_products)

    finally:
        driver.quit()


In [4]:
# Number of PAGE_DOWN presses sent while loading products. The recorded run
# with this value took roughly 72 minutes, so lower it for a quick trial.
SCROLL_COUNT = 150

driver = open_browser(path=url)
driver = webpage_scroll(driver=driver, scroll_value=SCROLL_COUNT)

# collect_data() quits the browser in its `finally` block, so `driver` must not
# be reused after this call.
product_data = collect_data(driver)

Driver created successfully


Scrolling Products: 100%|██████████| 150/150 [1:12:02<00:00, 28.81s/it]
                                                                                    


Total Products Extracted: 6036


In [13]:
scrap_data = pd.DataFrame(product_data, columns=['product_name', 'product_url'])

# extract_product_href() marks a missing link with 'No Link Found'. The previous
# comparison against 'No Image Found' could never match, so it filtered nothing.
# Missing (None/NaN) URLs are dropped as well.
has_link = scrap_data['product_url'].notna() & (scrap_data['product_url'] != 'No Link Found')
image_data = scrap_data[has_link]
print(f'Total Products: {len(image_data)}')

Total Products: 6036


In [6]:
scrap_data = pd.DataFrame(product_data, columns=['product_name', 'product_url'])

# extract_product_href() marks a missing link with 'No Link Found'. The previous
# comparison against 'No Image Found' could never match. Missing (None/NaN)
# URLs are dropped as well.
has_link = scrap_data['product_url'].notna() & (scrap_data['product_url'] != 'No Link Found')

pattern = r'mp\d+'

# .assign() builds a new frame instead of writing a column into a filtered
# slice, which avoids SettingWithCopyWarning. str.extract() takes the first
# match of the pattern, like re.findall(...)[0], but yields NaN instead of
# raising IndexError when a URL contains no product id.
image_data = scrap_data[has_link].assign(
    product_id=lambda frame: frame['product_url'].str.extract(f'({pattern})', expand=False)
)

In [None]:

# Write the product table as CSV to the path read from config.ini; the
# DataFrame index is not written as a column.
image_data.to_csv(path_or_buf=product_table_path, index=False)