In [15]:
# selenium 4
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd

chrome_options = Options()
chrome_options.add_argument("--headless")
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

products_list = []

# Start from the first page
driver.get("https://aldawaaegy.com/collections/korean-products")
page_count = 1

while True:
    print(f"Scraping page: {page_count}")
    try:
        # Wait for the products to be loaded on the page
        wait = WebDriverWait(driver, 20) # Wait for up to 20 seconds
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='product-info__inner']")))

        products = driver.find_elements(By.XPATH,"//div[@class='product-info__inner']")

        for product in products:
            try:
                provider = product.find_element(By.XPATH,".//div[@class='product-brand']/a")
                product_name = product.find_element(By.XPATH, ".//h3[@class='product-title pr fs__14 mg__0 fwm']/a")
                price = product.find_element(By.XPATH, ".//span")
                products_list.append({"Product Name": product_name.text , "Provider": provider.text, "price":price.text})
            except NoSuchElementException:
                # Handle cases where a product card might be missing some info
                print("A product was missing some information, skipping.")
                continue

        # Look for the "next" page button and click it
        next_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//div[@class='products-footer tc mt__40 mb__60']/a"))
        )
        driver.execute_script("arguments[0].click();", next_button)
        page_count += 1

    except TimeoutException:
        print("No more pages to load or page took too long to load.")
        break
    except NoSuchElementException:
        print("Reached the last page.")
        break

driver.quit()
df = pd.DataFrame(products_list)



Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
No more pages to load or page took too long to load.


In [16]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Product Name  264 non-null    str  
 1   Provider      264 non-null    str  
 2   price         264 non-null    str  
dtypes: str(3)
memory usage: 6.3 KB


In [19]:
# Convert to numeric, coercing errors to NaN
df['price'] = df['price'].str.replace(',', '').astype(float)


In [None]:
df.sort_values(by=["price"], ascending=False)

Unnamed: 0,Product Name,Provider,price
245,COSRX HYAIURONIC ACID +NAG SERUM 150 ML,COSRX,1735.0
185,COSRX HYAIURONIC ACID +NAG SERUM 150 ML,COSRX,1735.0
72,DR.ALTHEA 345 RELIEF CREAM 50ML,DR.ALTHEA,1667.0
108,DR.ALTHEA 345 RELIEF CREAM 50ML,DR.ALTHEA,1667.0
216,DR.ALTHEA 345 RELIEF CREAM 50ML,DR.ALTHEA,1667.0
...,...,...,...
57,KOREAN LIP BLAM ROSE 7 GM,WNP'L,84.0
44,KOREAN LIP BLAM SAKURA 7 GM,WNP'L,84.0
68,KOREAN LIP BLAM SAKURA 7 GM,WNP'L,84.0
21,KOREAN LIP BLAM ROSE 7 GM,WNP'L,84.0
