In [5]:
# Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import csv
import os

In [6]:
def product_on_page(url, keyword, max_pages=1):
    """Search ``url`` for ``keyword`` and collect the listed products.

    A Chrome session is opened, ``keyword`` is typed into the element with
    id ``q`` and submitted, and every element matching ``.Bm3ON`` on each
    result page is read as one product.

    Args:
        url: Address of the page that hosts the search box.
        keyword: Search text to submit.
        max_pages: Maximum number of result pages to scrape. Defaults to 1,
            which matches the previous hard-coded limit.

    Returns:
        A list of dicts with the keys ``"link"``, ``"price"`` and ``"sold"``.
        ``"price"`` and ``"sold"`` are ``"N/A"`` when the element is missing.
        An empty list is returned when a product link cannot be read or when
        moving to a further requested page fails.

    Raises:
        Exception: Whatever Selenium raises while opening the page, locating
            the search box or waiting for the first results. The browser is
            still closed in that case.
    """
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        browser.get(url)
        browser.implicitly_wait(10)

        search_box = browser.find_element(By.ID, "q")
        search_box.send_keys(keyword)
        search_box.submit()

        WebDriverWait(browser, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Bm3ON"))
        )

        def scroll_to_bottom():
            """Scroll down until the document height stops growing."""
            last_height = browser.execute_script("return document.body.scrollHeight")
            while True:
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)  # Give lazily loaded content time to appear.

                new_height = browser.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

        def text_or_na(card, class_name):
            """Return the text of a child element, or "N/A" if it is unavailable."""
            try:
                return card.find_element(By.CLASS_NAME, class_name).text
            except Exception:
                return "N/A"

        products = []
        current_page = 0

        while current_page < max_pages:
            print(f"Load {current_page + 1}...")

            scroll_to_bottom()

            for product in browser.find_elements(By.CSS_SELECTOR, ".Bm3ON"):
                try:
                    link = product.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                except Exception as e:
                    # A card without a readable link aborts the whole scrape.
                    print(f"Error fetching product details: {e}")
                    return []

                products.append({
                    "link": link,
                    "price": text_or_na(product, "ooOxS"),
                    "sold": text_or_na(product, "_1cEkb"),
                })

            current_page += 1
            if current_page >= max_pages:
                # Page limit reached: do not touch the pagination control, so a
                # missing "next" button cannot discard what was just collected.
                break

            try:
                next_button = browser.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
                if "ant-pagination-disabled" in next_button.get_attribute("class"):
                    print("Not next page.")
                    break
                # Click through JavaScript rather than WebElement.click().
                browser.execute_script("arguments[0].click();", next_button)
                time.sleep(1)
            except Exception as e:
                print("Can not continue:", e)
                return []

        return products
    finally:
        # Single exit point for the browser: runs on success, on the early
        # ``return []`` paths and when an exception propagates.
        browser.quit()

In [7]:
url = 'https://www.lazada.vn/'
keyword = ['fpv drone']

# Collect the results of every keyword in one list. Assigning inside the loop
# would keep only the products of the last keyword.
product_links = []
for item in keyword:
    product_links.extend(product_on_page(url, item))

Load 1...


In [8]:
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def get_product_details(url):
    """Open a product page in Chrome and read its headline details.

    Args:
        url: Address of the product page to open.

    Returns:
        A dict with the keys ``"product_name"``, ``"store_name"``,
        ``"rating"`` and ``"comment_count"``. The last three fall back to a
        placeholder string when their element cannot be read. ``None`` is
        returned (after printing the error) when the page cannot be loaded or
        the product title is missing.
    """
    # Browser configuration, including a fixed desktop User-Agent.
    options = Options()
    # Intended to make the automated browser less conspicuous.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36")

    browser = webdriver.Chrome(options=options)

    def text_or(selector, fallback):
        """Return the text of the first match for ``selector``, else ``fallback``."""
        try:
            return browser.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            return fallback

    try:
        # Navigation happens inside the try block so a failed page load is
        # reported like any other error and the browser is still closed.
        browser.get(url)
        browser.implicitly_wait(10)

        # Scroll to the bottom a random number of times; the pause length is
        # drawn once per call and reused for every scroll.
        scroll_pause_time = random.uniform(2, 5)
        max_scrolls = random.randint(3, 6)
        for _ in range(max_scrolls):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

        # The title is mandatory: if it is missing the outer handler returns None.
        product_name = browser.find_element(By.CSS_SELECTOR, "h1.pdp-mod-product-badge-title").text

        store_name = text_or("div.seller-name__detail > a", "Không tìm thấy cửa hàng")
        rating = text_or("span.score-average", "Không có đánh giá")
        comment_count = text_or("a.pdp-link.pdp-review-summary__link", "Không có bình luận")

        # Random pause after each product before the caller moves on.
        time.sleep(random.uniform(5, 10))

        return {
            "product_name": product_name,
            "store_name": store_name,
            "rating": rating,
            "comment_count": comment_count,
        }
    except Exception as e:
        print(f"Error fetching product details: {e}")
        return None
    finally:
        browser.quit()


In [9]:
# Fetch the detail page of every listed product and append one CSV row per
# product as soon as it is available, so finished rows survive a later failure.
OUTPUT_CSV = "../data/raw/product_details.csv"
FIELDNAMES = ["product_name", "store_name", "rating", "comment_count", "price", "sold", "link"]

# Opening a file in append mode does not create missing parent directories.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

for product in product_links:
    print(f"Fetching details for {product['link']}...")
    details = get_product_details(product['link'])
    if not details:
        print(f"Error fetching details for {product['link']}. Stopping the program.")
        break  # Stop at the first product whose details could not be fetched.

    details.update({
        "price": product["price"],
        "sold": product["sold"],
        "link": product["link"],
    })

    # The header is needed only when the file is new or still empty.
    write_header = not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0

    with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        if write_header:
            writer.writeheader()
        writer.writerow(details)
    print(f"Saved details for {product['link']} to product_details.csv")

Fetching details for https://www.lazada.vn/products/drone-mini-gia-re-plycam-e99-pro-cam-ung-chong-va-cham-tu-dong-camera-kep-4k-sieu-suc-net-co-den-led-tich-hop-i2582366125.html...
Saved details for https://www.lazada.vn/products/drone-mini-gia-re-plycam-e99-pro-cam-ung-chong-va-cham-tu-dong-camera-kep-4k-sieu-suc-net-co-den-led-tich-hop-i2582366125.html to product_details.csv
Fetching details for https://www.lazada.vn/products/may-bay-dieu-khien-khong-nguoi-lai-e99-pro-flycam-mini-gia-re-camera-kep-4k-cuc-sac-net-cam-ung-chong-va-cham-thong-minh-nhao-lon-360-do-i2582319102.html...
Saved details for https://www.lazada.vn/products/may-bay-dieu-khien-khong-nguoi-lai-e99-pro-flycam-mini-gia-re-camera-kep-4k-cuc-sac-net-cam-ung-chong-va-cham-thong-minh-nhao-lon-360-do-i2582319102.html to product_details.csv
Fetching details for https://www.lazada.vn/products/flycam-e88-max-uav-camera-kep-goc-rong-4k-ho-tro-truyen-tin-hieu-wifi-5g-may-bay-khong-nguoi-lai-co-the-tu-do-chuyen-do-i2685765286.