#### Use Selenium and BeautifulSoup to crawl data from Lazada

In [4]:
# Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import csv
import os

In [5]:
def _scroll_to_bottom(browser, pause=2):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        browser: Selenium WebDriver whose current page is scrolled.
        pause: Seconds to sleep after each scroll before re-measuring
            ``document.body.scrollHeight``.
    """
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def _optional_text(parent, class_name):
    """Return the text of the first descendant with *class_name*, or None.

    Any lookup failure is treated as "field not present" (best effort).
    """
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text
    except Exception:
        return None


def product_on_page(url, keyword, max_pages=1):
    """Search *keyword* on the site at *url* and collect result cards.

    Opens a Chrome session, types *keyword* into the element with id ``q``,
    submits the search and then reads every ``.Bm3ON`` element on each
    result page.

    Args:
        url: Page to open; it must contain the search box with id ``q``.
        keyword: Text typed into the search box.
        max_pages: Maximum number of result pages to read (default 1, which
            matches the previously hard-coded limit).

    Returns:
        A list of dicts with keys ``"link"``, ``"price"`` and ``"sold"``.
        Cards for which no ``_1cEkb`` ("sold") element is found are skipped;
        ``"price"`` is None when no ``ooOxS`` element is found.

    Raises:
        Exceptions from opening the page, locating the search box or the
        initial wait for results propagate to the caller. The browser is
        closed in every case.
    """
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        browser.get(url)
        # NOTE(review): with a 120 s implicit wait, every lookup of a missing
        # optional element (price / sold) may block for up to that long.
        browser.implicitly_wait(120)

        search_box = browser.find_element(By.ID, "q")
        search_box.send_keys(keyword)
        search_box.submit()

        WebDriverWait(browser, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Bm3ON"))
        )

        products = []
        current_page = 0

        while current_page < max_pages:
            print(f"Load {current_page + 1}...")

            # Scroll once per page (not once per product) so lazily loaded
            # content is present before the cards are collected.
            try:
                _scroll_to_bottom(browser)
            except Exception as e:
                print(f"Error scrolling page: {e}")

            product_elements = browser.find_elements(By.CSS_SELECTOR, ".Bm3ON")

            for product in product_elements:
                try:
                    link_element = product.find_element(By.CSS_SELECTOR, "a")
                    link = link_element.get_attribute("href")
                    price = _optional_text(product, "ooOxS")
                    sold = _optional_text(product, "_1cEkb")

                    if sold is not None:
                        products.append({
                            "link": link,
                            "price": price,
                            "sold": sold,
                        })
                except Exception as e:
                    print(f"Error fetching product details: {e}")

            current_page += 1
            if current_page >= max_pages:
                # Page budget exhausted: no need to navigate any further.
                break

            try:
                next_button = browser.find_element(
                    By.CSS_SELECTOR, "li.ant-pagination-next"
                )
                if "ant-pagination-disabled" in (next_button.get_attribute("class") or ""):
                    print("Not next page.")
                    break
                # Click through JavaScript so an overlapping element cannot
                # intercept the click.
                browser.execute_script("arguments[0].click();", next_button)
                time.sleep(2)
            except Exception as e:
                print("Can not continue:", e)
                break

        return products
    finally:
        # Always release the Chrome process, including on early failures.
        browser.quit()

In [6]:
def get_product_details(url):
    """Open a product page and read its name, store, rating and review count.

    Args:
        url: Address of the product page to open in a new Chrome session.

    Returns:
        A dict with keys ``"product_name"``, ``"store_name"``, ``"rating"``
        and ``"comment_count"``. The last three fall back to a placeholder
        string when their element cannot be found. Returns None (after
        printing the error) when the page cannot be loaded or the product
        title is missing.
    """
    # (result key, CSS selector, placeholder used when the element is absent)
    optional_fields = (
        ("store_name", "div.seller-name__detail > a", "Không tìm thấy cửa hàng"),
        ("rating", "span.score-average", "Không có đánh giá"),
        ("comment_count", "a.pdp-link.pdp-review-summary__link", "Không có bình luận"),
    )

    browser = webdriver.Chrome()
    try:
        # Navigation happens inside the try so that a failed page load still
        # reaches the finally clause and closes the browser.
        browser.maximize_window()
        browser.get(url)
        # NOTE(review): with a 120 s implicit wait, each missing optional
        # element below may block for up to that long.
        browser.implicitly_wait(120)

        # Scroll until the page height stops growing so lazily loaded
        # sections are rendered before they are queried.
        scroll_pause_time = 2
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # The title is mandatory: a failure here aborts the whole lookup.
        details = {
            "product_name": browser.find_element(
                By.CSS_SELECTOR, "h1.pdp-mod-product-badge-title"
            ).text,
        }

        for key, selector, fallback in optional_fields:
            try:
                details[key] = browser.find_element(By.CSS_SELECTOR, selector).text
            except Exception:
                details[key] = fallback

        return details
    except Exception as e:
        print(f"Error fetching product details: {e}")
        return None
    finally:
        browser.quit()


In [7]:
# Run the search once per keyword and collect every result.
url = 'https://www.lazada.vn/'
keyword = ['máy+fax']

# Accumulate across keywords; reassigning inside the loop would keep only the
# results of the last keyword.
product_links = []
for item in keyword:
    product_links.extend(product_on_page(url, item))

Load 1...


In [8]:
# Fetch the detail page of every search result and append the merged rows to
# the raw CSV file.
csv_path = "../data/raw/product_details.csv"
fieldnames = ["product_name", "store_name", "rating", "comment_count", "price", "sold", "link"]

product_detail = []

for product in product_links:
    print(f"Fetching details for {product['link']}...")
    details = get_product_details(product['link'])
    if not details:
        # get_product_details returns None on failure; a None row would make
        # DictWriter.writerows fail, so skip it.
        continue
    details.update({
        "price": product["price"],
        "sold": product["sold"],
        "link": product["link"],
    })
    product_detail.append(details)

if product_detail:
    # Make sure the target directory exists before opening the file.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    # Write the header for a new file and also for an existing empty one.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerows(product_detail)
    print("Saved product details to product_details.csv")
else:
    print("No product detail to save")

Fetching details for https://www.lazada.vn/products/may-in-phun-mau-canon-mx-397-in-scan-copy-fax-i2497946073.html...
Fetching details for https://www.lazada.vn/products/mktel-m14-tinh-nang-dien-thoai-voi-4-sim-cho-man-hinh-24-pin-1100mah-mp3-mp4-dai-fm-cao-cap-dien-thoai-lat-i2894553843.html...
Fetching details for https://www.lazada.vn/products/may-in-laser-da-nang-co-fax-brother-mfc-l2701d-i886842165.html...
Fetching details for https://www.lazada.vn/products/panasonic-kx-ft983-may-fax-nhiet-doi-chot-i1263979713.html...
Fetching details for https://www.lazada.vn/products/hcm-may-in-da-chuc-nang-laser-hp-laserjet-pro-m1212nf-mfp-in-mang-scan-photo-copy-fax-i2616263806.html...
Fetching details for https://www.lazada.vn/products/hcmfilm-fax-kx-fa-57e-cho-may-panasonic-kx-fp-711-i2114061.html...
Saved product details to product_details.csv
