#### Use Selenium and BeautifulSoup to crawl data from Lazada

In [None]:
# Install necessary packages
""" %pip install selenium 
%pip install beautifulsoup4
%pip install pandas
%pip install requests """

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import csv

In [None]:
def _scroll_to_bottom(browser, pause=2):
    """Scroll to the bottom of the page until the document height stops growing.

    After each scroll the function sleeps ``pause`` seconds and compares the
    new ``document.body.scrollHeight`` with the previous one; it returns as
    soon as two consecutive measurements are equal.
    """
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def _optional_text(card, class_name):
    """Return the text of the first element under ``card`` with ``class_name``, or None."""
    # Imported locally so this cell does not depend on an extra top-level import.
    from selenium.common.exceptions import NoSuchElementException

    try:
        return card.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return None


def product_on_page(url, keyword, max_pages=1):
    """
    Search Lazada for a keyword and collect link, price and sold count per product.

    The function opens Chrome, loads ``url``, types ``keyword`` into the
    element with id ``q`` and submits it. For each result page it scrolls to
    the bottom, then reads every product card (CSS class ``Bm3ON``). When
    ``max_pages`` is greater than 1 it clicks the pagination "next" button
    between pages and stops early if that button is disabled or missing.

    Parameters:
        url (str): The URL of the Lazada page that contains the search box.
        keyword (str): The text typed into the search box.
        max_pages (int): Maximum number of result pages to read. Defaults to 1.

    Returns:
        list: A list of dictionaries, one per product card that shows a sold count:
            - link (str): The ``href`` of the first anchor in the card.
            - price (str or None): The displayed price, or None if the card has none.
            - sold (str): The displayed sold count. Cards without one are skipped.

    Raises:
        Selenium exceptions raised while loading the page, locating the search
        box or waiting for the first results propagate to the caller. The
        browser is always closed first.
    """
    page_timeout = 120  # seconds allowed for the search box and the first results

    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        browser.get(url)

        browser.implicitly_wait(page_timeout)
        search_box = browser.find_element(By.ID, "q")
        # Turn the implicit wait off again: Selenium's docs warn against mixing
        # implicit and explicit waits, and with it still active every card that
        # lacks a price/sold node would block for the full timeout.
        browser.implicitly_wait(0)

        search_box.send_keys(keyword)
        search_box.submit()

        WebDriverWait(browser, page_timeout).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Bm3ON"))
        )

        products = []
        for page_index in range(max_pages):
            print(f"Load {page_index + 1}...")

            # Scroll once per page, before collecting the cards, so content
            # that only loads on scroll is included in the lookup below.
            _scroll_to_bottom(browser)

            for card in browser.find_elements(By.CSS_SELECTOR, ".Bm3ON"):
                try:
                    link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                    price = _optional_text(card, "ooOxS")
                    sold = _optional_text(card, "_1cEkb")
                except Exception as e:
                    # One broken card must not abort the whole page.
                    print(f"Error fetching product details: {e}")
                    continue

                if sold is not None:
                    products.append({
                        "link": link,
                        "price": price,
                        "sold": sold
                    })

            # Do not click "next" after the last requested page.
            if page_index + 1 == max_pages:
                break

            try:
                next_button = browser.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
                if "ant-pagination-disabled" in next_button.get_attribute("class"):
                    print("Not next page.")
                    break
                browser.execute_script("arguments[0].click();", next_button)
                time.sleep(2)  # give the next result page time to render
            except Exception as e:
                print("Can not continue:", e)
                break

        return products
    finally:
        # Close the browser on success and on every error path.
        browser.quit()

In [None]:
def get_product_details(url):
    """
    Open a Lazada product page and read its name, store, rating and comment count.

    The function opens Chrome, loads ``url``, scrolls to the bottom until the
    page height stops growing, and then reads the fields with CSS selectors.
    The product name is required; the other three fields fall back to a
    placeholder string when their element is not found.

    Parameters:
        url (str): The URL of the Lazada product page.

    Returns:
        dict: A dictionary containing detailed product information:
            - product_name (str): Text of ``h1.pdp-mod-product-badge-title``.
            - store_name (str): The store name, or "Không tìm thấy cửa hàng" if not found.
            - rating (str): The rating, or "Không có đánh giá" if not found.
            - comment_count (str): The review-summary link text, or
              "Không có bình luận" if not found.
        None: Returned if any other error occurs (including a failed page load
            or a missing product name); the error is printed.
    """
    # Imported locally so this cell does not depend on an extra top-level import.
    from selenium.common.exceptions import NoSuchElementException

    required_wait = 120  # seconds allowed for the mandatory product name
    optional_wait = 3    # short grace period for fields that may be absent

    def text_or_default(css_selector, default):
        """Return the element's text, or ``default`` when no element matches."""
        try:
            return browser.find_element(By.CSS_SELECTOR, css_selector).text
        except NoSuchElementException:
            return default

    browser = webdriver.Chrome()
    try:
        # Navigation happens inside the try block so a failed load still closes
        # the browser and yields None, as the docstring promises.
        browser.maximize_window()
        browser.get(url)
        browser.implicitly_wait(required_wait)

        scroll_pause_time = 2
        last_height = browser.execute_script("return document.body.scrollHeight")

        # Keep scrolling until the document height stops changing.
        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        product_name = browser.find_element(By.CSS_SELECTOR, "h1.pdp-mod-product-badge-title").text

        # The page is loaded and fully scrolled now. Shorten the implicit wait
        # so each missing optional element costs seconds, not two minutes.
        browser.implicitly_wait(optional_wait)

        return {
            "product_name": product_name,
            "store_name": text_or_default(
                "div.seller-name__detail > a", "Không tìm thấy cửa hàng"
            ),
            "rating": text_or_default("span.score-average", "Không có đánh giá"),
            "comment_count": text_or_default(
                "a.pdp-link.pdp-review-summary__link", "Không có bình luận"
            ),
        }
    except Exception as e:
        print(f"Error fetching product details: {e}")
        return None
    finally:
        browser.quit()


In [None]:
# Entry URL and the list of search keywords to scrape.
url = 'https://www.lazada.vn/'
# NOTE(review): each keyword is typed verbatim into the search box, so the '+'
# characters are sent literally - confirm this is intended.
keyword = ['sạc+dự+phòng']

# Accumulate the results of every keyword. Reassigning inside the loop would
# keep only the products found for the last keyword.
product_links = []
for item in keyword:
    product_links.extend(product_on_page(url, item))

In [None]:
# Visit every product link, merge the page details with the search-result
# fields, and write the combined rows to product_details.csv.
product_detail = []

for product in product_links:
    print(f"Fetching details for {product['link']}...")
    details = get_product_details(product['link'])
    if not details:
        # get_product_details returns None on failure. Skip the product:
        # a None row would make csv.DictWriter.writerows() raise.
        continue
    details.update({
        "price": product["price"],
        "sold": product["sold"],
        "link": product["link"],
    })
    product_detail.append(details)

if product_detail:
    with open("product_details.csv", "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["product_name", "store_name", "rating", "comment_count","price", "sold", "link"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(product_detail)
    print("Saved product details to product_details.csv") 
else:
    print("No product detail to save") 

In [None]:
# Build a DataFrame from the list of search-result dictionaries.
df = pd.DataFrame(data=product_links)

# Write the DataFrame to a CSV file, without the index column.
df.to_csv(
    "products.csv",
    encoding='utf-8',
    index=False,
)