#### Use selenium and beautifulsoup to crawl data from lazada

In [9]:
# Install necesary packages
""" %pip install selenium 
%pip install beautifulsoup4
%pip install pandas
%pip install requests """

' %pip install selenium \n%pip install beautifulsoup4\n%pip install pandas\n%pip install requests '

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import csv

In [2]:
def product_on_page(url, keyword):
    """
    Scrapes product information from a Lazada search page based on a given keyword.

    This function uses Selenium to search for a specific keyword on the Lazada website, 
    retrieves product links, prices, and sold counts for all products on the first page, 
    and optionally paginates through additional pages to gather more data.

    Parameters:
        url (str): The URL of the Lazada homepage or search page to start scraping.
        keyword (str): The keyword to search for on the Lazada platform.

    Returns:
        list: A list of dictionaries containing product information:
            - link (str): The URL of the product page.
            - price (str or None): The price of the product as displayed.
            - sold (str or None): The number of units sold, if available.
    """
    browser = webdriver.Chrome()
    browser.maximize_window()
    browser.get(url)
    browser.implicitly_wait(120) 

    search_box = browser.find_element(By.ID, "q")
    search_box.send_keys(keyword)
    search_box.submit()
    
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".Bm3ON"))
    )
    
    products = []
    current_page = 0
    
    while current_page < 1:
        print(f"Load {current_page + 1}...")
        product_elements = browser.find_elements(By.CSS_SELECTOR, ".Bm3ON")
        
        for product in product_elements:
            try:
                scroll_pause_time = 2   
                last_height = browser.execute_script("return document.body.scrollHeight")

                while True:
                    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(scroll_pause_time)

                    new_height = browser.execute_script("return document.body.scrollHeight")
                    if new_height == last_height:
                        break
                    last_height = new_height

                link_element = product.find_element(By.CSS_SELECTOR, "a")
                link = link_element.get_attribute("href")
                
                try:
                    price_element = product.find_element(By.CLASS_NAME, "ooOxS")
                    price = price_element.text
                except Exception:
                    price = None                  

                try:
                    sold_element = product.find_element(By.CLASS_NAME, "_1cEkb")
                    sold = sold_element.text
                except Exception:
                    sold = None   

                if sold is not None:
                    products.append({
                        "link": link,
                        "price": price,
                        "sold": sold
                    }) 
            except Exception as e:
                print(f"Error fetching product details: {e}")
        
        try:
            next_button = browser.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
            if "ant-pagination-disabled" in next_button.get_attribute("class"):
                print("Not next page.")
                break
            browser.execute_script("arguments[0].click();", next_button)
            time.sleep(2)   
            current_page += 1
        except Exception as e:
            print("Can not continue:", e)
            break
    
    browser.quit()
    
    return products

In [3]:
def get_product_details(url):
    """
    Access a Lazada product page and retrieve detailed product information.

    This function uses Selenium to navigate to the product page, scroll down to load all content,
    and extract information such as the product name, store name, rating, and the number of comments.

    Parameters:
        url (str): The URL of the Lazada product page.

    Returns:
        dict: A dictionary containing detailed product information:
            - product_name (str): The name of the product.
            - store_name (str): The name of the store. If not found, the value will be "Store not found".
            - rating (str): The product's rating. If no rating is available, the value will be "No rating".
            - comment_count (str): The number of comments on the product. If no comments are available, the value will be "No comments".
        None: Returns `None` if an error occurs during data retrieval.
    """
    browser = webdriver.Chrome()
    browser.maximize_window()
    browser.get(url)
    browser.implicitly_wait(120) 

    try:
        scroll_pause_time = 2 
        last_height = browser.execute_script("return document.body.scrollHeight")

        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        product_name = browser.find_element(By.CSS_SELECTOR, "h1.pdp-mod-product-badge-title").text

        try:
            store_name = browser.find_element(By.CSS_SELECTOR, "div.seller-name__detail > a").text
        except Exception:
            store_name = "Không tìm thấy cửa hàng"

        try:
            rating = browser.find_element(By.CSS_SELECTOR, "span.score-average").text
        except Exception:
            rating = "Không có đánh giá"

        try:
            comment_count = browser.find_element(By.CSS_SELECTOR, "a.pdp-link.pdp-review-summary__link").text
        except Exception:
            comment_count = "Không có bình luận"

        return {
            "product_name": product_name,
            "store_name": store_name,
            "rating": rating,
            "comment_count": comment_count,
        }
    except Exception as e:
        print(f"Error fetching product details: {e}")
        return None
    finally:
        browser.quit()


In [6]:
url = 'https://www.lazada.vn/'
keyword = ['màn hình di động']
for item in keyword:
    product_links = product_on_page(url, item)

Load 1...


In [5]:
print(product_links)

[{'link': 'https://www.lazada.vn/products/robot-hut-bui-lau-nha-es330-luc-hut-cuc-manh-3000pa-quet-hut-lau-cung-luc-bao-hanh-24-thang-i2805083150.html', 'price': '₫562,000', 'sold': '929 sold'}, {'link': 'https://www.lazada.vn/products/robot-hut-bui-sweepin-robot-hut-bui-xiaomi-may-hut-bui-thong-minh-tu-dong-cho-gia-dinh-hut-bui-quet-toc-i2457976213.html', 'price': '₫163,404', 'sold': '475 sold'}, {'link': 'https://www.lazada.vn/products/chat-luong-cao-nhat-xiaomimijia-da-san-sang-de-giao-hang-robot-quet-nha-kaimeidi-quet-va-lau-nha-thong-minh-bon-trong-mot-sac-usb-i2127466042.html', 'price': '₫80,000', 'sold': '4.2K sold'}, {'link': 'https://www.lazada.vn/products/sada-robot-quet-nha-quet-va-lau-nha-thong-minh-bon-trong-mot-sac-usb-i2066978718.html', 'price': '₫80,000', 'sold': '741 sold'}, {'link': 'https://www.lazada.vn/products/chat-luong-cao-nhat-sada-robot-quet-thong-minh-may-lau-dien-lam-sach-tu-dong-may-hut-bui-don-dep-toan-bo-ngoi-nha-khoi-dong-bang-mot-phim-i1664977542.html',

In [7]:
product_detail = []

for product in product_links:
    print(f"Fetching details for {product['link']}...")
    details = get_product_details(product['link'])
    if details:
        details.update({
            "price": product["price"],
            "sold": product["sold"],
            "link": product["link"],
    })
    product_detail.append(details)

if product_detail:
    with open("product_details.csv", "a", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["product_name", "store_name", "rating", "comment_count","price", "sold", "link"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(product_detail)
    print("Saved product details to product_details.csv") 
else:
    print("No product detail to save") 

Fetching details for https://www.lazada.vn/products/free-giftman-hinh-di-dong-goojodoq-156-14-inch-19201080p-fhd-ips-usb-c-hdmi-portable-monitor-cho-laptop-may-tinh-dien-thoai-may-choi-game-ps4-i2478628024.html...
Fetching details for https://www.lazada.vn/products/man-hinh-di-dong-goojodoq-14inch-portable-monitor-ips-25k-19201080p-fhd-hdmi-de-keo-dai-loa-ps4-dien-bao-hanh-chinh-hang-i2610558168.html...
Fetching details for https://www.lazada.vn/products/man-hinh-chieu-di-dong-60-150-inch-169-man-hinh-may-chieu-mau-trang-mo-cho-phim-ngoai-troi-du-lich-rap-hat-tai-nha-i2725202234.html...
Fetching details for https://www.lazada.vn/products/boe-man-hinh-di-dong-ips-fhd25k-60hz-144hz-type-c-hdmi-portable-monitor-i2743446576.html...
Fetching details for https://www.lazada.vn/products/man-hinh-cam-ung-di-dong-thong-minh-22-inch-tomko-gowithme-p2152j-ma-android-12-i-ram-4g-i-rom-64g-i-60-hz-i2431855899.html...
Fetching details for https://www.lazada.vn/products/man-hinh-di-dong-full-hd-ips-ke

WebDriverException: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=131.0.6778.109)
Stacktrace:
	GetHandleVerifier [0x00007FF67C276CF5+28821]
	(No symbol) [0x00007FF67C1E3880]
	(No symbol) [0x00007FF67C08578A]
	(No symbol) [0x00007FF67C082E88]
	(No symbol) [0x00007FF67C073BF9]
	(No symbol) [0x00007FF67C07594F]
	(No symbol) [0x00007FF67C073EBF]
	(No symbol) [0x00007FF67C07371B]
	(No symbol) [0x00007FF67C07365A]
	(No symbol) [0x00007FF67C071274]
	(No symbol) [0x00007FF67C071B3C]
	(No symbol) [0x00007FF67C08885A]
	(No symbol) [0x00007FF67C1201FE]
	(No symbol) [0x00007FF67C0FF2FA]
	(No symbol) [0x00007FF67C11F412]
	(No symbol) [0x00007FF67C0FF0A3]
	(No symbol) [0x00007FF67C0CA778]
	(No symbol) [0x00007FF67C0CB8E1]
	GetHandleVerifier [0x00007FF67C5AFCED+3408013]
	GetHandleVerifier [0x00007FF67C5C745F+3504127]
	GetHandleVerifier [0x00007FF67C5BB63D+3455453]
	GetHandleVerifier [0x00007FF67C33BDFB+835995]
	(No symbol) [0x00007FF67C1EEB9F]
	(No symbol) [0x00007FF67C1EA854]
	(No symbol) [0x00007FF67C1EA9ED]
	(No symbol) [0x00007FF67C1DA1D9]
	BaseThreadInitThunk [0x00007FFDCC7BE8D7+23]
	RtlUserThreadStart [0x00007FFDCD85FBCC+44]


In [25]:
product_data = pd.read_csv('product_details.csv')

product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_name   72 non-null     object
 1   store_name     71 non-null     object
 2   rating         72 non-null     object
 3   comment_count  72 non-null     object
 4   price          72 non-null     object
 5   sold           72 non-null     object
 6   link           72 non-null     object
dtypes: object(7)
memory usage: 4.1+ KB
