### CRAWL DATA "DienThoai" FROM LAZADA

### 1. Setting selenium and initialize Chrome

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from time import sleep
import random
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException,  TimeoutException, ElementNotInteractableException
from tqdm import tqdm

In [2]:
#URL Lazada: dienthoai, sort price hight to low
main_link = 'https://www.lazada.vn/catalog/?page=1&q=dien%20thoai&sort=pricedesc'
path = 'chromedriver.exe'

# Customize Chrome browser settings
chrome_options = Options()                                     
chrome_options.add_argument('--no-sandbox')                     #Them tuy chon de chay Chrome ko co sandbox, thuong duoc su dung de tranh cac van de ve quyen truy cap he thong
chrome_options.add_argument('--disable-notifications')          #Tat thong bao tu trinh duyet
chrome_options.add_argument('--disable-infobars')               #Tat thanh thong tin cua trinh duyet

service = Service(executable_path=path)
driver = webdriver.Chrome(service=service, options=chrome_options)

#Open the Lazada web
driver.get(main_link)

### 2. Crawl data on 1 page

##### Outside page

In [3]:
#Crawl the information displayed on the outside of the page
def get_Data_On_Page():
    #The information is definitely there
    Type = [elem.get_attribute("value") for elem in driver.find_elements(By.CSS_SELECTOR, ".search-box__input--O34g")]
    Title = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".RfADt [href]")]
    Link = [elem.get_attribute('href') for elem in driver.find_elements(By.CSS_SELECTOR, ".RfADt [href]")]
    Price_sale = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".ooOxS")]
    
    #The information may or may not be available
    #If not, assign None
    Sale_off, Total_sold, Preview = [], [], []
    for i in range(1, len(Title)+1):
        try:
            temp_sale = driver.find_element("xpath", "/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{}]/div/div/div[2]/div[4]/span".format(i))
            Sale_off.append(temp_sale.text)
        except NoSuchElementException:
            Sale_off.append(None)

        try:
            temp_sold = driver.find_element("xpath", "/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{}]/div/div/div[2]/div[5]/span[1]/span[1]".format(i))
            Total_sold.append(temp_sold.text)
        except NoSuchElementException:
            Total_sold.append(None)

        try:
            temp_preview = driver.find_element("xpath", "/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{}]/div/div/div[2]/div[5]/div/span".format(i))
            Preview.append(temp_preview.text)
        except NoSuchElementException:
            Preview.append(None)
    
    Location = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".oa6ri")]
    if len(Type) == 1:
        Type = Type * len(Title)

    #Create a dictionary with the lists
    data = {
        "Type": Type,
        "Title": Title,
        "Link": Link,
        "Price_sale": Price_sale,
        "Sale_off": Sale_off,
        "Total_sold": Total_sold,
        "Preview": Preview,
        "Location": Location
    }

    # Convert the dictionary to a DataFrame
    df = pd.DataFrame(data)
    return df

In [4]:
#Call the function and get the DF
dienthoai = get_Data_On_Page()
dienthoai.index = np.arange(1, len(dienthoai)+1)
dienthoai.head()

Unnamed: 0,Type,Title,Link,Price_sale,Sale_off,Total_sold,Preview,Location
1,dien thoai,Điện thoại OPPO FIND N3 (16GB/512GB) - Hàng ch...,https://www.lazada.vn/products/dien-thoai-oppo...,41.990.000 ₫,7% Off,11 Đã bán,,Hồ Chí Minh
2,dien thoai,[Trả góp 0%] Apple iPhone 15 Pro 1TB Chính hãn...,https://www.lazada.vn/products/tra-gop-0-apple...,39.890.000 ₫,10% Off,,,Hà Nội
3,dien thoai,Điện thoại Samsung Galaxy S24 Ultra 12GB/1TB -...,https://www.lazada.vn/products/dien-thoai-sams...,37.590.001 ₫,16% Off,24 Đã bán,(2),Hồ Chí Minh
4,dien thoai,Samsung Galaxy S23 Ultra 5G 8GB | 512GB - Màn ...,https://www.lazada.vn/products/samsung-galaxy-...,36.990.000 ₫,,,,Hồ Chí Minh
5,dien thoai,iPhone 15 Pro Max 256GB/512GB/1T [Futureworld-...,https://www.lazada.vn/products/iphone-15-pro-m...,32.126.000 ₫,,28 Đã bán,(1),Hồ Chí Minh


##### Each Product

In [5]:
#Get the link and access each product to get more features
def get_Data_Detail_1Product(link):
    driver.get(link)                            #Get link
    sleep(random.uniform(2, 5))                 #Random sleep to mimic human behavior and avoid getting blocked
    
    Price_original = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".notranslate.pdp-price.pdp-price_type_deleted.pdp-price_color_lightgray.pdp-price_size_xs")]
    Ship_price = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".delivery-option-item.delivery-option-item_type_standard .delivery-option-item__body .delivery-option-item__shipping-fee.no-subtitle")]
    Return = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".delivery-option-item.delivery-option-item_type_returnPolicy30 .delivery-option-item__body .delivery-option-item__info .delivery-option-item__title")]

    # Sale_rating & Ship_on_time & Chat_respone
    percentage = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".seller-info-value ")]
    Sale_rating = percentage[0] if len(percentage) > 0 else None
    Ship_on_time = percentage[1] if len(percentage) > 1 else None
    Chat_response = percentage[2] if len(percentage) > 2 else None


    #Cuộn trang từ từ để tải phần đánh giá sao
    while True:
        driver.execute_script("window.scrollBy(0, 400);")
        sleep(1)  #Đợi một chút để nội dung tải
        try:
            stars = driver.find_elements(By.CSS_SELECTOR, ".detail .percent")
            if len(stars) >= 5:
                break
        except NoSuchElementException:
            continue
    
    One_star, Two_star, Three_star, Four_star, Five_star = [], [], [], [], []
    stars = [elem.text for elem in stars]

    One_star.append(stars[4] if len(stars) > 4 else 'N/A')
    Two_star.append(stars[3] if len(stars) > 3 else 'N/A')
    Three_star.append(stars[2] if len(stars) > 2 else 'N/A')
    Four_star.append(stars[1] if len(stars) > 1 else 'N/A')
    Five_star.append(stars[0] if len(stars) > 0 else 'N/A')


    #Create a dictionary with the lists
    detail = {
        "Price_original": Price_original[0] if Price_original else None,
        "Ship_price": Ship_price[0] if Ship_price else None,
        "Return": Return[0] if Return else None,
        "Sale_rating": Sale_rating,
        "Ship_on_time": Ship_on_time,
        "Chat_response": Chat_response,
        "One_star": One_star,
        "Two_star": Two_star,
        "Three_star": Three_star,
        "Four_star": Four_star,
        "Five_star": Five_star
    }
    
    return detail 

In [6]:
#Initialize dictionary to store details for all products
detail_products = {
    "Price_original": [],
    "Ship_price": [],
    "Return": [],
    "Sale_rating": [],
    "Ship_on_time": [],
    "Chat_response": [],
    "One_star": [],
    "Two_star": [],
    "Three_star": [],
    "Four_star": [],
    "Five_star": []
}

#Create a progress bar to track the crawl process
for link in tqdm(dienthoai.Link, desc="Crawling product details"):
    details = get_Data_Detail_1Product(link)
    for key in detail_products.keys():
        detail_products[key].append(details[key])

Crawling product details: 100%|██████████| 40/40 [19:19<00:00, 28.99s/it]


In [7]:
#Add columns to dienthoai
for key, value in detail_products.items():
    dienthoai[key] = value

In [8]:
#Convert to csv
page_01 = dienthoai.to_csv("page_01.csv", index=False)
dienthoai

Unnamed: 0,Type,Title,Link,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Return,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
1,dien thoai,Điện thoại OPPO FIND N3 (16GB/512GB) - Hàng ch...,https://www.lazada.vn/products/dien-thoai-oppo...,41.990.000 ₫,7% Off,11 Đã bán,,Hồ Chí Minh,44.990.000 ₫,20.100 ₫,30 Ngày Trả Hàng Miễn Phí,99%,100%,75%,[0],[0],[0],[0],[0]
2,dien thoai,[Trả góp 0%] Apple iPhone 15 Pro 1TB Chính hãn...,https://www.lazada.vn/products/tra-gop-0-apple...,39.890.000 ₫,10% Off,,,Hà Nội,44.490.000 ₫,38.000 ₫,30 Ngày Trả Hàng Miễn Phí,99%,100%,85%,[0],[0],[0],[0],[0]
3,dien thoai,Điện thoại Samsung Galaxy S24 Ultra 12GB/1TB -...,https://www.lazada.vn/products/dien-thoai-sams...,37.590.001 ₫,16% Off,24 Đã bán,(2),Hồ Chí Minh,44.490.000 ₫,,30 Ngày Trả Hàng Miễn Phí,90%,Không đủ thông tin,100%,[0],[0],[0],[0],[2]
4,dien thoai,Samsung Galaxy S23 Ultra 5G 8GB | 512GB - Màn ...,https://www.lazada.vn/products/samsung-galaxy-...,36.990.000 ₫,,,,Hồ Chí Minh,,16.500 ₫,30 Ngày Trả Hàng Miễn Phí,98%,82%,100%,[0],[0],[0],[0],[0]
5,dien thoai,iPhone 15 Pro Max 256GB/512GB/1T [Futureworld-...,https://www.lazada.vn/products/iphone-15-pro-m...,32.126.000 ₫,,28 Đã bán,(1),Hồ Chí Minh,38.470.000 ₫,18.300 ₫,30 Ngày Trả Hàng Miễn Phí,98%,100%,100%,[0],[0],[0],[0],[1]
6,dien thoai,[GIÁ SỐC CHỈ 6.6 - VOUCHER 7TR] [TẶNG Buds2 Pr...,https://www.lazada.vn/products/gia-soc-chi-66-...,31.990.000 ₫,6% Off,1.3K Đã bán,(305),Hồ Chí Minh,33.990.000 ₫,16.500 ₫,30 Ngày Trả Hàng Miễn Phí,91%,99%,100%,[2],[0],[1],[0],[302]
7,dien thoai,Điện thoại Samsung Galaxy S24 Ultra 12GB/512GB...,https://www.lazada.vn/products/dien-thoai-sams...,31.590.000 ₫,16% Off,21 Đã bán,(2),Hồ Chí Minh,37.490.000 ₫,,30 Ngày Trả Hàng Miễn Phí,90%,Không đủ thông tin,100%,[0],[0],[0],[0],[3]
8,dien thoai,[Trả góp 0%] Apple iPhone 15 Pro Max 256GB Chí...,https://www.lazada.vn/products/tra-gop-0-apple...,30.090.000 ₫,15% Off,82 Đã bán,(10),Hà Nội,35.990.000 ₫,38.000 ₫,30 Ngày Trả Hàng Miễn Phí,99%,100%,85%,[0],[0],[0],[0],[10]
9,dien thoai,Điện thoại thông minh Xiaomi 14 Ultra (16+512GB),https://www.lazada.vn/products/dien-thoai-thon...,29.517.000 ₫,11% Off,6 Đã bán,,Hồ Chí Minh,32.990.000 ₫,18.300 ₫,30 Ngày Trả Hàng Miễn Phí,99%,100%,100%,[0],[0],[0],[0],[0]
10,dien thoai,[6.6 SALE TO] iPhone 15 Pro Max - Hàng Chính H...,https://www.lazada.vn/products/66-sale-to-ipho...,29.590.000 ₫,15% Off,11.2K Đã bán,(2631),Hồ Chí Minh,34.999.000 ₫,5.500 ₫,30 Ngày Trả Hàng Miễn Phí,94%,98%,100%,[30],[3],[8],[9],[2578]
