### CRAWL DATA "Tivi" FROM LAZADA

### 1. Set up Selenium and initialize Chrome

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from time import sleep
import random
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm

In [2]:
# Lazada search-results URL for the keyword "tivi"
main_link = 'https://www.lazada.vn/catalog/?q=tivi'
# Path to the ChromeDriver executable
path = 'chromedriver.exe'

# Customize Chrome browser settings:
#   --no-sandbox             run Chrome without its sandbox (commonly used to avoid system-permission problems)
#   --disable-notifications  turn off browser notifications
#   --disable-infobars       turn off the browser info bar
chrome_options = Options()
for switch in ('--no-sandbox', '--disable-notifications', '--disable-infobars'):
    chrome_options.add_argument(switch)

# Start Chrome through the local driver binary
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the Lazada search page
driver.get(main_link)

### 2. Crawl data on 1 page

##### Listing page (information shown outside the product pages)

In [3]:
#Crawl the information displayed on the outside of the page

def get_Data_On_Page():
    """Scrape the product cards of the search-results page currently open in ``driver``.

    Fields that every card is expected to show (title, link, sale price,
    location) are collected with CSS selectors. Fields that a card may lack
    (discount, units sold, review count) are looked up card by card through
    absolute XPaths and recorded as ``None`` when the element is not found.
    The search keyword is read from the search box and repeated on every row.

    The card list is scanned exactly once, i.e. three XPath lookups per card.

    Returns
    -------
    pandas.DataFrame
        One row per product with the columns ``Type``, ``Title``, ``Link``,
        ``Price_sale``, ``Sale_off``, ``Total_sold``, ``Preview``, ``Location``.

    Raises
    ------
    ValueError
        If the scraped columns do not all have the same number of entries
        (for example a card without a price), because the rows could not be
        aligned reliably.
    """
    # Absolute XPath of the info area of the i-th card (1-based); it is tied
    # to the current page layout and breaks if the markup changes.
    card_xpath = "/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{}]/div/div/div[2]"

    def text_or_none(xpath):
        # Text of the element at `xpath`, or None when the page has no such element
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return None

    # The information is definitely there
    Type = [elem.get_attribute("value") for elem in driver.find_elements(By.CSS_SELECTOR, ".search-box__input--O34g")]
    # Title and Link come from the same anchors, so read them from one query to keep them aligned
    anchors = driver.find_elements(By.CSS_SELECTOR, ".RfADt [href]")
    Title = [elem.text for elem in anchors]
    Link = [elem.get_attribute('href') for elem in anchors]
    Price_sale = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".ooOxS")]
    Location = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".oa6ri")]

    # The information may or may not be available; if not, assign None
    Sale_off, Total_sold, Preview = [], [], []
    for i in range(1, len(Title) + 1):
        card = card_xpath.format(i)
        Sale_off.append(text_or_none(card + "/div[4]/span"))
        Total_sold.append(text_or_none(card + "/div[5]/span[1]/span[1]"))
        Preview.append(text_or_none(card + "/div[5]/div/span"))

    # A single search-box value applies to every product on the page
    if len(Type) == 1:
        Type = Type * len(Title)

    data = {
        "Type": Type,
        "Title": Title,
        "Link": Link,
        "Price_sale": Price_sale,
        "Sale_off": Sale_off,
        "Total_sold": Total_sold,
        "Preview": Preview,
        "Location": Location
    }

    # Fail with a readable message instead of pandas' generic length error
    sizes = {name: len(values) for name, values in data.items()}
    if len(set(sizes.values())) > 1:
        raise ValueError("Scraped columns have different lengths: {}".format(sizes))

    # Convert the dictionary to a DataFrame
    return pd.DataFrame(data)

In [8]:
# Scrape the listing page into a DataFrame, then number its rows from 1 instead of 0
tivi = get_Data_On_Page()
tivi.index = np.arange(len(tivi)) + 1

In [9]:
# Preview the first rows of the scraped listing data
tivi.head()

Unnamed: 0,Type,Title,Link,Price_sale,Sale_off,Total_sold,Preview,Location
1,tivi,[NEW MODEL 2024] QLED TIVI 4K SAMSUNG 85 INCH ...,https://www.lazada.vn/products/new-model-2024-...,64.690.000 ₫,6% Off,,,Hồ Chí Minh
2,tivi,GIAO HÀ NỘI - Smart Tivi Casper 4K 50 inch 50U...,https://www.lazada.vn/products/giao-ha-noi-sma...,5.389.000 ₫,46% Off,130 Đã bán,(6),Hà Nội
3,tivi,Thanh lý TIVI TOSHIBA 24inch đầy đủ cổng kết n...,https://www.lazada.vn/products/thanh-ly-tivi-t...,920.486 ₫,3% Off,77 Đã bán,(26),Hà Nội
4,tivi,Smart Tivi Coocaa Full HD 43 Inch 43S3U,https://www.lazada.vn/products/smart-tivi-cooc...,3.989.000 ₫,50% Off,,,Hà Nội
5,tivi,Smart Tivi Samsung 32 inch UA32T4202,https://www.lazada.vn/products/smart-tivi-sams...,4.689.000 ₫,41% Off,16 Đã bán,,Hà Nội


##### Each Product

In [19]:
#Get the link and access each product to get more features
def get_Data_Detail_1Product(link, max_scrolls=120):
    """Open one product page in ``driver`` and scrape its detail fields.

    Parameters
    ----------
    link : str
        URL of the product page to load.
    max_scrolls : int, optional
        Upper bound on the number of 400-pixel scroll steps made while waiting
        for the star-rating breakdown to appear. Each step is followed by a
        one-second pause. Without a bound, a page that never shows at least
        five rating entries would keep the crawl scrolling forever.

    Returns
    -------
    dict
        ``Price_original``, ``Ship_price`` and ``Return`` hold the text of the
        first matching element, or ``None`` when nothing matches.
        ``Sale_rating``, ``Ship_on_time`` and ``Chat_response`` hold the first,
        second and third ``.seller-info-value`` texts, or ``None`` when there
        are not that many.
        ``One_star`` ... ``Five_star`` are each a one-element list holding the
        rating text, or ``'N/A'`` when the breakdown was not found within
        ``max_scrolls`` steps.
    """
    def texts(css):
        # Texts of all elements currently matching the CSS selector (empty list if none)
        return [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, css)]

    driver.get(link)                            #Get link
    sleep(random.uniform(2, 5))                 #Random sleep to mimic human behavior and avoid getting blocked

    Price_original = texts(".notranslate.pdp-price.pdp-price_type_deleted.pdp-price_color_lightgray.pdp-price_size_xs")
    Ship_price = texts(".delivery-option-item.delivery-option-item_type_standard .delivery-option-item__body .delivery-option-item__shipping-fee.no-subtitle")
    Return = texts(".delivery-option-item.delivery-option-item_type_returnPolicy30 .delivery-option-item__body .delivery-option-item__info .delivery-option-item__title")

    # Sale_rating & Ship_on_time & Chat_response
    percentage = texts(".seller-info-value ")
    Sale_rating = percentage[0] if len(percentage) > 0 else None
    Ship_on_time = percentage[1] if len(percentage) > 1 else None
    Chat_response = percentage[2] if len(percentage) > 2 else None

    # Scroll down step by step so the star-rating section gets loaded.
    # find_elements returns an empty list when nothing matches (it does not
    # raise), so the loop is bounded by max_scrolls instead of relying on an
    # exception handler.
    stars = []
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollBy(0, 400);")
        sleep(1)  # Wait a moment for the content to load
        stars = driver.find_elements(By.CSS_SELECTOR, ".detail .percent")
        if len(stars) >= 5:
            break
    stars = [elem.text for elem in stars]

    def star_at(position):
        # The page lists the breakdown from five stars (index 0) down to one star (index 4).
        # The value is wrapped in a one-element list to keep the shape callers already store.
        return [stars[position] if len(stars) > position else 'N/A']

    #Create a dictionary with the scraped values
    detail = {
        "Price_original": Price_original[0] if Price_original else None,
        "Ship_price": Ship_price[0] if Ship_price else None,
        "Return": Return[0] if Return else None,
        "Sale_rating": Sale_rating,
        "Ship_on_time": Ship_on_time,
        "Chat_response": Chat_response,
        "One_star": star_at(4),
        "Two_star": star_at(3),
        "Three_star": star_at(2),
        "Four_star": star_at(1),
        "Five_star": star_at(0)
    }

    return detail

In [20]:
# Accumulator for the per-product details: one empty list per field,
# filled in the same order as the links are visited
detail_products = {
    field: []
    for field in (
        "Price_original",
        "Ship_price",
        "Return",
        "Sale_rating",
        "Ship_on_time",
        "Chat_response",
        "One_star",
        "Two_star",
        "Three_star",
        "Four_star",
        "Five_star",
    )
}

# Visit every product link (tqdm draws the progress bar) and file each
# scraped value under its field
for link in tqdm(tivi.Link, desc="Crawling product details"):
    details = get_Data_Detail_1Product(link)
    for key, collected in detail_products.items():
        collected.append(details[key])

Crawling product details: 100%|██████████| 40/40 [19:58<00:00, 29.96s/it]


In [21]:
# Add each collected detail list to tivi as a new column named after its key
for key, value in detail_products.items():
    tivi[key] = value

In [23]:
# Write the table to page_01.csv without the index column.
# NOTE(review): DataFrame.to_csv returns None when it is given a file path,
# so page_01 is None after this line — confirm nothing relies on it.
page_01 = tivi.to_csv("page_01.csv", index=False)
# Show the full table as the cell output
tivi

Unnamed: 0,Type,Title,Link,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Return,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
1,tivi,[NEW MODEL 2024] QLED TIVI 4K SAMSUNG 85 INCH ...,https://www.lazada.vn/products/new-model-2024-...,64.690.000 ₫,6% Off,,,Hồ Chí Minh,68.790.000 ₫,,,100%,Không đủ thông tin,100%,[0],[0],[0],[0],[0]
2,tivi,GIAO HÀ NỘI - Smart Tivi Casper 4K 50 inch 50U...,https://www.lazada.vn/products/giao-ha-noi-sma...,5.389.000 ₫,46% Off,130 Đã bán,(6),Hà Nội,9.890.000 ₫,,,100%,Không đủ thông tin,100%,[0],[0],[0],[0],[6]
3,tivi,Thanh lý TIVI TOSHIBA 24inch đầy đủ cổng kết n...,https://www.lazada.vn/products/thanh-ly-tivi-t...,920.486 ₫,3% Off,77 Đã bán,(26),Hà Nội,950.000 ₫,115.100 ₫,,91%,95%,92%,[0],[0],[0],[2],[24]
4,tivi,Smart Tivi Coocaa Full HD 43 Inch 43S3U,https://www.lazada.vn/products/smart-tivi-cooc...,3.989.000 ₫,50% Off,,,Hà Nội,7.990.000 ₫,,30 Ngày Trả Hàng Miễn Phí,Nhà bán hàng mới,Không đủ thông tin,100%,[0],[0],[0],[0],[0]
5,tivi,Smart Tivi Samsung 32 inch UA32T4202,https://www.lazada.vn/products/smart-tivi-sams...,4.689.000 ₫,41% Off,16 Đã bán,,Hà Nội,7.900.000 ₫,,,100%,Không đủ thông tin,100%,[0],[0],[0],[0],[1]
6,tivi,Tivi 32 inch Sen Việt Androi Tv độ phân giải 1...,https://www.lazada.vn/products/tivi-32-inch-se...,2.490.000 ₫,,,,Thái Bình,,141.300 ₫,,Nhà bán hàng mới,Không đủ thông tin,100%,[0],[0],[0],[0],[0]
7,tivi,"Thanh lý Smart Tivi LG 4K 43 inch có wifi,yout...",https://www.lazada.vn/products/thanh-ly-smart-...,3.565.542 ₫,4% Off,31 Đã bán,(15),Hà Nội,3.700.000 ₫,206.800 ₫,,91%,95%,92%,[1],[0],[1],[0],[14]
8,tivi,Smart Tivi Casper S Series Full HD 43 inch 43F...,https://www.lazada.vn/products/smart-tivi-casp...,4.689.000 ₫,48% Off,9 Đã bán,,Hà Nội,9.090.000 ₫,,,100%,Không đủ thông tin,100%,[0],[0],[0],[0],[0]
9,tivi,"Smart Tivi Sony 42 inch có wifi,youtobe...",https://www.lazada.vn/products/smart-tivi-sony...,3.003.074 ₫,2% Off,21 Đã bán,(17),Hà Nội,3.070.000 ₫,198.200 ₫,,91%,95%,92%,[1],[0],[1],[1],[14]
10,tivi,Tivi Mới Cũ dùng cho khách sạn - văn phòng,https://www.lazada.vn/products/tivi-moi-cu-dun...,950.000 ₫,,,,Hà Nội,,198.200 ₫,,86%,97%,66%,[0],[0],[0],[0],[0]
