# CRAWL DATA FROM LAZADA

## 1. Set up Selenium and initialize Chrome


In [17]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.request
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from time import sleep
import random
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException

In [18]:
# Search-results URL to crawl, and the ChromeDriver binary used to launch Chrome.
main_link = 'https://www.lazada.vn/catalog/?q=laptop'
path = 'chromedriver.exe'

# Browser flags, applied one by one to a fresh Options object.
chrome_options = Options()
for chrome_flag in ('--no-sandbox', '--disable-notifications', '--disable-infobars'):
    chrome_options.add_argument(chrome_flag)

# Start Chrome through the local driver binary and open the search page.
service = Service(executable_path=path)
driver = webdriver.Chrome(options=chrome_options, service=service)
driver.get(main_link)

In [19]:
def web_driver():
    """Create a headless Chrome WebDriver.

    Returns:
        A new ``webdriver.Chrome`` instance started with the flags listed
        below. The caller is responsible for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        # Chrome documents this value as "width,height"; the stray space that
        # used to follow the comma is removed.
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)
    return webdriver.Chrome(options=options)

In [20]:
# Quit whatever browser `driver` currently refers to before rebinding the name;
# otherwise that Chrome process keeps running with no reference left to close it.
try:
    driver.quit()
except NameError:
    pass  # no driver has been created in this kernel session yet

driver = web_driver()
driver.get('https://www.lazada.vn/catalog/?q=laptop')
# Random 5-10 s pause before scraping (presumably to let the page finish rendering).
sleep(random.randint(5,10))

## 2. Crawl data on the first page

In [22]:
# ================================ GET links/title
elems = driver.find_elements(By.CSS_SELECTOR , ".RfADt [href]")
Title = [elem.text for elem in elems]
Links = [elem.get_attribute('href') for elem in elems]

# ================================ GET type
elems = driver.find_elements(By.CSS_SELECTOR , ".JrAyI")
Type = [elem.text for elem in elems]
# A single match is repeated so that every title gets the same type value.
if len(Type) == 1:
    Type = Type * len(Title)

# ================================ GET price sale
elems = driver.find_elements(By.CSS_SELECTOR , ".ooOxS")
Price_sale = [elem.text for elem in elems]

# ================================ GET sale off
elems = driver.find_elements(By.CSS_SELECTOR , ".WNoq3")
Sale_off = [elem.text for elem in elems]

# ================================ GET location
elems = driver.find_elements(By.CSS_SELECTOR , ".oa6ri")
Location = [elem.text for elem in elems]

# ================================ Create data frame
# zip() stops at the shortest list, so a single selector that matches nothing
# would silently produce an empty table. Build exactly one row per title
# instead: shorter lists are padded with None, longer ones are cut off.
# NOTE(review): values are still paired with titles purely by position; a field
# missing from a card in the middle of the page shifts the values after it.
n_rows = len(Title)
first_page_columns = {
    'Type': Type,
    'Title': Title,
    'Link': Links,
    'Price_sale': Price_sale,
    'Sale_off': Sale_off,
    'Location': Location,
}
df1 = pd.DataFrame({
    name: (values + [None] * n_rows)[:n_rows]
    for name, values in first_page_columns.items()
})
df1['Index_']= np.arange(1, len(df1) + 1)
print(df1)

Empty DataFrame
Columns: [Type, Title, Link, Price_sale, Sale_off, Location, Index_]
Index: []


In [11]:
# =================================== GET total sold/preview
# Shared absolute-XPath prefix of the two lookups made for the i-th listing
# entry; `{}` is replaced by the 1-based position.
ENTRY_XPATH = "/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{}]/div/div/div[2]/div[5]"

Total_sold, Preview, Index_ = [], [], []
for i in range(1, len(Title)+1):
    entry = ENTRY_XPATH.format(i)

    # A missing element is recorded as None so the three lists stay the same length.
    try:
        Total_sold.append(driver.find_element(By.XPATH, entry + "/span[1]/span[1]").text)
    except NoSuchElementException:
        Total_sold.append(None)

    try:
        Preview.append(driver.find_element(By.XPATH, entry + "/div/span").text)
    except NoSuchElementException:
        Preview.append(None)

    # Appending to a list cannot raise NoSuchElementException, so no try/except
    # is needed here (the old handler also appended to Preview by mistake).
    Index_.append(i)

df2 = pd.DataFrame(list(zip(Index_ , Total_sold, Preview)), columns = ['Index_', 'Total_sold', 'Preview'])
print(df2)

In [19]:
# =================== GET price original/ship price/return/sale rating/ship on time/chat response
Price_original, Ship_price, Return_exchange, Sale_rating, Ship_on_time, Chat_response = [], [], [], [], [], []

# Every list receives exactly one entry per visited link; a field that is not
# found on the product page is stored as None instead of raising IndexError or
# re-using the value scraped for the previous product.
for link in Links:
    driver.get(link)

    # ================================ GET price original
    # Stored as the list of all matching texts (may be empty).
    elems = driver.find_elements(By.CSS_SELECTOR, ".origin-block .notranslate")
    one_price_original = [elem.text for elem in elems]
    Price_original.append(one_price_original)

    # ================================ GET ship price
    elems = driver.find_elements(By.CSS_SELECTOR, ".delivery-option-item__shipping-fee")
    one_ship_prices = [elem.text for elem in elems]
    one_ship_price = one_ship_prices[0] if one_ship_prices else None
    Ship_price.append(one_ship_price)

    # ================================ GET return
    elems = driver.find_elements(By.CSS_SELECTOR, ".warranty__option-item .delivery-option-item__body .delivery-option-item__info .delivery-option-item__title")
    one_return_exchanges = [elem.text for elem in elems]
    one_return_exchange = one_return_exchanges[0] if one_return_exchanges else None
    Return_exchange.append(one_return_exchange)

    # ================================ GET sale rating/ship on time/chat response
    elems = driver.find_elements(By.CSS_SELECTOR, ".info-content .seller-info-value")
    percent_data = [elem.text for elem in elems]

    # The first three values are, in order, sale rating, ship on time and chat
    # response; pad with None when the page shows fewer than three.
    one_sale_rating, one_ship_on_time, one_chat_response = (percent_data + [None] * 3)[:3]

    Sale_rating.append(one_sale_rating)
    Ship_on_time.append(one_ship_on_time)
    Chat_response.append(one_chat_response)

In [20]:
# =============================== GET star      
One_star, Two_star, Three_star, Four_star, Five_star = [], [], [], [], []
count = 0

for link in Links: 
    driver.get(link)

    # ================================ GET star data
    elems = driver.find_elements(By.CSS_SELECTOR, ".percent")
    stars = [elem.text for elem in elems]
    print(count)
    # Kiểm tra và thêm các giá trị sao một cách an toàn
    One_star.append(stars[4] if len(stars) > 4 else 'N/A')
    Two_star.append(stars[3] if len(stars) > 3 else 'N/A')
    Three_star.append(stars[2] if len(stars) > 2 else 'N/A')
    Four_star.append(stars[1] if len(stars) > 1 else 'N/A')
    Five_star.append(stars[0] if len(stars) > 0 else 'N/A')
    count += 1
    if count >= 40:
        break

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [22]:
# Column names and the scraped lists that feed them, in matching order.
detail_names = ['price_original', 'ship_price','Return', 'Sale_rating', 'Ship_on_time', 'Chat_response',
                'One_star', 'Two_star', 'Three_star', 'Four_star', 'Five_star']
detail_lists = (Price_original, Ship_price, Return_exchange, Sale_rating, Ship_on_time, Chat_response,
                One_star, Two_star, Three_star, Four_star, Five_star)

# zip stops at the shortest list, so the frame has as many rows as the shortest one.
df4 = pd.DataFrame(list(zip(*detail_lists)), columns=detail_names)
# 1-based running number for each row.
df4['Index_'] = np.arange(len(df4)) + 1

In [None]:
# Crawl result pages 2-50 and extend the listing lists started on the first page.
# The lists are kept at exactly one entry per title so that rows stay aligned
# when the DataFrame is built later. Re-running this cell appends the pages again.

# Bring what has been collected so far to one entry per title (None for gaps).
for column in (Type, Price_sale, Sale_off, Location):
    del column[len(Title):]
    column.extend([None] * (len(Title) - len(column)))

for page in range(2, 51):
    driver.get(f'https://www.lazada.vn/catalog/?page={page}&q=laptop')
    # Same randomized pause that is used after loading the first results page.
    sleep(random.randint(5, 10))

    # ================================ GET links/title
    elems = driver.find_elements(By.CSS_SELECTOR, ".RfADt [href]")
    new_titles = [elem.text for elem in elems]
    new_links = [elem.get_attribute('href') for elem in elems]
    n_new = len(new_titles)
    Title.extend(new_titles)
    Links.extend(new_links)

    # ================================ GET type
    elems = driver.find_elements(By.CSS_SELECTOR, ".JrAyI")
    new_types = [elem.text for elem in elems]
    # A single match is repeated so that every title on the page gets it.
    if len(new_types) == 1:
        new_types = new_types * n_new
    Type.extend((new_types + [None] * n_new)[:n_new])

    # ================================ GET price sale / sale off / location
    for column, selector in ((Price_sale, ".ooOxS"), (Sale_off, ".WNoq3"), (Location, ".oa6ri")):
        values = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, selector)]
        # Pad or cut to the number of titles found on this page.
        column.extend((values + [None] * n_new)[:n_new])

In [None]:
# ================================ GET total sold/preview for pages 2-50
# Total_sold, Preview and Index_ already hold the first page; each further
# results page is loaded and its entries are appended.
for page in range(2, 51):
    driver.get(f'https://www.lazada.vn/catalog/?page={page}&q=laptop')
    # Same randomized pause that is used after loading the first results page.
    sleep(random.randint(5, 10))

    # Number of entries on this page, counted with the title selector.
    n_entries = len(driver.find_elements(By.CSS_SELECTOR, ".RfADt [href]"))

    for position in range(1, n_entries + 1):
        # Shared absolute-XPath prefix of both lookups for this entry.
        entry = f"/html/body/div[3]/div/div[2]/div[1]/div/div[1]/div[2]/div[{position}]/div/div/div[2]/div[5]"

        # A missing element is recorded as None so the lists stay the same length.
        try:
            Total_sold.append(driver.find_element(By.XPATH, entry + "/span[1]/span[1]").text)
        except NoSuchElementException:
            Total_sold.append(None)

        try:
            Preview.append(driver.find_element(By.XPATH, entry + "/div/span").text)
        except NoSuchElementException:
            Preview.append(None)

        # Continue the 1-based running number across pages.
        Index_.append(len(Index_) + 1)


In [None]:
# ================================= GET link item
# The detail lists already hold one entry per link processed so far
# (Price_original gets one entry per visited link), so only the links after
# that point are visited, and each exactly once.
for link in Links[len(Price_original):]:
    driver.get(link)

    # ================================ GET price original
    # Stored as the list of all matching texts (may be empty).
    elems = driver.find_elements(By.CSS_SELECTOR, ".origin-block .notranslate")
    one_price_original = [elem.text for elem in elems]
    Price_original.append(one_price_original)

    # ================================ GET ship price
    # None when the page shows no shipping fee, rather than the previous product's value.
    elems = driver.find_elements(By.CSS_SELECTOR, ".delivery-option-item__shipping-fee")
    one_ship_prices = [elem.text for elem in elems]
    one_ship_price = one_ship_prices[0] if one_ship_prices else None
    Ship_price.append(one_ship_price)

    # ================================ GET return
    elems = driver.find_elements(By.CSS_SELECTOR, ".warranty__option-item .delivery-option-item__body .delivery-option-item__info .delivery-option-item__title")
    one_return_exchanges = [elem.text for elem in elems]
    one_return_exchange = one_return_exchanges[0] if one_return_exchanges else None
    Return_exchange.append(one_return_exchange)

    # ================================ GET sale rating/ship on time/chat response
    elems = driver.find_elements(By.CSS_SELECTOR, ".info-content .seller-info-value")
    percent_data = [elem.text for elem in elems]

    # The first three values are, in order, sale rating, ship on time and chat
    # response; pad with None when the page shows fewer than three.
    one_sale_rating, one_ship_on_time, one_chat_response = (percent_data + [None] * 3)[:3]

    Sale_rating.append(one_sale_rating)
    Ship_on_time.append(one_ship_on_time)
    Chat_response.append(one_chat_response)

In [None]:
# ================================= GET star for the remaining links
# The star lists already hold one entry per link processed so far, so continue
# with the links after that point and visit each exactly once.
already_done = len(One_star)
for count, link in enumerate(Links[already_done:], start=already_done):
    driver.get(link)

    # ================================ GET star data
    elems = driver.find_elements(By.CSS_SELECTOR, ".percent")
    stars = [elem.text for elem in elems]
    print(count)  # progress: 0-based position of the link in Links

    # Add the star values safely: the matches are read as five-star first,
    # one-star last, and missing positions fall back to 'N/A'.
    One_star.append(stars[4] if len(stars) > 4 else 'N/A')
    Two_star.append(stars[3] if len(stars) > 3 else 'N/A')
    Three_star.append(stars[2] if len(stars) > 2 else 'N/A')
    Four_star.append(stars[1] if len(stars) > 1 else 'N/A')
    Five_star.append(stars[0] if len(stars) > 0 else 'N/A')

In [13]:
# Column names and the listing lists that feed them, in matching order.
listing_names = ['Type', 'Title', 'Link_item', 'Price_sale', 'Sale_off', 'Location']
listing_lists = (Type, Title, Links, Price_sale, Sale_off, Location)

# zip stops at the shortest list, so the frame has as many rows as the shortest one.
df1 = pd.DataFrame(list(zip(*listing_lists)), columns=listing_names)
# 1-based running number for each row.
df1['index_'] = np.arange(len(df1)) + 1

In [14]:
# Print the combined listing table as plain text (pandas elides middle rows and long values).
print(df1)

        Type                                              Title  \
0     laptop  Laptop Dell Inspiron 7567 máy tính giá rẻ(Core...   
1     laptop  Máy Tính Xách Tay Bàn Phím Bao Gồm, Màng Bảo V...   
2     laptop  hp 8440P i5 , Ram 4G , ssd 240GB , Nhập Khẩu N...   
3     laptop  Laptop cũ nhiều lựa chọn Pentium đến i3 i5 i7 ...   
4     laptop  Laptop core 2 giá rẻ, ram 4G, HDD 250G, đầy đủ...   
...      ...                                                ...   
1995  laptop  Mới - Bộ vệ sinh Laptop, 18 in 1 Cao Cấp Bộ ...   
1996  laptop  Giá đỡ Laptop MC Bàn máy tính bảng N3 đứng di ...   
1997  laptop  Lenovo Máy tính xách tay Intel Core i7 5500U L...   
1998  laptop              Đế tản nhiệt laptop bằng 4 nút cao su   
1999  laptop  Laptop Lenovo core i5 4210M Ram 8GB SSD 240GB ...   

                                              Link_item   Price_sale Sale_off  \
0     https://www.lazada.vn/products/laptop-dell-ins...  3.200.000 ₫  50% Off   
1     https://www.lazada.vn/produ

In [15]:
# Save the listing table as a UTF-8 CSV file, without the DataFrame's row index.
df1.to_csv('df1_data_pages_laptop.csv', encoding='utf-8', index=False)