In [None]:
# Use Selenium to crawl the data. Because the browser is Edge, the Edge webdriver is used.

from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
import time
class SeleniumCrawler:
    """Small wrapper around an Edge WebDriver for CSS-selector based scraping.

    Each instance starts its own Edge session in ``__init__``. Call
    :meth:`quit` when done, or use the instance as a context manager so the
    session is closed even if scraping raises.
    """

    def __init__(self) -> None:
        self.options = Options()

        # Window size of the browser.
        self.options.add_argument("--window-size=128,70")
        self.options.add_argument("--headless")
        # Disable GPU.
        self.options.add_argument("--disable-gpu")
        # Send a desktop browser user agent, intended to avoid being blocked by the site.
        self.options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
        self.driver = webdriver.Edge(options=self.options)

    def __enter__(self) -> "SeleniumCrawler":
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the browser when the ``with`` block ends; exceptions propagate."""
        self.quit()

    def _load(self, url: str, script: "str | None", sleep: float) -> None:
        """Open ``url``, wait ``sleep`` seconds, then run ``script`` if one is given."""
        self.driver.get(url)
        time.sleep(sleep)
        if script is not None:
            self.driver.execute_script(script)

    def crawl(self, url: str, selector: str, script: "str | None" = None, sleep: float = 1) -> list:
        """Load ``url`` and return every element matching ``selector``.

        Args:
            url: Page to open.
            selector: CSS selector to look up.
            script: Optional JavaScript executed after the wait.
            sleep: Seconds to wait after navigation, before the script/lookup.

        Returns:
            The list returned by the driver's ``find_elements`` (empty when
            nothing matches).
        """
        self._load(url, script, sleep)
        return self.driver.find_elements(By.CSS_SELECTOR, selector)

    def crawl_muti_selectors(self, url: str, selectors: list[str], script: "str | None" = None, sleep: float = 1) -> list:
        """Load ``url`` and return one list of matches per selector.

        Same steps as :meth:`crawl`, but the result is a list of lists: entry
        ``i`` holds the elements matching ``selectors[i]``.
        """
        self._load(url, script, sleep)
        return [
            self.driver.find_elements(By.CSS_SELECTOR, selector)
            for selector in selectors
        ]

    def crawl_squence_selectors(self, url: str, selectors: list[str], script: "str | None" = None, sleep: float = 1) -> list:
        """Load ``url`` and return the matches of all selectors as one flat list.

        Same steps as :meth:`crawl`; the matches are concatenated in the order
        of ``selectors``.
        """
        self._load(url, script, sleep)
        els = []
        for selector in selectors:
            els.extend(self.driver.find_elements(By.CSS_SELECTOR, selector))
        return els

    def quit(self) -> None:
        """Shut down the driver."""
        self.driver.quit()

In [None]:
## link https://www.nhatot.com/mua-ban-nha-dat-tp-ho-chi-minh?page=2

import pandas as pd
from multiprocessing import Queue
from selenium.webdriver.common.by import By
from threading import Thread

# Additional information about elements to crawl
class CrawledElementInfo:
    """CSS selectors and field labels used when reading a listing detail page."""

    # Selector matching one element per structured-information row.
    container_selector = "div.AdParam_adParamContainerVeh__Vz4Zt > div"
    # Template for the VALUE span of a row; "{index}" is the row's nth-child position.
    selector_format_value = (
        "div.AdParam_adParamContainerVeh__Vz4Zt > div:nth-child({index})"
        " > div > div.media-body.media-middle > span"
        " > span.AdParam_adParamValue__IfaYa"
    )
    # Template for the LABEL span of a row; "{index}" is the row's nth-child position.
    selector_format_label = (
        "div.AdParam_adParamContainerVeh__Vz4Zt > div:nth-child({index})"
        " > div > div.media-body.media-middle > span"
        " > span:nth-child(1)"
    )

    # Selector of the address element.
    address = "span.fz13"
    # Selector of the price element.
    price = "span.AdDecriptionVeh_price__u_N83 > span:nth-child(1)"

    # Row label for the area.
    area_label = "Diện tích"
    # Row label for the number of bedrooms.
    bedroom_num_label = "Số phòng ngủ"
    # Row label for the number of floors.
    floor_num_label = "Tổng số tầng"
    # Row label for the legal document.
    legal_document_label = "Giấy tờ pháp lý"
    # Row label for the building type.
    building_type_label = "Loại hình nhà ở"

# Function to crawl product's url in one home page.
def crawl_url_multithread(input: Queue, output: Queue, page_url: str):
    """Worker: turn page numbers from ``input`` into listing URLs on ``output``.

    Runs until ``input`` yields nothing for a short timeout, so ``input`` must
    be filled before the worker is started.

    Args:
        input: Queue of page numbers to crawl.
        output: Queue that receives the ``href`` of every listing link found.
        page_url: URL template containing ``{page}``, filled with ``str.format``.

    Raises:
        Whatever the crawler raises for a page; the browser opened for that
        page is still closed first.
    """
    # Local import keeps this function self-contained; multiprocessing.Queue
    # and queue.Queue both raise queue.Empty when get() times out.
    from queue import Empty

    while True:
        try:
            # Checking empty() and then calling a blocking get() can hang
            # forever when another worker takes the last item in between;
            # a get() with a timeout cannot.
            page = input.get(timeout=0.2)
        except Empty:
            return
        crawler = SeleniumCrawler()
        try:
            containers = crawler.crawl(page_url.format(page=page), "a.AdItem_adItem__gDDQT")
            # Read the hrefs while the browser session is still open.
            hrefs = [container.get_attribute("href") for container in containers]
        finally:
            crawler.quit()
        # Skip links without an href so later stages never receive None.
        urls = [href for href in hrefs if href]
        for building_detail_url in urls:
            output.put(building_detail_url)
        print(f"Get {len(urls)} urls in page {page}...")

# Function to crawl product's information in a product
def _scrape_listing(driver, url: str, max_retries: int, retry_delay: float):
    """Read one listing page and return its one-row column dict, or None to skip it."""
    # Every column holds a one-element list so the dict maps directly to a
    # single-row DataFrame; "url" follows the same shape as the other columns.
    data = {
        "Giá nhà": [None],
        "Địa chỉ": [None],
        "Diện tích": [None],
        "Số phòng ngủ": [None],
        "Tổng số tầng": [None],
        "Loại hình nhà ở": [None],
        "Giấy tờ pháp lý": [None],
        "url": [url],
    }
    driver.get(url)
    retries = 0
    while True:
        # The page counts as loaded once either the address or the
        # "not found" banner is present.
        address = driver.find_elements(By.CSS_SELECTOR, CrawledElementInfo.address)
        not_found = driver.find_elements(By.CSS_SELECTOR, "div.NotFound_content__KtIbC")
        if address or not_found:
            break
        if retries >= max_retries:
            # Bounded retries: a page that never renders must not spin forever.
            print(f"Give up {url}...")
            return None
        retries += 1
        print(f"Try again {url}...")
        time.sleep(retry_delay)
        driver.refresh()
    # The listing no longer exists: skip it.
    if not_found:
        print(f"Not found {url}...")
        return None
    # Keep only the first line of the address text.
    if address:
        data["Địa chỉ"][0] = address[0].text.split("\n")[0]
    # Keep the part of the price text before the first "-".
    price = driver.find_elements(By.CSS_SELECTOR, CrawledElementInfo.price)
    if price:
        data["Giá nhà"][0] = price[0].text.split("-")[0].strip()
    # Labels whose values are stored; they double as keys of ``data``.
    wanted_labels = {
        CrawledElementInfo.area_label.strip(),
        CrawledElementInfo.bedroom_num_label.strip(),
        CrawledElementInfo.floor_num_label.strip(),
        CrawledElementInfo.legal_document_label.strip(),
        CrawledElementInfo.building_type_label.strip(),
    }
    row_count = len(
        driver.find_elements(By.CSS_SELECTOR, CrawledElementInfo.container_selector)
    )
    # nth-child positions start at 1.
    for index in range(1, row_count + 1):
        # find_elements returns an empty list instead of raising, so a row
        # without the expected label/value span is skipped.
        label_els = driver.find_elements(
            By.CSS_SELECTOR,
            CrawledElementInfo.selector_format_label.format(index=index),
        )
        value_els = driver.find_elements(
            By.CSS_SELECTOR,
            CrawledElementInfo.selector_format_value.format(index=index),
        )
        if not label_els or not value_els:
            continue
        # The label is the text before the first ":".
        label = label_els[0].text.split(":")[0].strip()
        if label in wanted_labels:
            data[label][0] = value_els[0].text
    return data


def crawl_with_process(input: Queue, output: Queue, *, max_retries: int = 5, retry_delay: float = 1.0):
    """Worker: scrape every listing URL taken from ``input``.

    For each URL a new crawler is opened, the listing is read, the resulting
    one-row dict is put on ``output`` and the row is appended to ``data.csv``.
    Listings showing the "not found" banner, or that never load within
    ``max_retries`` refreshes, are skipped. The worker returns once ``input``
    yields nothing for a short timeout, so ``input`` must be filled before the
    worker is started.

    Args:
        input: Queue of listing URLs.
        output: Queue receiving one column dict per scraped listing.
        max_retries: Maximum number of page refreshes before giving up on a URL.
        retry_delay: Seconds to wait before each refresh.

    Raises:
        Whatever the driver raises while reading a page; the browser opened
        for that URL is still closed first.
    """
    # multiprocessing.Queue and queue.Queue both raise queue.Empty on timeout.
    from queue import Empty

    while True:
        try:
            # Checking empty() and then calling a blocking get() can hang
            # forever when another worker takes the last item in between.
            url = input.get(timeout=0.2)
        except Empty:
            return
        crawler = SeleniumCrawler()
        try:
            data = _scrape_listing(crawler.driver, url, max_retries, retry_delay)
        finally:
            # Close the browser even when scraping raised.
            crawler.quit()
        if data is None:
            continue
        # Save into output
        output.put(data)
        df = pd.DataFrame(data)
        # Append the row to data.csv (no header: the file is expected to have one already).
        df.to_csv("data.csv", mode="a", encoding="utf-8-sig", header=False, index=False)
        try:
            crawled = output.qsize()
        except NotImplementedError:
            # multiprocessing.Queue.qsize() is not implemented on macOS.
            crawled = "?"
        print(f"Crawled: {crawled}.{data}")

# Main processing function
def get_data(url: str, from_page: int, to_page: int = 100, *, url_workers: int = 4, detail_workers: int = 12):
    """Crawl listing pages ``from_page``..``to_page`` and collect listing details.

    The work runs in two threaded stages: ``crawl_url_multithread`` workers
    turn page numbers into listing URLs, then ``crawl_with_process`` workers
    turn each URL into one data row. ``data.csv`` is (re)created with only the
    header row before crawling starts.

    Args:
        url: Page URL template containing ``{page}``.
        from_page: First page number to crawl (inclusive).
        to_page: Last page number to crawl (inclusive).
        url_workers: Number of threads collecting listing URLs.
        detail_workers: Number of threads scraping listing pages.

    Returns:
        A dict mapping each column name to a flat list with one value per
        scraped listing.
    """
    # All workers are threads of this process, so the thread-safe queue.Queue
    # is sufficient. Unlike multiprocessing.Queue, its qsize() works on every
    # platform and items are visible as soon as put() returns.
    import queue

    columns = (
        "Giá nhà",
        "Địa chỉ",
        "Diện tích",
        "Số phòng ngủ",
        "Tổng số tầng",
        "Loại hình nhà ở",
        "Giấy tờ pháp lý",
        "url",
    )
    data = {column: [] for column in columns}
    # Create data.csv containing only the header row.
    pd.DataFrame(data).to_csv("data.csv", encoding="utf-8-sig", index=False)

    page_queue = queue.Queue()
    url_queue = queue.Queue()
    row_queue = queue.Queue()
    # Queue exactly the requested pages (previously numbering always began at 1).
    for page in range(from_page, to_page + 1):
        page_queue.put(page)

    # Stage 1: collect listing URLs from the index pages.
    url_threads = [
        Thread(target=crawl_url_multithread, args=(page_queue, url_queue, url))
        for _ in range(url_workers)
    ]
    for url_thread in url_threads:
        url_thread.start()
    for url_thread in url_threads:
        url_thread.join()
    print("Detect {} buidling".format(url_queue.qsize()))

    # Stage 2: scrape every listing URL.
    detail_threads = [
        Thread(target=crawl_with_process, args=(url_queue, row_queue))
        for _ in range(detail_workers)
    ]
    for detail_thread in detail_threads:
        detail_thread.start()
    for detail_thread in detail_threads:
        detail_thread.join()

    # All producers have finished, so draining until empty is race-free here.
    while not row_queue.empty():
        crawled_data = row_queue.get()
        for key, value in crawled_data.items():
            # Workers store each column as a one-element list (the url may be
            # a bare string); flatten so every column is a plain list of values.
            if isinstance(value, list):
                data[key].extend(value)
            else:
                data[key].append(value)
    return data



In [None]:

if __name__ == "__main__":
    # Listing-index URL template; get_data fills "{page}" with each page number.
    url = "https://www.nhatot.com/mua-ban-nha-dat-tp-ho-chi-minh?page={page}"
    # Run the crawl with from_page=1 and to_page=200; the returned column dict is kept in `datas`.
    datas = get_data(url, 1, 200)



Get 20 urls in page 3...
Get 22 urls in page 2...
Get 20 urls in page 4...
Get 25 urls in page 1...
Get 20 urls in page 5...
Get 20 urls in page 6...
Get 20 urls in page 7...
Get 20 urls in page 8...
Get 20 urls in page 9...
Get 20 urls in page 10...
Get 20 urls in page 11...
Get 20 urls in page 12...
Get 20 urls in page 13...
Get 20 urls in page 14...
Get 20 urls in page 16...
Get 20 urls in page 15...
Get 20 urls in page 17...
Get 20 urls in page 19...
Get 20 urls in page 18...
Get 20 urls in page 20...
Get 20 urls in page 21...
Get 20 urls in page 22...
Get 20 urls in page 23...
Get 20 urls in page 24...
Get 20 urls in page 25...
Get 20 urls in page 26...
Get 20 urls in page 27...
Get 20 urls in page 28...
Get 20 urls in page 29...
Get 20 urls in page 30...
Get 20 urls in page 31...
Get 20 urls in page 32...
Get 20 urls in page 33...
Get 20 urls in page 34...
Get 20 urls in page 35...
Get 20 urls in page 36...
Get 20 urls in page 37...
Get 20 urls in page 38...
Get 20 urls in page 3