# Level 3: Parsing real-estate data from OLX (nedvizhimost) and writing it to a Google Sheet

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

from dataclasses import dataclass, fields
from urllib.parse import urljoin

# Root URL of the OLX real-estate section; the scrapers in this notebook
# start here and join page queries ("?page=N") onto it.
BASE_URL = "https://www.olx.ua/uk/nedvizhimost/"

# Model for estate parsing

In [95]:
@dataclass
class Estate:
    title: str
    price: str
    floor: int
    storey: int
    locality: str
    area: int | float


PRODUCT_FIELDS = [field.name for field in fields(Estate)]

# getting links to all estates

In [3]:
def get_links_on_page(driver: webdriver) -> list:
    """Return the ``href`` attribute of each ``css-rc5s2u`` element on the page.

    Works on whatever page is currently loaded in *driver*; the result keeps
    the order in which the elements were found.
    """
    hrefs = []
    for anchor in driver.find_elements(By.CLASS_NAME, "css-rc5s2u"):
        hrefs.append(anchor.get_attribute("href"))
    return hrefs


def get_estate_links() -> list:
    """Collect listing links from every results page under ``BASE_URL``.

    Opens a Chrome session, reads the page count from the last
    ``pagination-item`` element of the first page and then visits pages
    ``2..pages_count``, gathering links with ``get_links_on_page``.

    Returns:
        The links from all visited pages, in page order.
    """
    links = []
    with webdriver.Chrome() as driver:
        driver.get(BASE_URL)
        pagination = driver.find_elements(By.CLASS_NAME, "pagination-item")
        # find_elements returns an empty list when nothing matches; treat
        # that as a single page instead of failing on the [-1] lookup.
        pages_count = int(pagination[-1].text) if pagination else 1
        links.extend(get_links_on_page(driver))
        for page in range(2, pages_count + 1):
            driver.get(urljoin(BASE_URL, f"?page={page}"))
            links.extend(get_links_on_page(driver))
    return links


# Runs the Selenium crawl when the cell executes; get_estates() reads this list.
estate_detail_links = get_estate_links()

# functions for parsing a single estate

In [40]:
def get_block_element(estate: webdriver, word: str, count_word: int) -> str:
    """Return one space-separated token from a labelled attribute row.

    Looks for ``li`` elements with class ``css-1r0si1e`` whose ``p`` child
    starts with *word* and splits the text of the first match on single
    spaces.

    Args:
        estate: Driver with the listing page loaded.
        word: Label the row starts with, e.g. ``"Поверх:"``.
        count_word: Index of the token to return.

    Returns:
        The requested token, or ``"?"`` when no row matches or the row has
        too few tokens.
    """
    elements = estate.find_elements(
        By.XPATH,
        f'//li[@class="css-1r0si1e" and starts-with(p, "{word}")]'
    )
    if not elements:
        return "?"

    tokens = elements[0].text.split(" ")
    try:
        return tokens[count_word]
    except IndexError:
        # A row holding fewer tokens than expected would otherwise abort the
        # whole crawl: get_estates() only handles NoSuchElementException.
        return "?"


def parse_single_estate(estate: webdriver) -> Estate:
    """Build an ``Estate`` from the listing page currently loaded in *estate*.

    The floor, storey and area come from the labelled attribute rows via
    ``get_block_element``; title, price and locality are the text of fixed
    CSS classes. ``find_element`` raises when one of those classes is absent.
    """
    # label and token index of each attribute row, keyed by Estate field
    attribute_rows = {
        "floor": ("Поверх:", 1),
        "storey": ("Поверховість:", 1),
        "area": ("Загальна площа:", 2),
    }
    details = {
        name: get_block_element(estate, label, position)
        for name, (label, position) in attribute_rows.items()
    }

    def text_of(class_name: str) -> str:
        return estate.find_element(By.CLASS_NAME, class_name).text

    return Estate(
        title=text_of("css-1juynto"),
        price=text_of("css-12vqlj3"),
        locality=text_of("css-1cju8pu"),
        **details,
    )

# getting all estates

In [101]:
from selenium.common import NoSuchElementException


def get_estates(links: list | None = None, limit: int | None = 50) -> list[Estate]:
    """Parse listing pages with Selenium and return the resulting estates.

    Args:
        links: Listing URLs to visit. Defaults to the module-level
            ``estate_detail_links``.
        limit: Maximum number of links to visit; ``None`` visits all of
            them. Defaults to 50.

    Returns:
        One ``Estate`` per page that was parsed. Links whose page raises
        ``NoSuchElementException`` are reported with ``print`` and skipped.
    """
    if links is None:
        links = estate_detail_links

    all_estates = []
    with webdriver.Chrome() as driver:
        for link in links[:limit]:
            try:
                driver.get(link)
                all_estates.append(parse_single_estate(driver))
            except NoSuchElementException:
                print(f"Link {link} is not available")

    return all_estates


# Run the Selenium parser; the bare name on the last line makes the notebook
# display the resulting list.
estates = get_estates()
estates

Link https://www.olx.ua/d/uk/obyavlenie/2-h-kmnatna-kvartira-IDSkw1A.html is not available


[Estate(title='Оренда будинку 230кв.м.12сот, р-н Сирець, поруч ЖК Сирецькі Сади', price='1 500 $', floor='?', storey='4', locality='Київ, Подільський', area='233'),
 Estate(title='Квартира с новым ремонтом - Площадь Конституции', price='450 грн.', floor='3', storey='?', locality='Харків, Київський', area='30'),
 Estate(title='Лучший хостел Киева . Посуточно , понедельно , помесячно . Долгосрочно', price='60 грн.', floor='?', storey='?', locality='Київ, Святошинський', area='?'),
 Estate(title='Продам 3к квартира на пр. Гагарина Соборный район', price='43 000 $', floor='5', storey='5', locality='Дніпро, Соборний', area='57'),
 Estate(title='Сдам дом, дачу на длительный срок.', price='14 000 грн.', floor='?', storey='2', locality='Київ, Дарницький', area='80'),
 Estate(title='Дача для переселенцев', price='100 $', floor='?', storey='2', locality='Суми,', area='50'),
 Estate(title='Сдам 2-х уровневую студию на Одесской', price='5 500 грн.', floor='4', storey='4', locality='Харків, Слобідс

# Link collection re-implemented with BeautifulSoup, because it is faster

In [57]:
import requests
from bs4 import BeautifulSoup


def soup_get_links_and_locations_on_page(soup: BeautifulSoup) -> list:
    """Return ``(href, location)`` pairs for the ``.css-rc5s2u`` elements in *soup*.

    The location is the part of the element's ``.css-1a4brun`` text that
    precedes the first ``" -"``.

    Returns:
        A list of ``(href, location)`` tuples in document order. The location
        has surrounding whitespace removed and is ``"?"`` when the element
        has no ``.css-1a4brun`` descendant.
    """
    links_and_locations = []
    for card in soup.select(".css-rc5s2u"):
        location_node = card.select_one(".css-1a4brun")
        if location_node is None:
            # One card without a location must not abort the whole crawl.
            location = "?"
        else:
            # strip(): the text before " -" can end in a space, which would
            # otherwise be carried into the stored locality.
            location = location_node.text.split(" -")[0].strip()
        links_and_locations.append((card["href"], location))
    return links_and_locations


def soup_get_estate_links() -> list:
    """Collect ``(href, location)`` pairs from every results page under ``BASE_URL``.

    Downloads the first page with ``requests``, reads the page count from the
    last ``.pagination-item`` element and then downloads pages
    ``2..pages_count``.

    Returns:
        The pairs produced by ``soup_get_links_and_locations_on_page`` for
        all pages, in page order.
    """
    first_page = BeautifulSoup(requests.get(BASE_URL).content, "html.parser")
    pagination = first_page.select(".pagination-item")
    # select() returns an empty list when nothing matches; treat that as a
    # single page instead of failing on the [-1] lookup.
    pages_count = int(pagination[-1].text) if pagination else 1

    links_and_locs = []
    links_and_locs.extend(soup_get_links_and_locations_on_page(first_page))
    for page_number in range(2, pages_count + 1):
        # Keep the page number and the response body in separate names.
        html = requests.get(urljoin(BASE_URL, f"?page={page_number}")).content
        soup = BeautifulSoup(html, "html.parser")
        links_and_locs.extend(soup_get_links_and_locations_on_page(soup))

    return links_and_locs

# Runs the requests/BeautifulSoup crawl when the cell executes.
soup_estate_detail = soup_get_estate_links()

# parsing estate details with BeautifulSoup, which is faster

In [63]:
def soup_get_block_element(estate: BeautifulSoup, word: str, count_word: int) -> str:
    """Return one space-separated token from a labelled attribute row.

    Selects ``p`` children of ``.css-1r0si1e`` elements whose text contains
    *word* and splits the text of the first match on single spaces.

    Args:
        estate: Parsed listing page.
        word: Label to look for, e.g. ``"Поверх:"``.
        count_word: Index of the token to return.

    Returns:
        The requested token, or ``"?"`` when no row matches or the row has
        too few tokens.
    """
    # ":-soup-contains" is Soup Sieve's current name for the deprecated
    # ":contains" pseudo-class.
    elements = estate.select(
        f'.css-1r0si1e > p:-soup-contains("{word}")'
    )
    if not elements:
        return "?"

    tokens = elements[0].text.split(" ")
    try:
        return tokens[count_word]
    except IndexError:
        # A row holding fewer tokens than expected would otherwise abort the
        # whole crawl: soup_get_estates() only handles AttributeError.
        return "?"


def soup_parse_single_estate(estate: BeautifulSoup, location: str) -> Estate:
    """Build an ``Estate`` from a parsed listing page.

    The floor, storey and area come from the labelled attribute rows via
    ``soup_get_block_element``; title and price are the text of fixed CSS
    classes, and *location* is stored unchanged as the locality. A missing
    title or price node surfaces as ``AttributeError`` (``.text`` on ``None``).
    """
    floor, storey, area = (
        soup_get_block_element(estate, label, position)
        for label, position in (
            ("Поверх:", 1),
            ("Поверховість:", 1),
            ("Загальна площа:", 2),
        )
    )
    title = estate.select_one(".css-1juynto").text
    price = estate.select_one(".css-12vqlj3").text

    return Estate(title, price, floor, storey, location, area)


def soup_get_estates(soup_estates: list) -> list[Estate]:
    """Download and parse every listing in *soup_estates*.

    Args:
        soup_estates: Sequence of ``(href, location)`` pairs; each href is
            resolved against ``BASE_URL`` before it is requested.

    Returns:
        One ``Estate`` per page that was parsed. Pages that raise
        ``AttributeError`` during parsing are reported with ``print`` and
        skipped.
    """
    all_estates = []
    for element in soup_estates:
        href, location = element[0], element[1]
        url = urljoin(BASE_URL, href)
        try:
            page = requests.get(url).content
            soup = BeautifulSoup(page, "html.parser")
            all_estates.append(soup_parse_single_estate(soup, location))
        except AttributeError:
            # Report the URL that was actually requested; gluing the site
            # root onto a root-relative href by hand yields a double slash.
            print(f"Link {url} is not available")

    return all_estates


# Run the BeautifulSoup parser over the collected links; the bare name on the
# last line makes the notebook display the resulting list.
soup_estates_info = soup_get_estates(soup_estate_detail)
soup_estates_info

Link /d/uk/obyavlenie/odnokmnatna-kvartira-rpn-dk-sinergya-IDUmlmc.html is not available
Link /d/uk/obyavlenie/odnokmnatna-kvartira-rpn-dk-sinergya-IDUmlmc.html is not available
Link /d/uk/obyavlenie/dom-350-kv-m-romankov-rayon-solnechnaya-dolina-uchastok-25-sotok-IDLRZ56.html is not available


[Estate(title='Сдам квартиру м. Ботанический сад ул. Авиахимическая, 7', price='3 700 грн.', floor='3', storey='8', locality='Харків, Шевченківський ', area='20'),
 Estate(title='Посуточно квартира  Таирово, Левитана, пр-к Небесной Сотни, ТРЦ Сити-ц', price='750 грн.', floor='14', storey='?', locality='Одеса, Київський', area='45'),
 Estate(title='Продается ділянка 5 сот. 10 км від Киева. 800м. Сільпо', price='16 500 $', floor='?', storey='?', locality='Білогородка', area='?'),
 Estate(title='Реально є! Оренда 1 к. видової квартири на березі Дніпра', price='9 000 грн.', floor='7', storey='16', locality='Черкаси', area='37'),
 Estate(title='Продаємо офіс в центрі Києва', price='105 000 $', floor='1', storey='3', locality='Київ, Голосіївський', area='45'),
 Estate(title='Хостел правый берег Киев! Комната на 4 человека!', price='170 грн.', floor='?', storey='?', locality='Київ, Печерський', area='?'),
 Estate(title='Оренда трикімнатної квартири. Локація Фединця .', price='10 000 грн.', fl

# writing estates to a Google Sheet

In [99]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials


def connection() -> gspread:
    """Open the first worksheet of the ``Parse_estates`` spreadsheet.

    Authorizes gspread with the service-account key file ``keys.json``.

    Returns:
        ``sheet1`` of the ``Parse_estates`` workbook.
    """
    credentials = ServiceAccountCredentials.from_json_keyfile_name("keys.json")
    client = gspread.authorize(credentials)
    return client.open("Parse_estates").sheet1


def write_to_google_sheet(data: list[Estate]) -> None:
    """Replace the sheet's contents with a header row plus one row per estate.

    The worksheet returned by ``connection()`` is cleared, ``PRODUCT_FIELDS``
    is written as the header, and the estates are inserted starting at row 2
    with their values in header order.

    Args:
        data: Estates to write; an empty list leaves only the header row.
    """
    sheet = connection()
    sheet.clear()

    header_row = PRODUCT_FIELDS
    sheet.append_row(header_row)

    rows_to_insert = [
        [getattr(estate, name) for name in header_row] for estate in data
    ]
    # Nothing to insert for empty input; skip the request instead of asking
    # the API to insert zero rows.
    if rows_to_insert:
        sheet.insert_rows(rows_to_insert, row=2)

# Publish the BeautifulSoup results to the sheet.
write_to_google_sheet(soup_estates_info)