In [1]:
import os
import time
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup

# To resolve the issue of "Element not interactable" error
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [8]:
def initialize_browser():
    options = webdriver.FirefoxOptions()
    browser = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)
    return browser

# def load_home_page(browser, url):
#     browser.get(url)
#     time.sleep(2) # Necessary for the cookies pop-up to load
#     try:
#         cookie_button = browser.find_element(By.ID, "onetrust-accept-btn-handler")
#         cookie_button.click()
#     except NoSuchElementException:
#         print("No cookie pop-up found. Continuing without accepting cookies.")
#     return BeautifulSoup(browser.page_source, 'html.parser')

def load_home_page(browser, url, is_cookies_accepted):
    browser.get(url)
    if not is_cookies_accepted:
        try:
            # Wait up to 10 seconds for the cookie button to appear
            cookie_button = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
            )
            cookie_button.click()
        except NoSuchElementException:
            print("No cookie pop-up found. Continuing without accepting cookies.")
    return BeautifulSoup(browser.page_source, 'html.parser')

def extract_ads_soup(home_page_soup):
    return home_page_soup.find_all('section', class_='e1tblfro1 css-hqx1d9 e12fn6ie0')

def process_listing(browser, ad_url_fragment, domain_otoDom):
    ad_url = domain_otoDom + ad_url_fragment.find('a')['href']
    if 'otodom.pl/pl/oferta' in ad_url:
        return extract_listing_details(browser, ad_url)
    return None

def extract_listing_details(browser, ad_url):
    time_scraping = []
    map_link = []
    price = []
    title = []
    surface_area = []
    num_of_room = []
    floor = []
    form_of_property = []
    finish_condition = []
    balcony_garden_terrace = []
    parking_space = []
    heating = []
    ad_link = []
    otodom_id = []
    
    browser.execute_script("window.open('');")
    browser.switch_to.window(browser.window_handles[1])
    browser.get(ad_url)

    time_scraping.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    ad_link.append(ad_url)
    ad_url_parts = ad_url.split('/')
    otodom_id.append(ad_url_parts[-1])

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    ad_soup = BeautifulSoup(browser.page_source, 'html.parser')

    google_map_elem = ad_soup.find('a', title="Zgłoś do Google błędy na mapie drogowej lub na zdjęciach.")
    wait = WebDriverWait(browser, 10)
    map_button = wait.until(EC.element_to_be_clickable((By.ID, 'map')))
    map_button = browser.find_element(By.ID, "map")
    map_button.click()
    while google_map_elem is None:
        time.sleep(2)
        ad_resoup = BeautifulSoup(browser.page_source, 'html.parser')
        google_map_elem = ad_resoup.find('a', title="Zgłoś do Google błędy na mapie drogowej lub na zdjęciach.")
        if google_map_elem:
            break
    map_link.append(google_map_elem['href'])
    
    price_element = ad_soup.find('strong', class_='css-t3wmkv e9aa0kv0')
    if price_element:
        price.append(price_element.text.strip())
    else:
        ask_price_button = ad_soup.find('button', class_='e13hy00s0 css-u9h2l8 e1rtjcnh1')
        if ask_price_button and ask_price_button.text.strip() == 'Zapytaj o cenę':
            price.append('Price on request')
        else:
            price.append('Price not available')

    title_elem = ad_soup.find('h1', {'data-cy': 'adPageAdTitle'})
    title.append(title_elem.text if title_elem else None)

    surface_area_elem = ad_soup.find('div', {'data-testid': 'table-value-area'})
    surface_area.append(surface_area_elem.text if surface_area_elem else None)

    # num_of_room_elem = ad_soup.find('a', {'data-cy': 'ad-information-link'}) # original, but has very low hit-rate
    num_of_room_elem = ad_soup.find('div', {'data-testid' : 'table-value-rooms_num'})
    num_of_room.append(num_of_room_elem.text if num_of_room_elem else None)

    floor_elem = ad_soup.find('div', {'data-testid': 'table-value-floor'})
    floor.append(floor_elem.text if floor_elem else None)

    form_of_property_elem = ad_soup.find('div', {'data-testid': 'table-value-building_ownership'})
    form_of_property.append(form_of_property_elem.text if form_of_property_elem else None)

    finish_condition_elem = ad_soup.find('div', {'data-testid': 'table-value-construction_status'})
    finish_condition.append(finish_condition_elem.text if finish_condition_elem else None)

    balcony_garden_terrace_elem = ad_soup.find('div', {'data-testid': 'table-value-outdoor'})
    balcony_garden_terrace.append(balcony_garden_terrace_elem.text if balcony_garden_terrace_elem else None)

    parking_space_elem = ad_soup.find('div', {'data-testid': 'table-value-car'})
    parking_space.append(parking_space_elem.text if parking_space_elem else None)

    heating_elem = ad_soup.find('div', {'data-testid': 'table-value-heating'})
    heating.append(heating_elem.text if heating_elem else None)

    browser.close()
    browser.switch_to.window(browser.window_handles[0])
    
    return {
        'time_scraping': time_scraping,
        'otodom_id': otodom_id,
        'price': price,
        'title': title,
        'surface_area': surface_area,
        'num_of_room': num_of_room,
        'floor': floor,
        'form_of_property': form_of_property,
        'finish_condition': finish_condition,
        'balcony_garden_terrace': balcony_garden_terrace,
        'parking_space': parking_space,
        'heating': heating,
        'ad_link': ad_link,
        'map_link': map_link,
    }

def save_to_csv(data):
    df = pd.DataFrame(data)
    file_exists = os.path.isfile('otodomScrapRAWData.csv')
    df.to_csv('otodomScrapRAWData.csv', mode='a', index=False, header=not file_exists)

def main():
    domain_otodom = 'https://www.otodom.pl'
    total_pages = 588

    all_data = {
        'time_scraping': [],
        'otodom_id': [],
        'price': [],
        'title': [], # '5-pokojowe mieszkanie 115m2 + ogródek Bez Prowizji'
        'surface_area': [],
        'num_of_room': [],
        'floor': [],
        'form_of_property': [],
        'finish_condition': [],
        'balcony_garden_terrace': [],
        'parking_space': [],
        'heating': [],
        'ad_link': [],
        'map_link': [],
    }

    browser = initialize_browser()
    is_cookies_accepted = False
    for pageNum in range(155, total_pages+1):
        home_page_url = f"{domain_otodom}/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa/warszawa/warszawa?distanceRadius=25&limit=36&ownerTypeSingleSelect=ALL&by=DEFAULT&direction=DESC&viewType=listing&page={pageNum}"
        home_page_soup = load_home_page(browser, home_page_url, is_cookies_accepted)
        is_cookies_accepted = True
        one_page_ads_soup = extract_ads_soup(home_page_soup)

        for ad in one_page_ads_soup:
            try: 
                ad_data = process_listing(browser, ad, domain_otodom)
                if ad_data:
                    for key in all_data.keys():
                        all_data[key].extend(ad_data[key])
                    save_to_csv(all_data)  # Save after each ad is processed
                    # Clear all_data after saving it
                    for key in all_data.keys():
                        all_data[key].clear()
            except Exception:
                print ('Exception encountered on page {pageNum}.\n')
    browser.quit()

# Run the main function
main()


Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.

Exception encountered on page {pageNum}.



In [24]:
otodomScrapRAWData = pd.read_csv('otodomScrapRAWData.csv')
otodomScrapRAWData.head()

Unnamed: 0,time_scraping,otodom_id,price,title,surface_area,num_of_room,floor,form_of_property,finish_condition,balcony_garden_terrace,parking_space,heating,ad_link,map_link
0,2024-06-06 01:31:51,2-pokojowe-mieszkanie-35m2-loggia-bezposrednio...,Zapytaj o cenę,2-pokojowe mieszkanie 35m2 + loggia Bezpośrednio,"35,41 m²",,3/8,pełna własność,do wykończenia,,garaż/miejsce parkingowe,,https://www.otodom.pl/pl/oferta/2-pokojowe-mie...,"https://www.google.com/maps/@52.172792,20.9941..."
1,2024-06-06 01:31:56,bezposr-2pok-metro-wilanowska-cena-do-negocjac...,970 000 zł,Bezpośr. 2pok. Metro Wilanowska CENA DO NEGOCJ...,"48,55 m²",,7/15,pełna własność,do zamieszkania,,garaż/miejsce parkingowe,miejskie,https://www.otodom.pl/pl/oferta/bezposr-2pok-m...,"https://www.google.com/maps/@52.1823249,21.025..."
2,2024-06-06 01:32:01,2-pokoje-ul-lindego-metro-wawrzyszew-ID4qUHs,629 000 zł,"2 pokoje, ul. Lindego, Metro Wawrzyszew","36,5 m²",,parter/10,pełna własność,do zamieszkania,,,miejskie,https://www.otodom.pl/pl/oferta/2-pokoje-ul-li...,"https://www.google.com/maps/@52.2869133,20.940..."
3,2024-06-06 01:32:08,piekne-ciche-100m-w-kamienicy-powisle-metro-80...,2 480 000 zł,"Piękne, ciche 100m w kamienicy. Powiśle metro ...","99,69 m²",,3/4,spółdzielcze wł. prawo do lokalu,do zamieszkania,,,miejskie,https://www.otodom.pl/pl/oferta/piekne-ciche-1...,"https://www.google.com/maps/@52.23805,21.02935..."
4,2024-06-06 01:32:12,niski-czynsz-ip-winda-ID4qWEi,560 000 zł,Niski Czynsz // Ip. // Winda,50 m²,,1/5,,,,garaż/miejsce parkingowe,,https://www.otodom.pl/pl/oferta/niski-czynsz-i...,"https://www.google.com/maps/@52.4465078,20.692..."
