In [24]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from botasaurus import *
import os
import time
import requests
import re
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from config.config_jupyter import set_wd
set_wd()
from src.constants import zona_prop_url

In [2]:
# Very large page index. get_url_list() requests the results page with the
# number it is given (following redirects) and reads the last page number off
# the final URL; presumably this is the value meant to be passed to it -- no
# call is visible in this part of the notebook, so confirm.
max_number = 1_000_000

In [3]:
def get_page_number_url(number:int) -> str:
    """Return the ZonaProp listing URL for one page of the Capital Federal rental results.

    Args:
        number: Page index interpolated into the URL path.

    Returns:
        The absolute URL of that results page.
    """
    page_url = f'https://www.zonaprop.com.ar/inmuebles-alquiler-capital-federal-pagina-{number}.html'
    return page_url

def get_url_list(max_number:int) -> list[str]:
    """Build the URL of every results page, from page 1 through the last one.

    The last page is discovered by requesting page ``max_number`` with
    redirects enabled and reading the page number out of the final URL.
    Presumably the site redirects an out-of-range page to the last real one;
    confirm against the live site.

    Args:
        max_number: Page index to request when probing for the last page.

    Returns:
        The page URLs for pages 1..last, in ascending order.

    Raises:
        RuntimeError: If the final URL does not end in ``<digits>.html``, so
            the last page number cannot be determined.
    """
    request = AntiDetectRequests()
    response = request.get(get_page_number_url(max_number), allow_redirects=True)
    last_page_url = response.url
    match = re.search(r'(\d+)\.html$', last_page_url)
    if match is None:
        # The old code only printed here and then crashed with an
        # UnboundLocalError on `last_page_number`; fail with a clear error.
        raise RuntimeError(
            f"Could not find last webpage (final URL: {last_page_url!r}), "
            "try again in a few minutes"
        )
    last_page_number = int(match.group(1))
    return [get_page_number_url(i) for i in range(1, last_page_number + 1)]

In [44]:
def parse_property_listings(driver):
    """Parse every property card on the page currently loaded in ``driver``.

    Args:
        driver: Selenium-style driver exposing ``find_elements``.

    Returns:
        A list with one ``parse_property`` result per card, in the order the
        cards were found.
    """
    # NOTE(review): these hashed class names look build-generated and may
    # change between site releases -- confirm the selector still matches.
    property_elements = driver.find_elements(By.CLASS_NAME,'sc-1tt2vbg-4.dFNvko' )
    # parse_property takes a wait object as its second argument. It used to be
    # referenced here without ever being created (NameError on the first
    # card); build it once and share it across all cards.
    wait = WebDriverWait(driver, 10)
    properties = []
    for property_element in property_elements:
        print(property_element)
        properties.append(parse_property(property_element, wait))
    return properties

def _text_or_nan(property_element, css_selector):
    """Return (and print) the text of the first descendant matching ``css_selector``, or NaN if none exists."""
    try:
        text = property_element.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return np.nan
    print(text)
    return text


def parse_property(property_element, wait):
    """Extract the fields of one property card into a plain dict.

    Every field that cannot be found on the card is reported as ``np.nan``
    (``Features`` becomes an empty list, ``Has_photo`` becomes ``False``).
    Found values are also printed, in the same order as the returned keys.

    Args:
        property_element: Selenium-style element for a single listing card.
        wait: Not used by this function; kept so existing callers that pass
            a wait object keep working.

    Returns:
        dict with the keys ``Price``, ``Location``, ``Address``,
        ``Has_photo``, ``Features``, ``Summarize``, ``Description``,
        ``Expensas`` and ``Link``.
    """
    price = _text_or_nan(property_element, 'div[data-qa="POSTING_CARD_PRICE"]')
    location = _text_or_nan(property_element, 'div[data-qa="POSTING_CARD_LOCATION"]')
    address = _text_or_nan(property_element, 'div[class="sc-ge2uzh-0 eXwAuU"]')

    photo_elements = property_element.find_elements(By.TAG_NAME, 'img')
    # get_attribute returns None when <img> has no src; treat that as "no match"
    # instead of crashing on None.endswith.
    has_photo = any(
        (photo.get_attribute('src') or '').endswith('isFirstImage=true')
        for photo in photo_elements
    )
    print(has_photo)

    # find_elements returns an empty list when nothing matches (it does not
    # raise NoSuchElementException), so no try/except is needed here.
    features_elements = property_element.find_elements(By.CSS_SELECTOR, 'div[data-qa="POSTING_CARD_FEATURES"] span')
    features = [feature_element.text for feature_element in features_elements]
    print(features)

    summarize = _text_or_nan(property_element, 'a.sc-i1odl-12.EWzaP')
    description = _text_or_nan(property_element, 'div[data-qa="POSTING_CARD_DESCRIPTION"]')
    expensas = _text_or_nan(property_element, 'div[data-qa="expensas"]')

    try:
        ap_link_element = property_element.find_element(By.CSS_SELECTOR, 'div[data-qa="posting PROPERTY"]')
    except NoSuchElementException:
        link = np.nan
    else:
        ap_link = ap_link_element.get_attribute('data-to-posting')
        if ap_link:
            link = 'https://www.zonaprop.com.ar' + ap_link
            print(link)
        else:
            # Element exists but carries no relative link to append.
            link = np.nan

    # This return used to be indented inside the last `except`, so the
    # function returned None whenever the link element WAS found.
    return {
        'Price': price,
        'Location': location,
        'Address': address,
        'Has_photo': has_photo,
        'Features': features,
        'Summarize': summarize,
        'Description': description,
        'Expensas': expensas,
        'Link': link,
    }

In [55]:
def _scroll_until_page_stops_growing(driver, pause_seconds):
    """Scroll to the bottom repeatedly until document.body.scrollHeight stops changing."""
    previous_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Jump to the bottom, then give the page time to load more content.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            return
        previous_height = current_height


@browser(parallel=1, cache=False, data=["https://www.zonaprop.com.ar/inmuebles-alquiler-capital-federal-pagina-1.html"])
def scrap_property_info_zonaprop(driver: AntiDetectDriver, link:str):
    """Load one results page, scroll it to the end and parse its property cards.

    Presumably the ``@browser`` decorator supplies ``driver`` and calls this
    function once per entry of its ``data`` list -- confirm against the
    botasaurus documentation.

    Args:
        driver: Browser driver used to open and scroll the page.
        link: URL of the results page to scrape.

    Returns:
        The list produced by ``parse_property_listings`` for that page.
    """
    print("inicio programa")
    started_at = time.time()
    collected = []
    finished = False
    while not finished:
        try:
            driver.get(link)
            _scroll_until_page_stops_growing(driver, 20)
            collected.extend(parse_property_listings(driver))
            print(collected)
            finished = True
        except requests.exceptions.HTTPError as e:
            # Only this exception type triggers a retry; anything else propagates.
            print(f"HTTPError occurred: {e}. Retrying in 15 minutes.")
            time.sleep(15 * 60)

    print(link, time.time() - started_at)
    print(collected)
    return collected


In [None]:
# Run the scraper. No arguments are passed here; presumably the @browser
# decorator supplies `driver` and takes the `link` values from its `data=`
# list (confirm against the botasaurus documentation).
scrap_property_info_zonaprop()