In [6]:
import os
import time
from urllib.parse import urljoin

import cloudscraper
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

from src import zona_prop_url

In [7]:
# Chrome command-line flags: run without a GUI, without the sandbox, and
# without relying on /dev/shm.
# NOTE(review): nothing in the code visible here passes `options` to a
# driver -- confirm it is consumed elsewhere.
options = Options()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_chrome_flag)

class Scraper:
    """Own a single Chrome WebDriver session that is created on first use.

    The driver is built by ``inicio_driver`` the first time it is called and
    kept on ``self.driver`` so that later calls reuse the same browser.
    """

    def __init__(self):
        # No browser is started until ``inicio_driver`` is first called.
        self.driver = None

    @staticmethod
    def _download_prefs():
        """Return the Chrome preferences that control file downloads."""
        return {
            # Save downloads into the "data" folder of the working directory.
            'download.default_directory': os.path.join(os.getcwd(), "data"),
            # Chrome preference keys are namespaced: the bare
            # "directory_upgrade" key is not a Chrome preference, so the
            # setting only takes effect under the "download." prefix.
            'download.directory_upgrade': True,
        }

    def inicio_driver(self, link: str):
        """Open *link* in the shared Chrome session and return the driver.

        On the first call a Chrome WebDriver is started with the download
        preferences above, patched with ``selenium_stealth`` and maximized.
        Every call, including the first, then navigates to *link*.

        Args:
            link: URL to load in the browser.

        Returns:
            The WebDriver instance stored on ``self.driver``.
        """
        if self.driver is None:
            service = Service(ChromeDriverManager().install())
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_experimental_option("prefs", self._download_prefs())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            stealth(
                self.driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
            )
            self.driver.maximize_window()
        self.driver.get(link)
        return self.driver

def _select_text(parent, selector):
    """Return the text of the first *selector* match inside *parent*, or NaN.

    NaN marks a listing that does not contain the requested element.
    """
    match = parent.select_one(selector)
    return match.text if match is not None else np.nan


scraper = Scraper()
driver = scraper.inicio_driver(zona_prop_url)
# NOTE(review): the browser session is not closed in the code visible here --
# confirm it is quit elsewhere once scraping is finished.

# One list per output column. Every listing appends exactly one entry to each
# list, so the columns stay aligned when the DataFrame is built.
prices = []
locations = []
addresses = []
features = []
descriptions = []
expensas = []

# URLs already requested; stops the loop if a "next" link ever leads back to
# a page that was already scraped.
visited_urls = {zona_prop_url}

# Scrape the page currently loaded in the browser, then follow the "next"
# link until there is none.
while True:
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Each matching div is one listing card.
    property_elements = soup.select('div.sc-1tt2vbg-4.dFNvko')

    for property_element in property_elements:
        prices.append(_select_text(property_element, 'div[data-qa="POSTING_CARD_PRICE"]'))
        locations.append(_select_text(property_element, 'div[data-qa="POSTING_CARD_LOCATION"]'))
        addresses.append(_select_text(property_element, 'div[class="sc-ge2uzh-0 eXwAuU"]'))
        descriptions.append(_select_text(property_element, 'div[data-qa="POSTING_CARD_DESCRIPTION"]'))
        expensas.append(_select_text(property_element, 'div[data-qa="expensas"]'))

        # NOTE(review): this descendant selector returns every <span> under
        # the features container, nested ones included, which may be why a
        # value can show up twice in the resulting list -- confirm whether
        # that is intended before relying on list positions.
        features_elements = property_element.select('div[data-qa="POSTING_CARD_FEATURES"] span')
        features.append([feature_element.text for feature_element in features_elements])

    # Look for the pagination link; .get() avoids a KeyError when the anchor
    # has no href attribute.
    next_page = soup.select_one('a[data-qa="PAGING_NEXT"]')
    next_href = next_page.get('href') if next_page is not None else None
    if not next_href:
        # No more pages.
        break

    # Resolve the href against the listing URL. Plain string concatenation
    # produces a malformed address whenever the href is root-relative
    # ("/page-2.html") or already absolute.
    full_url = urljoin(zona_prop_url, next_href)
    if full_url in visited_urls:
        break
    visited_urls.add(full_url)
    driver.get(full_url)

# Assemble the collected columns into a single table.
df = pd.DataFrame({
    'Price': prices,
    'Location': locations,
    'Address': addresses,
    'Features': features,
    'Description': descriptions,
    'Expensas': expensas
})

df


Unnamed: 0,Price,Location,Address,Features,Description,Expensas
0,$ 1.100.000,"Belgrano C, Belgrano",Libertador al 6200,"[ 75 m² , 75 m² , 70 m² , 70 m² , 1 baño ,...",Hermosa oficina con muy buena luz natural por ...,$ 80.000 Expensas
1,$ 687.500,"Centro / Microcentro, Capital Federal",Maipu al 700,"[ 491 m² , 491 m² , 491 m² , 491 m² , 3 ba...","Edificio de ss, pb y 10 pisos. Ubicado sobre l...",$ 271.952 Expensas
2,USD 5.100,"Barrio Norte, Capital Federal",Av. Las Heras al 3300,"[ 250 m² , 250 m² , 250 m² , 250 m² , 1 am...","Presentamos Edificio "" oficinas las heras "". E...",$ 50.000 Expensas
3,$ 100,"Belgrano, Capital Federal",3 de Febrero al 2100,"[ 150 m² , 150 m² , 554 m² , 554 m² , 7 ba...",Alquiler de inigualables consultorios en 4 Pla...,
4,$ 1.200.000,"Recoleta, Capital Federal",Departamento en Alquiler en Recoleta Furnished...,"[ 45 m² , 45 m² , 45 m² , 45 m² , 1 dorm. ...",Deco recoletafurnished by Armani CasaPiscina d...,
5,$ 700.000,"Barracas, Capital Federal",Av Montes de oca al 800,"[ 370 m² , 370 m² , 370 m² , 370 m² , 2 ba...",Casona antigua con multiples ambientes. Ingres...,
6,USD 5.000,"Parque Patricios, Capital Federal",Av. Colonia al 100,"[ 500 m² , 500 m² , 400 m² , 400 m² , 4 ba...",Edificio de Oficinas entre medianeras de pb y ...,
7,$ 833.340,"Palermo Soho, Palermo",Oro al 2100,"[ 114 m² , 114 m² , 114 m² , 114 m² , 1 am...",** El precio corresponde a tipología 07 (114 m...,$ 39.000 Expensas
8,USD 1.500,"Recoleta, Capital Federal",Av. Alvear 1800,"[ 120 m² , 120 m² , 120 m² , 120 m² , 4 am...",Superficie: 120 m2. Con muebles. Departamento ...,$ 120.000 Expensas
9,USD 1.400,"Puerto Madero, Capital Federal",Venezuela al 100,"[ 200 m² , 200 m² , 1 amb. , 1 amb. , 2 ba...",Excelente oficina en edificio corporativo de c...,$ 96.600 Expensas
