In [218]:
import os
from datetime import datetime

import pandas as pd
import yaml

In [168]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [169]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [170]:
# Start the Edge WebDriver session used by every cell below, pointing it at the
# driver binary stored under ./utils.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object — confirm against the installed Selenium version.
edge:object = webdriver.Edge(executable_path="./utils/msedgedriver.exe")

### Functions:

In [171]:
def resources(name:str) -> dict[str, str]:
    """Return one named section of the ``xpath.yml`` file.

    The file is parsed with ``yaml.safe_load`` and the entry stored under
    ``name`` is returned unchanged.

    Args:
        name: Top-level key to look up in ``xpath.yml`` (the notebook asks
            for "cars_sedan").

    Returns:
        The value stored under ``name``. The notebook indexes it with string
        keys such as "url", "ad", "description", "price" and "model".

    Raises:
        FileNotFoundError: ``xpath.yml`` is not in the working directory.
        KeyError: ``name`` is not a top-level key of the file.
    """

    # Explicit UTF-8 so parsing does not depend on the platform's default
    # text encoding.
    with open("xpath.yml", "r", encoding="utf-8") as f:
        content:dict[str, dict[str, str]] = yaml.safe_load(f)
    return content[name]

In [172]:
def close_window(driver:object) -> None:
    """Close the ad window by clicking the link matched by a fixed XPath."""

    ad_button_xpath:str = "//a[@class='btn btn-subscription-light']"
    ad_button = driver.find_element(by=By.XPATH, value=ad_button_xpath)
    ad_button.click()

In [173]:
def choose_category(category:str, driver:object) -> None:
    """Click the element that the ``category`` XPath resolves to."""

    category_link = driver.find_element(by=By.XPATH, value=category)
    category_link.click()

In [174]:
def click_element(element:object, path:str) -> None:
    """Look up the node matching the ``path`` XPath via ``element`` and click it."""

    element.find_element(by=By.XPATH, value=path).click()

In [175]:
def extract_description(element:object, path:str) -> object:
    """Locate the ad-description node on the current document.

    Args:
        element: Driver or element to search from; must offer ``find_element``.
        path: XPath of the description node.

    Returns:
        The located element itself, not its text. The caller reads ``.text``
        from the result, so the return value is not a ``str``.

    Raises:
        Whatever ``element.find_element`` raises when nothing matches
        (``NoSuchElementException`` for a Selenium driver).
    """

    return element.find_element(by=By.XPATH, value=path)

In [176]:
def extract_prices(element:object, path:str) -> list[object]:
    """Collect every price node that matches ``path``.

    Args:
        element: Driver or element to search from; must offer ``find_elements``.
        path: XPath shared by the price nodes.

    Returns:
        The list returned by ``find_elements`` — the matched elements, not
        their text. Read ``.text`` on each item to get the price string.
    """

    return element.find_elements(by=By.XPATH, value=path)

In [None]:
def extract_models(element:object, path:str) -> list[object]:
    """Collect every model node that matches ``path``.

    Args:
        element: Driver or element to search from; must offer ``find_elements``.
        path: XPath shared by the model nodes.

    Returns:
        The list returned by ``find_elements`` — the matched elements, not
        their text. Read ``.text`` on each item to get the model name.
    """

    return element.find_elements(by=By.XPATH, value=path)

In [None]:
def persistence(dataframe:pd.DataFrame) -> None:
    """Write the scraped data to ``persistence/<Month>-<day>.csv``.

    The file name is today's date formatted with ``"%B-%d"`` (full month name
    and day of month). It carries no year or time, so a second run on the
    same day overwrites the earlier file.

    Args:
        dataframe: Table to save; it is written without the index column.
    """

    file_name = datetime.today().strftime("%B-%d")
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs("persistence", exist_ok=True)
    dataframe.to_csv(f"persistence/{file_name}.csv", index=False)

## Extraction 🧲

### Sedan cars:

In [226]:
# Timeout, in seconds, handed to WebDriverWait.
DELAY:int = 3

# Declared up front only; the description loop is what assigns it.
dynamic_delay:object

# Result lists filled by the extraction cells.
models:list[str] = []
prices:list[str] = []
descriptions:list[str] = []

In [178]:
# Load the "cars_sedan" section of xpath.yml: the page URL plus the XPaths used below.
sedan = resources("cars_sedan")

In [179]:
# Navigate the Edge session to the sedan listing URL.
edge.get(sedan["url"])

In [180]:
# Dismiss the ad window before working with the listing.
close_window(edge)

In [181]:
# Every node matching the ad XPath; the description loop only uses how many there are.
elements = edge.find_elements(by=By.XPATH, value=sedan["ad"])

Extract descriptions

In [182]:
# Rebuild the list on every run so re-executing this cell cannot append duplicates.
descriptions = []

# Open the first ad; the loop below then moves from ad to ad with the "next" control.
head = edge.find_element(by=By.XPATH, value=sedan["ad"])
head.click()
try:
    # `elements` only supplies the number of ads to visit, so the index is unused.
    for _ in range(len(elements)):
        # Wait up to DELAY seconds for the description node. `until` hands back
        # the located element, so a second lookup is unnecessary.
        dynamic_delay = WebDriverWait(edge, DELAY).until(
            EC.presence_of_element_located((By.XPATH, sedan["description"]))
        )
        descriptions.append(dynamic_delay.text)
        click_element(edge, sedan["next"])
except TimeoutException:
    # Best effort: keep what was collected and record the failure as a placeholder.
    # NOTE(review): the walk stops here, so `descriptions` can end up with a
    # different length than the model/price lists — confirm that is acceptable.
    descriptions.append("Web wasn't able to scrape descriptions")
except NoSuchElementException:
    # No "next" control to click (presumably the last ad): close the ad view.
    click_element(edge, sedan["exit"])

Extract prices

In [184]:
try:
    # Keep the WebElements under their own name so `prices` only ever holds
    # strings, even if reading `.text` fails part-way through.
    price_elements = edge.find_elements(by=By.XPATH, value=sedan["price"])
    prices = [price.text for price in price_elements]
except Exception:
    # Deliberately broad best-effort fallback: replace the result with a single
    # placeholder instead of appending to whatever `prices` held before.
    prices = ["Web wasn't able to scrape prices"]

Extract models

In [207]:
try:
    # Keep the WebElements under their own name so `models` only ever holds
    # strings, even if reading `.text` fails part-way through.
    model_elements = edge.find_elements(by=By.XPATH, value=sedan["model"])
    models = [model.text for model in model_elements]
except Exception:
    # Deliberately broad best-effort fallback: replace the result with a single
    # placeholder. The message now names models (it used to say "prices").
    models = ["Web wasn't able to scrape models"]

## Processing ⚙

In [209]:
# Drop the "Blue Book Encuentra24" entry that the model XPath also matches
# (presumably a site banner rather than a car — confirm). Filtering into a new
# list keeps this cell safe to re-run and does not fail when the entry is absent.
models = [model for model in models if model != "Blue Book Encuentra24"]

In [211]:
# One row per ad, with columns in the order model, price, description.
# pandas raises ValueError if the three lists differ in length.
df = pd.DataFrame(dict(model=models, price=prices, description=descriptions))

In [225]:
# Display the assembled table.
df

Unnamed: 0,model,price,description
0,Nissan Sentra,"$9,850",Garantía: Como es visto no hay garantía\nFinan...
1,Kia Forte,"$9,000",Garantía: Como es visto no hay garantía\nFinan...
2,Honda City,"$8,000\n(Rebajado 4%)",Financiamiento: no\nPrecio negociable: si\nCol...
3,Toyota Corolla,"$10,800",Financiamiento: no\nPrecio negociable: si\nDES...
4,Daihatsu Rocky,"$5,000",Color : Rojo y Negro\nSeñor Guzmán contacto
5,Mazda 3,"$5,800\n(Rebajado 2%)",Precio negociable: si\nVendo mazda 3 año 2011 ...
6,Chevrolet Spark,"$7,350",Garantía: Como es visto no hay garantía\nFinan...
7,Honda Civic,"$15,000",Financiamiento: no\nPrecio negociable: si\nCol...
8,Mercedes Benz 230,"$8,500","Color : Plata Iridio Metalizado\nDe agencia, p..."
9,Mitsubishi Mirage,"$3,300",Garantía: Como es visto no hay garantía\nPreci...
