### Dependencies:

In [None]:
import yaml
from datetime import datetime
import pandas as pd

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [None]:
# Create the Edge WebDriver used by every scraping cell, pointing it at the
# driver binary stored under ./utils.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object — confirm against the installed Selenium version.
edge:object = webdriver.Edge(executable_path="./utils/msedgedriver.exe")

### Functions:

In [None]:
def resources(name: str) -> dict[str, str]:
    """Return the entry stored under `name` in the xpath.yml file.

    Args:
        name: Top-level key of xpath.yml to look up (e.g. "cars_sedan").

    Returns:
        The mapping stored under that key. Callers in this notebook index it
        with string keys such as "url" or "price"; the values are presumably
        a URL and XPath strings (verify against xpath.yml).

    Raises:
        FileNotFoundError: If xpath.yml is not in the working directory.
        KeyError: If `name` is not a top-level key of the file.
    """

    # YAML is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open("xpath.yml", "r", encoding="utf-8") as f:
        content: dict[str, dict[str, str]] = yaml.safe_load(f)
    return content[name]

In [None]:
def close_prompt(driver: object) -> None:
    """Close the ad prompt by clicking its `btn btn-subscription-light` link."""

    subscription_link = driver.find_element(
        by=By.XPATH,
        value="//a[@class='btn btn-subscription-light']",
    )
    subscription_link.click()

In [None]:
def choose_category(category: str, driver: object) -> None:
    """Click the element that the `category` XPath selects on `driver`."""

    target = driver.find_element(by=By.XPATH, value=category)
    target.click()

In [None]:
def click_element(element: object, path: str) -> None:
    """Find the node matching the `path` XPath under `element` and click it."""

    element.find_element(by=By.XPATH, value=path).click()

In [None]:
def extract_description(element: object, path: str) -> object:
    """Locate the ad description node on the current document.

    Args:
        element: Driver or element exposing `find_element`.
        path: XPath expression selecting the description node.

    Returns:
        The located element itself, not its text; the caller reads `.text`
        from the returned object.
    """

    return element.find_element(by=By.XPATH, value=path)

In [None]:
def extract_prices(element: object, path: str) -> list[str]:
    """Collect the text of every price node matching `path`.

    Args:
        element: Driver or element exposing `find_elements`.
        path: XPath expression selecting the price nodes.

    Returns:
        The `.text` of each match, in the order `find_elements` returned
        them. If the lookup or the text access raises, a one-item list
        holding a failure message is returned instead.
    """

    try:
        matches = element.find_elements(by=By.XPATH, value=path)
        return [match.text for match in matches]
    except Exception:
        # Best-effort scrape: report the failure in-band instead of aborting
        # the notebook. A fresh list is returned because no local list exists
        # yet when the lookup itself fails.
        return ["Web wasn't able to scrape prices"]


In [None]:
def extract_models(element: object, path: str) -> list[str]:
    """Collect the text of every model node matching `path`.

    Args:
        element: Driver or element exposing `find_elements`.
        path: XPath expression selecting the model nodes.

    Returns:
        The `.text` of each match, in the order `find_elements` returned
        them. If the lookup or the text access raises, a one-item list
        holding a failure message is returned instead.
    """

    try:
        matches = element.find_elements(by=By.XPATH, value=path)
        return [match.text for match in matches]
    except Exception:
        # Best-effort scrape: report the failure in-band instead of aborting
        # the notebook. A fresh list is returned because no local list exists
        # yet when the lookup itself fails.
        return ["Web wasn't able to scrape models"]


In [None]:
def persistence(dataframe: pd.DataFrame) -> None:
    """Write the scraped data to persistence/<Month>-<day>.csv.

    The file name is built from today's date with the "%B-%d" format and the
    index is not written. The `persistence` directory is created when it is
    missing so a finished scrape is not lost to a missing folder.

    Args:
        dataframe: Data to save.
    """
    from pathlib import Path

    out_dir = Path("persistence")
    out_dir.mkdir(parents=True, exist_ok=True)

    file_name = datetime.today().strftime("%B-%d")
    dataframe.to_csv(out_dir / f"{file_name}.csv", index=False)

In [None]:
def parse_prices(dataframe: pd.DataFrame, column: str) -> None:
    """Cast the string prices in `column` to numbers, in place.

    A value is parsed when it starts with "$" followed by digits with
    optional comma-separated groups (for example "$950", "$12,500" or
    "$1,234,567"); anything after that number, such as a newline and extra
    text, is ignored.

    Args:
        dataframe: Frame whose `column` holds the price strings; it is
            modified in place.
        column: Name of the column to convert.

    Returns:
        None. Rows that do not match the pattern become NaN, which turns the
        column into a float column instead of raising.
    """

    price_rg: str = r"^\$(?P<prices>\d+(?:,\d+)*)"

    # expand=False yields a Series, so the string methods and the final
    # assignment operate on a single column rather than a one-column frame.
    digits = (
        dataframe[column]
        .str.extract(price_rg, expand=False)
        .str.replace(",", "", regex=False)
    )
    dataframe[column] = pd.to_numeric(digits)


## Extraction 🧲

### Sedan:

Variables pool.

In [None]:
# Seconds to wait for an element before giving up.
DELAY: int = 3

# Accumulators for the scraped values.
descriptions: list[str] = []
models: list[str] = []
prices: list[str] = []

In [None]:
# Load the "cars_sedan" entry from xpath.yml; the cells below read its
# "url", "ad", "description", "next", "exit", "price" and "model" keys.
sedan = resources("cars_sedan")

Get web & close initial prompt.

In [None]:
# Navigate to the sedan listing URL, then close the initial prompt.
edge.get(sedan["url"])
close_prompt(edge)

In [None]:
# Collect every element matched by the "ad" XPath on the current page.
elements = edge.find_elements(by=By.XPATH, value=sedan["ad"])

Extract descriptions.

In [None]:
# Open the first ad, then walk through the ads with the "next" button,
# reading one description per ad. Nothing is done when no ads were found,
# which avoids an IndexError on `elements[0]`.
if elements:
    elements[0].click()
    try:
        for _ in elements:
            # Wait up to DELAY seconds for the description node to be present;
            # a TimeoutException ends the loop.
            WebDriverWait(edge, DELAY).until(
                EC.presence_of_element_located((By.XPATH, sedan["description"]))
            )
            description = extract_description(edge, sedan["description"])
            descriptions.append(description.text)
            click_element(edge, sedan["next"])
    except TimeoutException:
        descriptions.append("Web wasn't able to scrape descriptions")
    except NoSuchElementException:
        # A lookup in the loop found nothing (presumably the missing "next"
        # button on the last ad — confirm), so leave the ad view.
        click_element(edge, sedan["exit"])

Extract prices & models.

In [None]:
prices = extract_prices(edge, sedan["price"])
models = extract_models(edge, sedan["model"])
# Drop the "Blue Book Encuentra24" ad entry from the models when it is
# present; the membership check avoids a ValueError when it is not.
if "Blue Book Encuentra24" in models:
    models.remove("Blue Book Encuentra24")

## Processing ⚙

In [None]:
# Work with previously persisted data: load the snapshot saved at
# ./persistence/February-10.csv into `df`.
df = pd.read_csv("./persistence/February-10.csv")

Remove invasive ad from list document.

### Price column:

Change price dtype from str -> int

In [None]:
# Convert the price strings in `df` to numbers in place.
# NOTE(review): the column name is assumed to be "price" — confirm it against
# the CSV header before running.
parse_prices(df, "price")

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()