### Dependencies:

In [None]:
import yaml
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [None]:
# Selenium 4 receives the driver binary through a Service object; the
# `executable_path` keyword of webdriver.Edge is deprecated in 4.x and
# removed in later releases.
from selenium.webdriver.edge.service import Service

edge:object = webdriver.Edge(service=Service(executable_path="./utils/msedgedriver.exe"))

### Functions:

In [None]:
def resources(name:str) -> dict[str, str]:
    """Return the entry called ``name`` from the ``xpath.yml`` file.

    Args:
        name: Top-level key of the YAML document to look up.

    Returns:
        The value stored under ``name``. The notebook reads it as a mapping
        from short labels (``"url"``, ``"price"``, ...) to URL/XPath strings.

    Raises:
        FileNotFoundError: If ``xpath.yml`` is not in the working directory.
        KeyError: If the document has no ``name`` key.
    """

    # Decode as UTF-8 explicitly so non-ASCII text inside the selectors does
    # not depend on the platform's default encoding.
    with open("xpath.yml", "r", encoding="utf-8") as f:
        content:dict[str, dict[str, str]] = yaml.safe_load(f)

    return content[name]

In [None]:
def close_prompt(driver:object) -> None:
    """Dismiss the ad window by clicking its button.

    The button is the anchor whose ``class`` attribute is exactly
    ``btn btn-subscription-light``.
    """

    prompt_button = driver.find_element(
        by=By.XPATH,
        value="//a[@class='btn btn-subscription-light']",
    )
    prompt_button.click()

In [None]:
def choose_category(category:str, driver:object) -> None:
    """Click the ad category located by the ``category`` XPath."""

    category_link = driver.find_element(by=By.XPATH, value=category)
    category_link.click()

In [None]:
def click_element(element:object, path:str) -> None:
    """Look up the XPath ``path`` from ``element`` and click the match."""

    element.find_element(by=By.XPATH, value=path).click()

In [None]:
def extract_description(element:object, path:str) -> object:
    """Locate the ad description node on the current document.

    Args:
        element: Driver (or element) to search from.
        path: XPath of the description node.

    Returns:
        The located element itself, not its text; callers read ``.text``
        from the result.

    Raises:
        NoSuchElementException: If nothing matches ``path``.
    """

    return element.find_element(by=By.XPATH, value=path)

In [None]:
def extract_feature(element:object, path:str) -> list[str]:
    """Collect the text of every node matching the XPath ``path``.

    Args:
        element: Driver (or element) to search from.
        path: XPath selecting the nodes that hold the feature.

    Returns:
        The ``.text`` of each match, or a one-item list holding a failure
        message when the lookup raises.
    """

    try:
        matches = element.find_elements(by=By.XPATH, value=path)
        return [data_point.text for data_point in matches]
    except Exception:
        # Deliberate best effort: a failed scrape yields a placeholder list
        # instead of None, so callers always receive a list.
        return ["Web wasn't able to scrape the feature"]



In [None]:
def persistence(dataframe:pd.DataFrame) -> None:
    """Write ``dataframe`` to ``persistence/<Month>-<day>.csv``.

    The file name comes from today's date formatted with ``%B-%d``, so a
    second run on the same day overwrites the earlier file. The index is
    not written.

    Args:
        dataframe: Scraped data to save.
    """
    from pathlib import Path

    file_name = datetime.today().strftime("%B-%d")

    # to_csv does not create missing folders, so make sure the target exists.
    target_dir = Path("persistence")
    target_dir.mkdir(parents=True, exist_ok=True)

    dataframe.to_csv(target_dir / f"{file_name}.csv", index=False)

In [None]:
def parse_price(dataframe: pd.DataFrame, column:str, pattern:str) -> None:
    r"""Replace the price strings in ``column`` with integers, in place.

    Works with values shaped like ``$#,###`` or ``$#,###`` followed by a
    newline and further text, as long as ``pattern`` captures the amount.

    Args:
        dataframe: Frame to modify.
        column: Name of the column holding the price text.
        pattern: Regular expression whose first capture group is the amount
            (digits with optional ``,`` separators).

    Raises:
        ValueError: If any value in ``column`` yields no capture.
    """

    # Keep only the first capture group as a Series; this avoids the
    # element-wise DataFrame.applymap, which pandas has deprecated.
    captured = dataframe[column].str.extract(pattern, expand=True).iloc[:, 0]

    if captured.isna().any():
        raise ValueError(
            f"column {column!r} has values that do not match {pattern!r}"
        )

    dataframe[column] = captured.str.replace(",", "", regex=False).astype("int64")


## Extraction 🧲

### Sedan:

Pool of variables that collect the scraped data.

In [None]:
# Accumulators for the scraped features; they later become the columns of
# the DataFrame. Every list holds the raw text read from the page -- the
# numeric columns are cast to int only after the DataFrame is built.
descriptions:list[str] = []
prices:list[str] = []
models:list[str] = []
years:list[str] = []
kilometers:list[str] = []
engines:list[str] = []

# Timeout, in seconds, passed to WebDriverWait.
DELAY:int = 3

In [None]:
# Load the "cars_sedan" entry of xpath.yml (the URL and XPaths used below).
sedan = resources("cars_sedan")

Open the website and close the initial prompt.

In [None]:
# Navigate the Edge session to the configured sedan URL.
edge.get(sedan["url"])

In [None]:
# close_prompt(edge)

In [None]:
# Collect every element on the loaded page that matches the "ad" XPath.
elements = edge.find_elements(by=By.XPATH, value=sedan["ad"])

Extract descriptions.

In [None]:
# Open the first ad, then read one description per ad found on the page,
# moving on with the "next" control after each one.
elements[0].click()
try:
    for _ad in elements:
        # The wait returns the element it located, so no second lookup is needed.
        description = WebDriverWait(edge, DELAY).until(
            EC.presence_of_element_located((By.XPATH, sedan["description"]))
        )
        descriptions.append(description.text)
        click_element(edge, sedan["next"])
except TimeoutException:
    # The description did not appear within DELAY seconds: record a
    # placeholder and stop stepping through the ads.
    descriptions.append("Web wasn't able to scrape descriptions")
except NoSuchElementException:
    # The "next" control could not be found: leave through the "exit" control.
    click_element(edge, sedan["exit"])

Extract prices & models.

In [None]:
# Scrape prices first, then models (same order of page lookups as before).
prices, models = (
    extract_feature(edge, sedan[label]) for label in ("price", "model")
)
# NOTE(review): this entry is presumably a non-car label that also matches
# the model XPath -- confirm against the page.
models.remove("Blue Book Encuentra24")

Extract car year, engine & kilometers.

In [None]:
# One lookup per feature, performed in the order year -> engine -> kilometers.
years, engines, kilometers = (
    extract_feature(edge, sedan[label])
    for label in ("year", "engine", "kilometers")
)

## Transformation ⚙

One ad is missing its kilometer value, so the scraped kilometers list ends up containing an engine type in the place where that kilometer value should be.

In [None]:
# years.remove("Ver más")
# engines.append("Gasolina")
# kilometers.remove("Gasolina")
# kilometers.insert(-2,"0")

Temporary DataFrame used to apply the transformations; afterwards it is saved as a CSV file.

In [None]:
# Assemble the scraped lists into one table, one column per feature.
# pandas requires all the lists to have the same length.
df_temp = pd.DataFrame(
    dict(
        model=models,
        year=years,
        kilometers=kilometers,
        engine=engines,
        price=prices,
        description=descriptions,
    )
)

### Parse columns:

Change column dtype from str -> int

In [None]:
# "$" followed by digits with any number of ",###" groups, so "$900",
# "$8,500" and "$1,200,000" are all captured in full; whatever follows the
# amount is ignored.
price_rg:str = r"^\$(?P<prices>\d+(?:,\d+)*)"
parse_price(df_temp, "price", price_rg)

In [None]:
# Capture the run of digits at the start of the kilometers text.
km_rg: str = r"(?P<km>^\d+)"
# expand=False yields a Series, so the cast is a plain Series.map instead of
# the deprecated element-wise DataFrame.applymap.
df_temp["kilometers"] = df_temp["kilometers"].str.extract(km_rg, expand=False).map(int)

In [None]:
# Cast each scraped year string to int; a value int() cannot parse raises ValueError.
df_temp["year"] = df_temp["year"].map(int)

In [209]:
# Save the transformed frame as persistence/<Month>-<day>.csv.
persistence(df_temp)

### Stats:

Basic information about new extraction stored in df.

In [208]:
# Print column names, non-null counts and dtypes to verify the casts above.
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        45 non-null     object
 1   year         45 non-null     int64 
 2   kilometers   45 non-null     int64 
 3   engine       45 non-null     object
 4   price        45 non-null     int64 
 5   description  45 non-null     object
dtypes: int64(3), object(3)
memory usage: 2.2+ KB


In [206]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_temp.describe()

Unnamed: 0,year,kilometers,price
count,45.0,45.0,45.0
mean,2013.622222,87042.177778,8170.0
std,5.609606,60524.943627,2799.617466
min,1992.0,0.0,2900.0
25%,2011.0,46000.0,6500.0
50%,2016.0,81570.0,8150.0
75%,2017.0,127580.0,10000.0
max,2021.0,262634.0,15000.0
