## Init 🍄

---

### Dependencies:

In [1]:
import yaml
import re
from datetime import datetime
import pandas as pd

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [4]:
# The driver binary is downloaded by webdriver_manager rather than read from a
# local ./utils/msedgedriver.exe.
driver_path = EdgeChromiumDriverManager().install()
edge = webdriver.Edge(service=Service(driver_path))
edge.maximize_window()

[WDM] - Downloading: 100%|██████████| 8.15M/8.15M [00:03<00:00, 2.22MB/s]


### Functions:

In [5]:
def resources(name: str) -> dict[str, str]:
    """
    Read xpath.yml and return the entry stored under one of its top-level keys.
    :param name: top-level key of xpath.yml to look up
    :return: the value stored under that key (callers index it by names such as "url" or "ad")
    :raises KeyError: when xpath.yml has no such top-level key
    """
    with open("xpath.yml", "r") as f:
        content: dict[str, dict[str, str]] = yaml.safe_load(f)
    # The file is fully parsed at this point, so the lookup can happen after it is closed.
    return content[name]

In [6]:
def close_prompt(driver:object) -> None:
    """Close the ad window by clicking the link matched by a fixed xpath"""

    driver.find_element(
        by=By.XPATH,
        value="//a[@class='btn btn-subscription-light']",
    ).click()

In [7]:
def choose_category(category:str, driver:object) -> None:
    """Click the ad category located by the given xpath"""

    category_node = driver.find_element(by=By.XPATH, value=category)
    category_node.click()

In [8]:
def click_element(element:object, path:str) -> None:
    """Find the node matching the xpath under `element` and click it"""

    element.find_element(by=By.XPATH, value=path).click()

In [9]:
def extract_description(element: object, path: str) -> object:
    """
    Locate the ad description node on the current document.
    :param element: driver or element on which find_element is called
    :param path: xpath of the description node
    :return: the located element itself, not its text; callers read its `.text`
    """
    return element.find_element(by=By.XPATH, value=path)

In [10]:
def extract_feature(element: object, path: str) -> list[str]:
    """
    Extract the text of every node matching an xpath.
    :param element: driver or element on which find_elements is called
    :param path: xpath selecting the nodes of one feature
    :return: the `.text` of each matched node, or a single-item list holding
        an error message when the lookup fails
    """
    try:
        nodes = element.find_elements(by=By.XPATH, value=path)
        return [node.text for node in nodes]
    except Exception:
        # Deliberate best-effort: any scraping failure yields a placeholder.
        # The list is returned explicitly so callers never receive None.
        return ["Web wasn't able to scrape the feature"]

In [11]:
def persistence(dataframe:pd.DataFrame) -> None:
    """Write the scraped data to persistence/<month name>-<day>.csv without the index"""

    stamp = f"{datetime.today():%B-%d}"
    target = "persistence/" + stamp + ".csv"
    dataframe.to_csv(target, index=False)

In [12]:
def get_len(features: list[list[str]]) -> list[int]:
    """
    Report the lengths of the feature lists that disagree with the first one.
    :param features: list of features extracted with scraping
    :return: the length of the first feature, followed by the length of every
        feature whose length differs from it; an empty list when no features
        are given. A single-item result means every feature has the same length.
    """
    if not features:
        return []

    base_len = len(features[0])
    mismatches = [len(feature) for feature in features if len(feature) != base_len]
    return [base_len, *mismatches]

In [13]:
def check_len(lengths: list[int]) -> bool:
    """
    Check if all elements in the list have the same value.
    :param lengths: list of lengths
    :return: True when every entry equals the first one (also for an empty or
        single-item list), False otherwise
    """
    # Every entry after the first is compared; lengths[0] is never evaluated
    # when there is nothing to compare, so short lists are safe.
    return all(length == lengths[0] for length in lengths[1:])

## Extraction 🧲

----

### Sedan:

Variables pool.

In [14]:
# Containers for the scraped features.
descriptions: list[str] = []
prices: list[int] = []
models: list[str] = []
years: list[int] = []
kilometers: list[int] = []
engines: list[str] = []

DELAY: int = 1  # seconds

In [15]:
# Load the "cars_sedan" entry of xpath.yml; it is indexed below by keys such as "url" and "ad".
sedan = resources("cars_sedan")

Open the web page & close the initial prompt.

In [16]:
# Navigate the Edge session to the url stored for the sedan category.
edge.get(sedan["url"])

In [17]:
# Close the ad window before interacting with the page.
close_prompt(edge)

In [18]:
# Collect every node matched by the "ad" xpath.
elements = edge.find_elements(by=By.XPATH, value=sedan["ad"])

Extract descriptions.

In [19]:
# Open the first ad, then step through the ads with the "next" control,
# collecting one description per iteration.
elements[0].click()
try:
    for _ in elements:
        # Wait up to DELAY seconds for the description node to be present.
        WebDriverWait(edge, DELAY).until(
            EC.presence_of_element_located((By.XPATH, sedan["description"]))
        )
        temp = extract_description(edge, sedan["description"])
        descriptions.append(temp.text)
        click_element(edge, sedan["next"])
except TimeoutException:
    # The description never showed up in time: record a placeholder and stop.
    descriptions.append("Web wasn't able to scrape descriptions")
except NoSuchElementException:
    # A node could not be found (presumably the "next" control on the last
    # ad - verify): leave the detail view through the "exit" control.
    click_element(edge, sedan["exit"])

Extract prices & models.

In [20]:
# Each call returns the text of every node matched by the given xpath.
prices = extract_feature(edge, sedan["price"])
models = extract_feature(edge, sedan["model"])

Extract car year, engine & kilometers.

In [21]:
# Same extraction for the remaining features; values are still raw strings here.
years = extract_feature(edge, sedan["year"])
engines = extract_feature(edge, sedan["engine"])
kilometers = extract_feature(edge, sedan["kilometers"])

In [22]:
# NOTE: despite its name, `lengths` holds the feature lists themselves here.
# get_len reports the first list's length followed by every length that differs from it.
lengths = [prices, models, years, engines, kilometers, descriptions]
get_len(lengths)

[42, 43, 43, 41, 41]

## Cleaning 🤿

### Functions:

In [23]:
def parse_price(dataframe: pd.DataFrame, feature: str, pattern: str) -> None:
    """
    Parse financial features like prices, in place. Works with values shaped
    like $#,###, optionally followed by a newline and extra text.
    :param dataframe: object towards apply transformation (modified in place)
    :param feature: feature name towards apply transformation
    :param pattern: regex with a single capture group holding the number
    :raises ValueError: when a value does not match the pattern, since a
        missing value cannot be converted to an integer
    """
    # expand=False returns a Series for a single capture group, so the column
    # can be cleaned with vectorised string methods instead of the deprecated
    # DataFrame.applymap.
    captured = dataframe[feature].str.extract(pattern, expand=False)
    dataframe[feature] = captured.str.replace(",", "", regex=False).astype("int64")


In [24]:
def remove_chars(dataframe: pd.DataFrame, feature: str, wildcards: str) -> pd.Series:
    """
    Remove escaped characters from features like indented large extracts of text.
    Every match of the pattern is replaced by a single space; the dataframe
    itself is left untouched.
    :param dataframe: object towards apply transformation
    :param feature: feature name towards apply transformation
    :param wildcards: regex of the escaped characters to remove
    :return: Series without escaped characters, indexed like the dataframe
    """
    # Compile once and work on the column directly; a row-wise apply is not
    # needed just to select one column.
    matcher = re.compile(wildcards)
    return dataframe[feature].map(lambda text: matcher.sub(" ", text))


In [86]:
def replace_datapoint(value: object, replacement: object, datalist: list[object]) -> None:
    """
    Replace a datapoint if we know the value of datapoint to replace. Useful for kilometers list bug.
    Only the first occurrence is replaced; the list is modified in place.
    :param value: value of datapoint to remove
    :param replacement: value of datapoint to insert
    :param datalist: list where the operation will be applied
    """
    try:
        position = datalist.index(value)
    except ValueError:
        # Best-effort: report the miss and leave the list untouched.
        print("That values doesn't exists")
    else:
        datalist[position] = replacement

In [25]:
# Price: a leading "$" followed by digits with any number of comma-separated
# groups, so "$900", "$12,000" and "$1,250,000" are all captured whole.
PRICE_RG: str = r"^\$(?P<prices>\d+(?:,\d+)*)"
# Kilometers: the run of digits at the start of the value.
KM_REG: str = r"(?P<km>^\d+)"
# One or more consecutive newline / carriage-return characters.
CHARACTER_RG: str = r"(\n|\r)+"

---

Remove noise elements from data list

In [26]:
# engines.append("Gasolina")

In [27]:
# Drop the first "Ver más" entry, which is not a year (years are converted with int later).
# list.remove raises ValueError when the entry is absent.
years.remove("Ver más")

In [28]:
# Drop the first "Blue Book Encuentra24" entry (ValueError if absent);
# presumably page text rather than a car model - verify against the site.
models.remove("Blue Book Encuentra24")

In [29]:
# kilometers.remove("Gasolina")
# kilometers.insert(-2, "0")

In [88]:
# # engines.pop(-1)

12

Create dataframe to process data extracted

In [68]:
# Length of each feature list, in the order checked by check_len below.
lengths = [
    len(feature)
    for feature in (engines, models, years, kilometers, prices, descriptions)
]
lengths

[42, 42, 42, 42, 42, 42]

This is necessary when an announcement has no description or engine: the missing values are inserted by hand so that every list keeps the same length.

In [43]:
# descriptions.insert(-15, "No description")

In [33]:
# descriptions.append("No description")

In [76]:
# Spot-check the entry seven positions from the end (index -7).
kilometers[-7]

'0 km'

In [77]:
# Build, clean and export the dataframe only when every feature list has the
# same length; otherwise the columns could not be aligned row by row.
if check_len(lengths):
    df_temp = pd.DataFrame({
        "model": models,
        "year": years,
        "kilometers": kilometers,
        "engine": engines,
        "price": prices,
        "description": descriptions
    })

    # Parse features. String to number and raw string with characters to structured string.
    parse_price(df_temp, "price", PRICE_RG)
    # expand=False yields a Series for the single "km" group, which avoids the
    # deprecated DataFrame.applymap.
    df_temp["kilometers"] = (
        df_temp["kilometers"].str.extract(KM_REG, expand=False).astype("int64")
    )
    df_temp["year"] = df_temp["year"].map(int)
    df_temp["description"] = remove_chars(df_temp, "description", CHARACTER_RG)

    # Export dataframe in .csv format
    persistence(df_temp)
else:
    # Make the skipped export visible instead of silently doing nothing.
    print(f"Feature lists differ in length, dataframe not built: {lengths}")

In [78]:
# Display the cleaned dataframe built in the previous cell.
df_temp

Unnamed: 0,model,year,kilometers,engine,price,description
0,Mitsubishi Mirage G4,2021,3000,Gasolina,12000,Precio negociable: si Color : Negro perlado CO...
1,Toyota Corolla,2018,39000,Gasolina,11500,Color : Azul DESCRIPCIÓN CALIFICACIONES SEGURI...
2,Nissan Versa,2017,78000,Gasolina,7400,Financiamiento: si DESCRIPCIÓN CALIFICACIONES ...
3,Kia Forte,2016,34000,Gasolina,6900,Garantía: Como es visto no hay garantía Color ...
4,Nissan Sentra,2015,89300,Gasolina,7600,Garantía: Como es visto no hay garantía Financ...
5,Nissan Sentra,2015,112646,Gasolina,5975,Garantía: Como es visto no hay garantía Financ...
6,Chevrolet Spark,2020,83903,Gasolina,7350,Precio negociable: si Color : Gris DESCRIPCIÓN...
7,Suzuki Aerio,2003,150000,Gasolina,3000,Color : GRIS Nissan Altima 2013 Full Extras Au...
8,Nissan ALTIMA,2013,110065,Gasolina,6500,Precio negociable: si Color : Gris En venta Hy...
9,Hyundai Elantra,2018,22500,Gasolina,10200,Color : Negro DESCRIPCIÓN CALIFICACIONES SEGUR...
