## Dependencies

---

In [10]:
import functools
import os
import re
from collections.abc import Callable

import pandas as pd
# %run "./Extraction.ipynb"

## Transformation

---

### Functions:

In [None]:
def replace_datapoint(value:str, replacement:any, datalist: list[any]) -> None:
    """
    Overwrite, in place, the first element of a list that equals a known value.
    Handy for patching the kilometers list bug.

    When the value is not present the list is left untouched and a message is printed.

    :param value: value of the datapoint to look for.
    :param replacement: value written at the position where ``value`` was found.
    :param datalist: list that is modified in place.
    """
    try:
        position = datalist.index(value)
    except ValueError:
        # `list.index` signals a missing value with ValueError; report it and move on.
        print("That values doesn't exists")
    else:
        datalist[position] = replacement


In [None]:
def fix_tuples(csv:str, index:int) -> pd.DataFrame:
    """
    Realign displaced tuples of a csv file stored in the ``persistence`` directory.

    The first value of every column except ``description`` is discarded, and the
    ``description`` entry at position ``index`` is discarded, so all columns end
    up with the same length again.

    :param csv: name of the file to fix (without extension).
    :param index: position in the description column of the entry to delete.
    :return: new dataframe built from the realigned columns.
    """
    shifted_names = ("model", "year", "kilometers", "engine", "price")

    source = pd.read_csv(f"persistence/{csv}.csv")

    # Copy every column into a plain list before anything is removed.
    shifted = {name: list(source[name]) for name in shifted_names}
    descriptions = list(source["description"])

    # Drop the leading value of each shifted column ...
    for values in shifted.values():
        values.pop(0)
    # ... and the single description entry the caller pointed at.
    descriptions.pop(index)

    return pd.DataFrame({**shifted, "description": descriptions})


In [71]:
def group_ds(func: Callable[..., list[str]]) -> Callable[..., list[pd.DataFrame]]:
    """
    Decorator to transform the list of file names into a list of dataframes.

    The decorated function is called with the caller's positional and keyword
    arguments and must return file names located in the ``persistence``
    directory. Each file is read with ``pd.read_csv`` and the resulting frames
    are returned in the same order as the names.

    :param func: function to decorate; it returns the file names to load.
    :return: wrapper that accepts the same arguments as ``func`` and returns a list of dataframes.
    """
    @functools.wraps(func)  # keep the decorated function's name and docstring
    def wrapper(*args, **kwargs) -> list[pd.DataFrame]:
        # Keyword arguments are forwarded too, so calls such as
        # ``list_ds(month="April")`` reach the decorated function.
        return [pd.read_csv(f"persistence/{file}") for file in func(*args, **kwargs)]
    return wrapper

In [72]:
@group_ds
def list_ds(month:str) -> list[str]:
    """
    List files in persistence directory by month.

    The undecorated function returns file names; because it is wrapped by
    ``group_ds``, callers actually receive those files loaded as dataframes.

    :param month: dataset prefix.
    :return: names of the files in ``persistence`` that start with ``month``, sorted by name.
    """
    # `os.listdir` returns entries in arbitrary order; sorting keeps positional
    # access and the concatenation order reproducible across runs and machines.
    return sorted(file for file in os.listdir("persistence") if file.startswith(month))


### Messy datapoints:

Some ads don't have all of their fields filled in. When a field is empty, the scraper skips it and stores the next field's value in its place — for example, the `engine` value ends up inside `kilometers` because the kilometers field was empty and was skipped.

In [None]:
# Load the April-08 dataset from the persistence directory.
# Note: `frame` is reassigned by later cells.
frame = pd.read_csv("persistence/April-08.csv")

### Displaced tuples:

Because of the structure of some ads and the way the data was extracted, a displacement occurs for an ad that wasn't extracted as it should be. This unwanted behavior affects the `description` values, so I have to remove some unusable datapoints to keep `model` and `description` coherent.

In [None]:
# Rebuild the April-08 frame with its displaced tuples fixed. `fix_tuples`
# re-reads the CSV itself, so the result does not depend on the `frame` loaded
# earlier. 33 is the position in the `description` column that gets deleted.
frame = fix_tuples("April-08", 33)

In [None]:
# Show the last rows of `frame` to check the result of the fix.
frame.tail()

In [None]:
# Persist the fixed frame without writing the row index as a column.
# NOTE(review): `frame` was built from "April-08" in the cells above, but it is
# written to "March-21.csv" — confirm the target file name is intended.
frame.to_csv("persistence/March-21.csv", index=False)

### Unique ID:

### Concatenation:

In [53]:
# `list_ds` is wrapped by `group_ds`, so it already returns the loaded
# dataframes rather than file names; passing each item to `pd.read_csv` again
# would fail on a fresh kernel.
april = list_ds("April")

In [62]:
# Stack every April dataframe into a single frame. Each input keeps its own
# row labels, so the combined index can contain repeated values.
concatenation = pd.concat(april)

In [66]:
# Remove rows that are exact duplicates, then renumber the rows.
# `reset_index()` without `drop=True` keeps the old labels as a new "index"
# column; that column is removed in the following cell.
concatenation = (
    concatenation
    .drop_duplicates()
    .reset_index()
)

In [67]:
# Remove the "index" column left behind by `reset_index()`. `errors="ignore"`
# makes the cell safe to re-run: once the column is gone, a second execution is
# a no-op instead of raising KeyError.
concatenation = concatenation.drop(columns="index", errors="ignore")

In [68]:
# Display the concatenated, de-duplicated April data.
concatenation

Unnamed: 0,model,year,kilometers,engine,price,description
0,Nissan Versa,2017,78000,Gasolina,7400,Color : Azul DESCRIPCIÓN CALIFICACIONES SEGURI...
1,Kia Forte,2016,34000,Gasolina,6900,Financiamiento: si DESCRIPCIÓN CALIFICACIONES ...
2,Kia Rio,2020,56000,Gasolina,10500,Color : Gris Se vende Kia Río 2020. Automático...
3,Mitsubishi Lancer,2015,97783,Gasolina,6975,Garantía: Como es visto no hay garantía Financ...
4,Mitsubishi Mirage G4,2018,60000,Gasolina,8500,Color : Blanco Se vende Mitsubishi G4 2018. Au...
...,...,...,...,...,...,...
62,Chevrolet Aveo,2017,43000,Gasolina,14300,Color : Vino metálico Mantenimiento al día en ...
63,Volkswagen Golf,1996,200000,Gasolina,2000,Garantía: Como es visto no hay garantía Financ...
64,Nissan Sentra,2017,86000,Gasolina,7900,Precio negociable: si Color : gris DESCRIPCIÓN...
65,Nissan Sentra,2002,0,Gasolina,4500,DESCRIPCIÓN CALIFICACIONES SEGURIDAD Nissan se...


In [77]:
# Peek at one of the loaded April dataframes.
# NOTE(review): position 2 depends on the order in which `list_ds` returns the
# files — selecting the dataset by file name would be more explicit.
list_ds("April")[2]

Unnamed: 0,model,year,kilometers,engine,price,description
0,Nissan Versa,2017,78000,Gasolina,7400,Color : Azul DESCRIPCIÓN CALIFICACIONES SEGURI...
1,Kia Forte,2016,34000,Gasolina,6900,Financiamiento: si DESCRIPCIÓN CALIFICACIONES ...
2,Kia Rio,2020,56000,Gasolina,10500,Color : Gris Se vende Kia Río 2020. Automático...
3,Mitsubishi Lancer,2015,97783,Gasolina,6975,Garantía: Como es visto no hay garantía Financ...
4,Mitsubishi Mirage G4,2018,60000,Gasolina,8500,Color : Blanco Se vende Mitsubishi G4 2018. Au...
5,Nissan Sentra,2017,87000,Gasolina,8500,Precio negociable: si Color : gris DESCRIPCIÓN...
6,Toyota Yaris,2008,125000,Gasolina,7600,Financiamiento: no Precio negociable: si Color...
7,Nissan Sentra,2015,60000,Gasolina,9400,Precio negociable: si Color : Negro DESCRIPCIÓ...
8,Toyota Corolla,2004,190000,Gasolina,6600,Financiamiento: no Color : Beige DESCRIPCIÓN C...
9,Volkswagen Jetta,2002,256000,Gasolina,4350,Garantía: Como es visto no hay garantía Financ...
