## 3. Data Pre-Processing
📒 `3.0-rc-data-pre-processing.ipynb`

**Objetivo:** 

⚙️ **Atividades:**


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Group the vehicle features by the aspect of the car they describe.
tech_map = dict(
    # Engine / performance related attributes.
    perform=[
        'engine-size', 'horsepower', 'compression-ratio', 'bore', 'stroke',
        'peak-rpm', 'num-of-cylinders', 'engine-type', 'fuel-system',
        'engine-location',
    ],
    # Body and dimension related attributes.
    design=[
        'body-style', 'num-of-doors', 'drive-wheels', 'wheel-base',
        'curb-weight', 'length', 'width', 'height',
    ],
    # Market positioning attributes.
    mercado=['make', 'fuel-type', 'aspiration', 'avg-mpg'],
    risk_insurance=['symboling'],
    cost_losses=['normalized-losses'],
)
# In this example the 'perform' and 'design' groups of `tech_map` are merged
# into a single 'technical' group describing the vehicle's technical traits.

features_map = {
    # NOTE: every entry must be followed by a comma. Adjacent string literals
    # are silently concatenated by Python, which previously fused
    # 'engine-location' and 'body-style' into one non-existent column name.
    'technical': ['engine-size', 'horsepower', 'curb-weight', 'compression-ratio',
                    'bore', 'stroke', 'peak-rpm', 'num-of-cylinders', 'engine-type',
                    'fuel-system', 'engine-location', 'body-style', 'num-of-doors',
                    'drive-wheels', 'wheel-base', 'length', 'width', 'height'],

    'mercado': ['make', 'fuel-type', 'aspiration', 'avg-mpg'],
    'risk_insurance': ['symboling'],
    'cost_losses': ['normalized-losses']
}

In [5]:
# Read the prepared car-price dataset from the CSV at `file_path`.
file_path = "../data/processed/car_price_prep.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

### Feature Engineering
- Criação de novas features / enriquecimento de dados

##### Feature: `price-binned` - Classificação do Preço (Binning)

In [6]:
# Three price ranges are wanted, so one more cut point than labels is needed
# (number of bins = number of cut points - 1).
group_prices = ['low', 'medium', 'high']
bins_price = np.linspace(
    start=df['price'].min(),
    stop=df['price'].max(),
    num=len(group_prices) + 1,
)

# Assign each price to its equal-width interval; include_lowest=True keeps the
# minimum price inside the first interval instead of leaving it unlabelled.
df['price-binned'] = pd.cut(
    x=df['price'],
    bins=bins_price,
    labels=group_prices,
    include_lowest=True,
).astype(object)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,wheel-base,length,...,bore,stroke,compression-ratio,horsepower,peak-rpm,price,price-binned,risk_insurance,car-profile,avg-mpg
0,3,115,alfa-romero,gas,std,two,convertible,rwd,88.6,168.8,...,3.47,2.68,9.0,111,5000,13495,medium,high,sport/premium,24.0
1,3,115,alfa-romero,gas,std,two,convertible,rwd,88.6,168.8,...,3.47,2.68,9.0,111,5000,16500,medium,high,sport/premium,24.0
2,1,115,alfa-romero,gas,std,two,hatchback,rwd,94.5,171.2,...,2.68,3.47,9.0,154,5000,16500,medium,moderate,utility,22.5
3,2,164,audi,gas,std,four,sedan,fwd,99.8,176.6,...,3.19,3.4,10.0,102,5500,13950,medium,high,utility,27.0
4,2,164,audi,gas,std,four,sedan,4wd,99.4,176.6,...,3.19,3.4,8.0,115,5500,17450,medium,high,utility,20.0


##### Feature: `risk_insurance` - Classificação da variável `symboling`

In [None]:
# Risk label for each `symboling` value, listed in order from -3 up to 3.
symboling_map = dict(zip(
    range(-3, 4),
    ['low', 'low', 'moderate', 'neutral', 'moderate', 'high', 'high'],
))

# Create a new column holding the grouped risk classification.
df['risk_insurance'] = df['symboling'].map(symboling_map)

# Inspect the distinct labels that were produced.
df['risk_insurance'].unique()