## 3. Feature Engineering
📒 `3.0-rc-feature-engineering.ipynb`

**Objetivo:** Gerar variáveis que melhorem a capacidade preditiva do modelo.

⚙️ **Atividades:**
- Criação de variáveis derivadas (novas features);
- Codificação de variáveis categóricas (Label Encoding ou One-Hot);
- Transformações matemáticas (Log, raiz quadrada, quadrado, binning (faixas));
- Encoding avançado
- Seleção de features relevantes
- Redução de dimensionalidade (se aplicável)

- Remover colunas PCA (`engine-location`)
- Adicionar novas features, como: `car-profile`


In [71]:
import pandas as pd
import numpy as np
import warnings

# Display every DataFrame column and allow wide (1000-char) output lines
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Suppress FutureWarning messages
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [72]:
# Read the cleaned car-price dataset from the processed-data folder
file_path = "../data/processed/car_price_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,risk_classification,price-binned,avg-fuel-consumption
0,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,high,medium,10.203456
1,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,high,medium,10.203456
2,1,115,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,moderate,medium,9.56574
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,high,medium,11.478888
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,high,medium,8.50288


#### Criação de novas features / enriquecimento de dados

In [66]:
# Relative power (power-to-weight ratio)
# Relates engine power to curb weight: the higher the value, the more power
# per unit of weight.
# `horsepower` is converted from HP to metric horsepower (CV) with the factor
# 1.01387 CV per HP, and the ratio is scaled by 1000 to make it easier to read.
# NOTE(review): the original note describes the result as "CV per 1000 kg";
# that only holds if `curb-weight` is stored in kg - confirm the unit.

df['power-to-weight-ratio'] = (
    df['horsepower'].mul(1.01387).div(df['curb-weight']).mul(1000)
)

In [67]:
# Average fuel consumption (avg-fuel-consumption) in km/l
# Mean of the city and highway figures, converted with 1 mpg = 0.425144 km/l
df['avg-fuel-consumption'] = (
    df['city-mpg'].add(df['highway-mpg']).mul(0.425144).div(2)
)
df.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,risk_classification,price-binned,power-to-weight-ratio,avg-fuel-consumption
0,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,high,medium,44.167806,10.203456
1,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,high,medium,44.167806,10.203456
2,1,115,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,moderate,medium,55.30853,9.56574
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,high,medium,44.251065,11.478888
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,high,medium,41.287199,8.50288


In [None]:
# Helper that turns a body style into a coarse car-profile label
def map_profile(style):
    """Return the car-profile label for a body style.

    'sedan' and 'wagon' map to 'familiar', 'hatchback' maps to 'urbano',
    'convertible' and 'hardtop' map to 'esportivo'; any other value
    yields 'outro'.
    """
    profile_groups = (
        (('sedan', 'wagon'), 'familiar'),
        (('hatchback',), 'urbano'),
        (('convertible', 'hardtop'), 'esportivo'),
    )
    for body_styles, profile in profile_groups:
        if style in body_styles:
            return profile
    # Fallback for every body style not listed above
    return 'outro'

# Build the `car-profile` column by mapping each `body-style` value through map_profile
df['car-profile'] = df['body-style'].map(map_profile)

#### Codificação de variáveis categóricas (Label Encoding ou One-Hot)

In [68]:
# Collect the names of the categorical (object-dtype) columns
object_columns = df.select_dtypes(include=['object']).columns
categorical_features = list(object_columns)
categorical_features

['make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'engine-type',
 'num-of-cylinders',
 'fuel-system',
 'risk_classification',
 'price-binned']

In [69]:
# One-hot encode the nominal categorical variables selected above.
# NOTE(review): in the displayed output `engine-location` produces a single
# dummy column (`engine-location_front`) - check whether it adds information.
# NOTE(review): `price-binned` looks derived from `price` - confirm it should
# be encoded as a feature.
df = pd.get_dummies(data=df, columns=categorical_features)
df.head(5)

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,power-to-weight-ratio,avg-fuel-consumption,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_mazda,make_mercedes-benz,make_mercury,make_mitsubishi,make_nissan,make_peugot,make_plymouth,make_porsche,make_renault,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel-type_diesel,fuel-type_gas,aspiration_std,aspiration_turbo,num-of-doors_four,num-of-doors_two,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,engine-location_front,engine-type_dohc,engine-type_dohcv,engine-type_l,engine-type_ohc,engine-type_ohcf,engine-type_ohcv,engine-type_rotor,num-of-cylinders_eight,num-of-cylinders_five,num-of-cylinders_four,num-of-cylinders_six,num-of-cylinders_three,num-of-cylinders_two,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi,risk_classification_high,risk_classification_low,risk_classification_moderate,price-binned_high,price-binned_low,price-binned_medium
0,3,115,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495,44.167806,10.203456,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,True,False,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True
1,3,115,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500,44.167806,10.203456,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,True,False,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True
2,1,115,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500,55.30853,9.56574,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True
3,2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950,44.251065,11.478888,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True
4,2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450,41.287199,8.50288,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,True,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True


In [70]:
# Persist the enriched dataset, without writing the DataFrame index
path_to_save = "../data/processed/car_price_enriched.csv"
df.to_csv(path_or_buf=path_to_save, index=False)