In [54]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Feature Engineering

In [26]:
# Load the raw training data (path is relative to the notebook's working directory).
df = pd.read_csv(
    'data/train.csv',
)

In [27]:
KM_PER_MILE = 1.609344  # kilometres in one international mile


def transform_running(run):
    """Convert an odometer string to kilometres.

    Parameters
    ----------
    run : str
        Distance followed by its unit, e.g. ``'3000 km'`` or ``'95000 miles'``.
        Surrounding whitespace and the letter case of the unit are ignored.

    Returns
    -------
    float
        The distance expressed in kilometres.

    Raises
    ------
    ValueError
        If what remains after removing the unit is not a valid number.
    """
    # Normalise first so that inputs such as ' 3000 KM ' are still recognised
    # as kilometres instead of falling through to the miles branch.
    text = run.strip().lower()
    if text.endswith('km'):
        return float(text[:-2])
    # Anything not ending in 'km' is treated as miles, as before.
    return float(text.replace('miles', '')) * KM_PER_MILE

In [28]:
# Express every odometer reading in kilometres, then show the resulting frame.
running_km = df['running'].map(transform_running)
df['running'] = running_km
df

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
0,toyota,2022,petrol,3000.0000,left,skyblue,sedan,excellent,2.0,24500
1,mercedes-benz,2014,petrol,132000.0000,left,black,sedan,excellent,2.0,25500
2,kia,2018,petrol,152887.6800,left,other,sedan,excellent,2.0,11700
3,mercedes-benz,2002,petrol,220480.1280,left,golden,sedan,excellent,3.2,12000
4,mercedes-benz,2017,petrol,130000.0000,left,black,sedan,good,2.0,26000
...,...,...,...,...,...,...,...,...,...,...
1637,hyundai,2017,petrol,193121.2800,left,white,sedan,good,2.0,12400
1638,toyota,2014,petrol,170000.0000,left,black,sedan,good,2.0,16500
1639,nissan,2018,petrol,110883.8016,left,blue,suv,good,2.0,19500
1640,nissan,2019,petrol,49889.6640,left,black,suv,excellent,2.0,19500


**Tratamiento de nulos**

In [29]:
# Count the missing values of every column once, then report them one per line.
for col, n_nulos in df.isna().sum().items():
    print(f'La columna {col} tiene {n_nulos} nulos')

La columna model tiene 0 nulos
La columna year tiene 0 nulos
La columna motor_type tiene 0 nulos
La columna running tiene 0 nulos
La columna wheel tiene 0 nulos
La columna color tiene 0 nulos
La columna type tiene 0 nulos
La columna status tiene 0 nulos
La columna motor_volume tiene 0 nulos
La columna price tiene 0 nulos


Se puede ver que no hay nulos dentro de los datos.

## Variable Calificadora

In [30]:
# Ordinal codes for the `status` column, listed from lowest to highest code.
qual_mappings = {
    'crashed': 0,
    'normal': 1,
    'good': 2,
    'excellent': 3,
    'new': 4,
}

In [31]:
# Replace the textual status labels with their ordinal codes.
status_codes = df['status'].map(qual_mappings)
df['status'] = status_codes

## Variable Categorica

Se elimina la variable wheel porque solo existe un valor en esa columna.

In [32]:
# `wheel` holds a single value for every row, so it carries no information.
df = df.drop(columns=['wheel'])

Se crea un OneHotEncoder para codificar las variables categóricas.

In [33]:
# Fit the one-hot encoder on the nominal columns and encode them.
categorical_frame = df[['model', 'motor_type', 'color', 'type']]
encoder = OneHotEncoder()
encoder.fit(categorical_frame)
encoded_data = encoder.transform(categorical_frame)

In [34]:
# Turn the sparse encoder output into a dense, labelled DataFrame.
encoded_columns = encoder.get_feature_names_out(['model', 'motor_type','color','type'])
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoded_columns,
)

In [39]:
# Display the frame as it stands before the one-hot columns are attached.
df

Unnamed: 0,model,year,motor_type,running,color,type,status,motor_volume,price
0,toyota,2022,petrol,3000.0000,skyblue,sedan,3,2.0,24500
1,mercedes-benz,2014,petrol,132000.0000,black,sedan,3,2.0,25500
2,kia,2018,petrol,152887.6800,other,sedan,3,2.0,11700
3,mercedes-benz,2002,petrol,220480.1280,golden,sedan,3,3.2,12000
4,mercedes-benz,2017,petrol,130000.0000,black,sedan,2,2.0,26000
...,...,...,...,...,...,...,...,...,...
1637,hyundai,2017,petrol,193121.2800,white,sedan,2,2.0,12400
1638,toyota,2014,petrol,170000.0000,black,sedan,2,2.0,16500
1639,nissan,2018,petrol,110883.8016,blue,suv,2,2.0,19500
1640,nissan,2019,petrol,49889.6640,black,suv,3,2.0,19500


In [42]:
# Keep the non-categorical columns and attach the one-hot columns next to them.
# The index is reset so both parts line up positionally in the concat.
base_columns = df[['year','running','status','motor_volume','price']].reset_index(drop=True)
df = pd.concat([base_columns, encoded_df], axis=1)



In [43]:
# Display the frame now that the one-hot encoded columns have been attached.
df

Unnamed: 0,year,running,status,motor_volume,price,model_hyundai,model_kia,model_mercedes-benz,model_nissan,model_toyota,...,color_silver,color_skyblue,color_white,type_Coupe,type_Universal,type_hatchback,type_minivan / minibus,type_pickup,type_sedan,type_suv
0,2022,3000.0000,3,2.0,24500,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2014,132000.0000,3,2.0,25500,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2018,152887.6800,3,2.0,11700,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2002,220480.1280,3,3.2,12000,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2017,130000.0000,2,2.0,26000,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,2017,193121.2800,2,2.0,12400,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1638,2014,170000.0000,2,2.0,16500,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1639,2018,110883.8016,2,2.0,19500,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1640,2019,49889.6640,3,2.0,19500,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


El valor `petrol and gas` de la columna `motor_type` es la combinación de las categorías `gas` y `petrol`. Por tanto, en los registros donde `motor_type_petrol and gas` está activo se activan las columnas `motor_type_gas` y `motor_type_petrol`, y después se elimina la columna `motor_type_petrol and gas`.

In [51]:
# Rows flagged as `petrol and gas` use both fuels: switch on the two individual
# indicators for them and then drop the combined column.
# A boolean mask replaces the two row-wise `apply` calls; the resulting values
# and float dtype are the same, without a Python-level loop over the rows.
dual_fuel = df['motor_type_petrol and gas'] == 1
df['motor_type_gas'] = df['motor_type_gas'].mask(dual_fuel, 1.0)
df['motor_type_petrol'] = df['motor_type_petrol'].mask(dual_fuel, 1.0)
df = df.drop(columns=['motor_type_petrol and gas'])

In [52]:
# List the dtype of every column of the current frame.
df.dtypes

year                        int64
running                   float64
status                      int64
motor_volume              float64
price                       int64
model_hyundai             float64
model_kia                 float64
model_mercedes-benz       float64
model_nissan              float64
model_toyota              float64
motor_type_diesel         float64
motor_type_gas            float64
motor_type_hybrid         float64
motor_type_petrol         float64
color_beige               float64
color_black               float64
color_blue                float64
color_brown               float64
color_cherry              float64
color_clove               float64
color_golden              float64
color_gray                float64
color_green               float64
color_orange              float64
color_other               float64
color_pink                float64
color_purple              float64
color_red                 float64
color_silver              float64
color_skyblue 

Se divide el dataset en entrenamiento y prueba (el 10 % de los registros se reserva para prueba).

In [53]:
# Separate predictors from the target, then hold out 10% of the rows for testing.
features = df.drop(columns=['price'])
target = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,   # share of rows assigned to the test set
    random_state=0,  # fixed seed so the split is reproducible
)

X_train.shape, X_test.shape

((1477, 37), (165, 37))

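Se agrega un escalador (MinMaxScaler) para los valores numéricos; se ajusta solo con el conjunto de entrenamiento y se aplica a todas las columnas predictoras.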

In [55]:
scaler = MinMaxScaler()

# Fit the scaler on the training set only; the test set is transformed
# with the parameters learned here.
scaler.fit(X_train)

# Transform both the training set and the test set.
# scikit-learn returns NumPy arrays, so each one is wrapped back into a
# pandas DataFrame. The original row index is passed along so the frames
# stay aligned with y_train / y_test, which still carry the index produced
# by train_test_split; without it the features would get a fresh 0..n-1 index.

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [57]:

# Create the output folder if it is missing, so the cell also works on a
# fresh checkout instead of failing on the first write.
Path('./output').mkdir(parents=True, exist_ok=True)

X_train.to_csv('./output/xtrain.csv', index=False)
X_test.to_csv('./output/xtest.csv', index=False)

y_train.to_csv('./output/ytrain.csv', index=False)
y_test.to_csv('./output/ytest.csv', index=False)

# Persist both fitted transformers: new data has to go through the same
# one-hot encoding before it can be scaled. The scaler is dumped last so the
# value displayed by the cell stays the same.
joblib.dump(encoder, './output/onehot_encoder.joblib')
joblib.dump(scaler, './output/minmax_scaler.joblib')

['./output/minmax_scaler.joblib']