In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import Libs.preprocessors as pp

# Feature Engineering

In [2]:
# Load the raw training data.
df = pd.read_csv('data/train.csv')

In [3]:
# Kilometres per mile, used to express every mileage in kilometres.
KM_PER_MILE = 1.609344


def _running_to_km(value):
    """Convert one raw `running` entry to a float number of kilometres.

    Strings ending in 'km' are parsed as kilometres; any other string is
    treated as a distance in miles and converted to kilometres.
    Non-string values (NaN, or floats left by a previous run of this cell)
    are returned unchanged, so the cell can be re-executed without raising.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()  # tolerate stray leading/trailing whitespace
    if text.endswith('km'):
        return float(text.replace('km', ''))
    return float(text.replace('miles', '')) * KM_PER_MILE


df['running'] = df['running'].apply(_running_to_km)

In [4]:
# Predictors are every column except the target (`price`) and `wheel`.
features = df.drop(columns=['price', 'wheel'])
target = df['price']

# Hold out 10% of the rows as the test set; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=0,
)

X_train.shape, X_test.shape

((1477, 8), (165, 8))

In [5]:
# Categorical columns handed to the one-hot encoding step of the pipeline.
ONEHOTENCODER_VARS = ['model', 'motor_type', 'color', 'type']

# Columns handed to the temporal transformer step of the pipeline.
TEMPORAL_VARS = ['year']

# Numerical columns handed to the Yeo-Johnson transformer.
NUMERICALS_YEO_VARS = ['running', 'motor_volume']

# Columns whose labels are replaced by the integer codes in `qual_mappings`.
QUAL_VARS = ['status']

# Label -> integer code used by the mapper step for the columns in QUAL_VARS.
qual_mappings = {
    'excellent': 3,
    'good': 2,
    'crashed': 0,
    'normal': 1,
    'new': 4,
}

In [6]:
# Feature-engineering pipeline; the steps run in the order listed.
pipeline = Pipeline(steps=[
    # 1. Temporal variables (custom transformer from Libs.preprocessors;
    #    presumably turns `year` into an elapsed-time value -- confirm there).
    (
        'elapsed_time',
        pp.TemporalVariableTransformer(variables=TEMPORAL_VARS),
    ),
    # 2. Yeo-Johnson transformation of the numerical variables.
    (
        'yeojohnson',
        YeoJohnsonTransformer(variables=NUMERICALS_YEO_VARS),
    ),
    # 3. Custom mapper configured with the label -> integer codes
    #    defined in `qual_mappings`.
    (
        'mapper_qual',
        pp.Mapper(variables=QUAL_VARS, mappings=qual_mappings),
    ),
    # 4. Custom one-hot encoder for the remaining categorical variables.
    (
        'one_hot_encoder',
        pp.CategoricalOneHotEncoder(variables=ONEHOTENCODER_VARS),
    ),
])

In [7]:
# Fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [9]:
# Apply the fitted pipeline to both splits (training first, then test).
X_train, X_test = (pipeline.transform(split) for split in (X_train, X_test))

In [10]:
# Display the transformed training features to inspect the pipeline output.
X_train

Unnamed: 0,year,running,status,motor_volume,model_kia,model_mercedes-benz,model_nissan,model_toyota,motor_type_gas,motor_type_hybrid,...,color_red,color_silver,color_skyblue,color_white,type_Universal,type_hatchback,type_minivan / minibus,type_pickup,type_sedan,type_suv
817,13,423.471478,2,1.360932,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
308,5,278.011031,3,1.360932,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1259,27,383.035705,3,1.360932,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
240,4,88.055424,3,1.360932,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1503,5,275.946848,3,1.360932,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,12,445.605836,1,1.257903,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
835,6,326.226333,3,1.360932,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1216,6,106.865425,3,1.360932,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
559,6,362.574598,3,1.360932,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


El valor `petrol and gas` de la columna `motor_type` es la combinación de los valores `gas` y `petrol`. Por tanto, en los registros que tengan ese valor activo se activarán las columnas `motor_type_gas` y `motor_type_petrol`, y después se eliminará la columna `motor_type_petrol and gas`.

In [12]:
def merge_petrol_and_gas(features):
    """Fold the 'motor_type_petrol and gas' indicator into the gas and petrol ones.

    Rows where 'motor_type_petrol and gas' equals 1 get both 'motor_type_gas'
    and 'motor_type_petrol' set to 1; every other row keeps its current
    values. The combined column is then removed.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing the three motor-type indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the combined column; the input is not modified.
    """
    combined_col = 'motor_type_petrol and gas'
    is_combined = features[combined_col] == 1

    merged = features.drop([combined_col], axis=1)
    for fuel_col in ('motor_type_gas', 'motor_type_petrol'):
        # Vectorised overwrite: 1 where the combined flag is set, unchanged elsewhere.
        merged[fuel_col] = merged[fuel_col].mask(is_combined, 1)
    return merged


# Apply the same merge to both splits so their columns stay identical.
X_train = merge_petrol_and_gas(X_train)
X_test = merge_petrol_and_gas(X_test)

In [13]:
scaler = MinMaxScaler()

# Fit the scaler on the training set only; the test set is transformed with
# the same fitted scaler below.
scaler.fit(X_train)

# Transform the training set and the test set.
# scikit-learn returns numpy arrays, so wrap each array in a pandas DataFrame.
# The original row index is passed through (instead of letting pandas create
# a fresh 0..n-1 index) so every feature row stays aligned with its label in
# y_train / y_test.

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns,
    index=X_test.index,
)

In [14]:

# Destination path -> object, for the scaled features and the targets.
csv_outputs = {
    './output/xtrain.csv': X_train,
    './output/xtest.csv': X_test,
    './output/ytrain.csv': y_train,
    './output/ytest.csv': y_test,
}

# Write each object without its row index.
for csv_path, data in csv_outputs.items():
    data.to_csv(csv_path, index=False)

# Persist the fitted scaler so the same scaling can be reapplied later.
joblib.dump(scaler, './output/minmax_scaler.joblib')

['./output/minmax_scaler.joblib']