In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import Libs.preprocessors as pp

# Feature Engineering

In [25]:
df=pd.read_csv('data/train.csv')
features=pd.read_csv('output/selected_features.csv')

In [26]:
df['running']=df['running'].apply(lambda x: float(x.replace('km','')) if x[-2:]=='km' else float(x.replace('miles',''))*1.609344)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['price','wheel'], axis=1), # predictive variables
    df['price'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((1477, 8), (165, 8))

In [28]:
X_train

Unnamed: 0,model,year,motor_type,running,color,type,status,motor_volume
817,hyundai,2011,petrol,125000.0000,black,sedan,good,2.0
308,kia,2019,petrol,49084.9920,blue,sedan,excellent,2.0
1259,mercedes-benz,1997,gas,100000.0000,silver,sedan,excellent,2.0
240,hyundai,2020,petrol,3900.0000,white,sedan,excellent,2.0
1503,kia,2019,petrol,48280.3200,other,sedan,excellent,2.0
...,...,...,...,...,...,...,...,...
763,hyundai,2012,petrol,140000.0000,white,sedan,normal,1.8
835,mercedes-benz,2018,petrol,70000.0000,black,sedan,excellent,2.0
1216,mercedes-benz,2018,petrol,5954.5728,black,sedan,excellent,2.0
559,hyundai,2018,petrol,88513.9200,silver,sedan,excellent,2.0


In [29]:
ONEHOTENCODER_VARS = ['model', 'motor_type','color','type']

TEMPORAL_VARS = ['year']

NUMERICALS_YEO_VARS = ["running", "motor_volume"]

QUAL_VARS = [
    'status']

qual_mappings = {'excellent': 3, 'good':2, 'crashed': 0, 'normal': 1, 'new': 4}

In [30]:
pipeline = Pipeline([

    # == TEMPORAL VARIABLES ====
    ('elapsed_time', pp.TemporalVariableTransformer(
        variables=TEMPORAL_VARS)),

    # ===================== VARIABLE TRANSFORMATION ======================

    ('yeojohnson', YeoJohnsonTransformer(variables=NUMERICALS_YEO_VARS)),


    # =========================== mappers ===============================
    ('mapper_qual', pp.Mapper(
        variables=QUAL_VARS, mappings=qual_mappings)),

    # == CATEGORICAL ENCODING
    ('one_hot_encoder', pp.CategoricalOneHotEncoder(
        variables=ONEHOTENCODER_VARS
    )),    

    ('column_selector', pp.ColumnSelector(columns=features['0'].values)),

    ('ColumnsTRansformOHE',pp.MotorTypeTransformer()),


    ('scaler', MinMaxScaler()),
])

In [31]:
# train the pipeline
pipeline.fit(X_train, y_train)

In [32]:
X_train = pipeline.transform(X_train)
X_test = pipeline.transform(X_test)

In [33]:
X_train

array([[0.33333333, 0.35329447, 0.5       , ..., 0.        , 1.        ,
        0.        ],
       [0.11111111, 0.23069497, 0.75      , ..., 0.        , 1.        ,
        0.        ],
       [0.72222222, 0.31921369, 0.75      , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.13888889, 0.08644706, 0.75      , ..., 0.        , 1.        ,
        0.        ],
       [0.13888889, 0.3019683 , 0.75      , ..., 0.        , 1.        ,
        0.        ],
       [0.13888889, 0.35329447, 0.75      , ..., 0.        , 1.        ,
        0.        ]])

El valor petrol and gas en la columna motor type, es un conjunto de la variable gas y petrol, por tanto se transformara para que los registros que tengan ese valor activo se eliminara y se editara la variable motor_type gas y petroleo.

In [23]:
X_train

Unnamed: 0,year,running,status,motor_volume,model_kia,model_mercedes-benz,model_nissan,model_toyota,motor_type_gas,motor_type_hybrid,...,color_purple,color_red,color_silver,color_skyblue,color_white,type_Universal,type_hatchback,type_pickup,type_sedan,type_suv
0,0.333333,0.353294,0.50,0.579041,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.111111,0.230695,0.75,0.579041,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.722222,0.319214,0.75,0.579041,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.083333,0.070593,0.75,0.579041,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.111111,0.228955,0.75,0.579041,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,0.305556,0.371950,0.25,0.528147,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1473,0.138889,0.271333,0.75,0.579041,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1474,0.138889,0.086447,0.75,0.579041,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1475,0.138889,0.301968,0.75,0.579041,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:

X_train.to_csv('./output/xtrain.csv', index=False)
X_test.to_csv('./output/xtest.csv', index=False)

y_train.to_csv('./output/ytrain.csv', index=False)
y_test.to_csv('./output/ytest.csv', index=False)

['./output/minmax_scaler.joblib']