# Построение модели на примере полиномиальной регрессии (polynomial regression) с использованием `sklearn.pipeline`

## Подготовка окружения

In [1]:
# ВНИМАНИЕ: необходимо удостовериться, что виртуальная среда выбрана правильно!

# Для MacOS/Ubuntu
# !which pip

# Для Windows
# !where pip

In [2]:
# !conda install matplotlib numpy scikit-learn seaborn scipy -y

In [3]:
import numpy as np

np.__version__

'1.19.2'

In [4]:
import pandas as pd

pd.__version__

'1.2.3'

In [5]:
import scipy
from scipy import stats

scipy.__version__

'1.6.2'

In [6]:
import matplotlib
import matplotlib.pyplot as plt

matplotlib.__version__

'3.3.4'

In [7]:
import seaborn as sns

sns.__version__

'0.11.1'

## Загрузка данных

[Источник (FuelConsumption)](https://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64)

In [8]:
df = pd.read_csv("./../../data/FuelConsumptionCo2.csv")
df

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,2014,VOLVO,XC60 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1063,2014,VOLVO,XC60 AWD,SUV - SMALL,3.2,6,AS6,X,13.2,9.5,11.5,25,264
1064,2014,VOLVO,XC70 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1065,2014,VOLVO,XC70 AWD,SUV - SMALL,3.2,6,AS6,X,12.9,9.3,11.3,25,260


## Pipeline

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression


In [10]:
y = df['CO2EMISSIONS'].copy()
X = df.loc[:, df.columns != 'CO2EMISSIONS'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [12]:
# TODO: Outliers

In [13]:
from sklearn.preprocessing import RobustScaler
numeric_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())
])

In [14]:
CYLINDERS_transformer = Pipeline(steps=[
    ('replace', FunctionTransformer(lambda x: x.replace(to_replace=
                                                           {3: 4, 
                                                            5: 4, 
                                                            10: 8,
                                                            12: 8}))),
     ('encoder_ord', OrdinalEncoder())])
CYLINDERS_transformer

Pipeline(steps=[('replace',
                 FunctionTransformer(func=<function <lambda> at 0x7f31ed568b80>)),
                ('encoder_ord', OrdinalEncoder())])

In [15]:
ENGINESIZE_transformer = Pipeline(steps=[
    ('round', FunctionTransformer(lambda x: x.astype(float).round(0))),
    ('replace', FunctionTransformer(lambda x: x.replace(to_replace={1: 2, 
                                                                    7: 8, 
                                                                    8: 6}))),
    ('encoder', OrdinalEncoder())
    ])
ENGINESIZE_transformer

Pipeline(steps=[('round',
                 FunctionTransformer(func=<function <lambda> at 0x7f31ed43d700>)),
                ('replace',
                 FunctionTransformer(func=<function <lambda> at 0x7f31ed43d790>)),
                ('encoder', OrdinalEncoder())])

In [16]:
FUELTYPE_transformer = Pipeline(steps=[
    ('replace', FunctionTransformer(lambda x: x.replace('D', 'X'))), 
    ('encoder_oh', OneHotEncoder()) 
    ])
FUELTYPE_transformer

Pipeline(steps=[('replace',
                 FunctionTransformer(func=<function <lambda> at 0x7f31ed43da60>)),
                ('encoder_oh', OneHotEncoder())])

In [17]:
numeric_features = ['FUELCONSUMPTION_COMB_MPG']
cat_ord_features = ['CYLINDERS', 
#                     'ENGINESIZE'
                   ]
cat_unord_features = ['FUELTYPE']

In [18]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('drop', 'drop', df.columns.difference(['FUELCONSUMPTION_COMB_MPG', 'CYLINDERS', 'ENGINESIZE', 'FUELTYPE', 'CO2EMISSIONS']).tolist()),
        ('categorical_CYLINDERS', CYLINDERS_transformer, ['CYLINDERS']),
        ('categorical_ENGINESIZE', ENGINESIZE_transformer, ['ENGINESIZE']),
        ('categorical_FUELTYPE', FUELTYPE_transformer, ['FUELTYPE']),
        ('numeric', numeric_transformer, numeric_features),
        
]) 
preprocessor

ColumnTransformer(transformers=[('drop', 'drop',
                                 ['FUELCONSUMPTION_CITY',
                                  'FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_HWY',
                                  'MAKE', 'MODEL', 'MODELYEAR', 'TRANSMISSION',
                                  'VEHICLECLASS']),
                                ('categorical_CYLINDERS',
                                 Pipeline(steps=[('replace',
                                                  FunctionTransformer(func=<function <lambda> at 0x7f31ed568b80>)),
                                                 ('encoder_ord',
                                                  OrdinalEncoder())]),
                                 ['CYLINDERS']),
                                ('categoric...
                                                  FunctionTransformer(func=<function <lambda> at 0x7f31ed43d790>)),
                                                 ('encoder',
                                    

In [19]:
pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
model = pipeline.fit(X_train, y_train)
model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('drop', 'drop',
                                                  ['FUELCONSUMPTION_CITY',
                                                   'FUELCONSUMPTION_COMB',
                                                   'FUELCONSUMPTION_HWY',
                                                   'MAKE', 'MODEL', 'MODELYEAR',
                                                   'TRANSMISSION',
                                                   'VEHICLECLASS']),
                                                 ('categorical_CYLINDERS',
                                                  Pipeline(steps=[('replace',
                                                                   FunctionTransformer(func=<function <lambda> at 0x7f31ed568b80>)),
                                                                  ('encoder_ord',
                                                                   OrdinalEncoder...
       

In [20]:
pd.DataFrame(preprocessor.transform(X_test))

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,1.0,0.0,0.4
1,1.0,2.0,0.0,1.0,0.0,-0.2
2,0.0,0.0,0.0,0.0,1.0,0.8
3,0.0,0.0,0.0,1.0,0.0,1.4
4,2.0,4.0,0.0,0.0,1.0,-0.8
...,...,...,...,...,...,...
209,2.0,3.0,1.0,0.0,0.0,-1.1
210,0.0,0.0,0.0,1.0,0.0,0.7
211,0.0,0.0,0.0,1.0,0.0,0.9
212,2.0,3.0,0.0,0.0,1.0,-0.7


In [21]:
preprocessor.transform(X_test)

array([[ 0. ,  0. ,  0. ,  1. ,  0. ,  0.4],
       [ 1. ,  2. ,  0. ,  1. ,  0. , -0.2],
       [ 0. ,  0. ,  0. ,  0. ,  1. ,  0.8],
       ...,
       [ 0. ,  0. ,  0. ,  1. ,  0. ,  0.9],
       [ 2. ,  3. ,  0. ,  0. ,  1. , -0.7],
       [ 2. ,  4. ,  0. ,  0. ,  1. , -0.4]])

In [22]:
from sklearn.metrics import r2_score

# оценить модель
predictions = model.predict(X_test)

# Наилучшая возможная оценка - 1.0
print(r2_score(predictions, y_test))

0.8994333778885178
