# Построение модели на примере полиномиальной регрессии (polynomial regression) с использованием `sklearn.pipeline`

## Подготовка окружения

In [None]:
# WARNING: make sure the correct virtual environment is selected!

# macOS/Ubuntu
# !which pip

# Windows
# !where pip

In [None]:
# Uncomment to install the packages used by this notebook into the active
# conda environment.
# NOTE(review): pandas is imported below but is not listed here — presumably
# it is pulled in as a dependency of seaborn; confirm.
# !conda install matplotlib numpy scikit-learn seaborn scipy -y

In [None]:
import numpy as np

np.__version__

In [None]:
import pandas as pd

pd.__version__

In [None]:
import scipy
from scipy import stats

scipy.__version__

In [None]:
import matplotlib
import matplotlib.pyplot as plt

matplotlib.__version__

In [None]:
import seaborn as sns

sns.__version__

## Загрузка данных

[Источник (Churn Modelling)](https://www.kaggle.com/shrutimechlearn/churn-modelling)

In [None]:
# Load the dataset into a DataFrame (path is relative to the notebook's
# working directory) and display it.
# NOTE(review): the file name and the source link above refer to
# "Churn Modelling", but the following cells access columns such as
# MODELYEAR, ENGINESIZE, FUELTYPE and CO2EMISSIONS — confirm that this CSV
# really contains them, otherwise the next cell fails with a KeyError.
df = pd.read_csv("./../../data/Churn_Modelling.csv")
df

## Преобразование типов и очистка

In [None]:
# Remove the MODELYEAR column in place; it is treated as carrying no useful
# information for the model.
del df['MODELYEAR']

In [None]:
# Cast columns to appropriate dtypes: MODEL is converted to the pandas
# "string" dtype first, then every column that is still object-typed is
# converted to "category".
df['MODEL'] = df['MODEL'].astype("string")

object_columns = df.select_dtypes(include='object').columns
for object_column in object_columns:
    df[object_column] = df[object_column].astype("category")

In [None]:
# ENGINESIZE and CYLINDERS are converted to ordered categoricals; the
# categories themselves are inferred from the values present in each column.
for ordinal_column in ('ENGINESIZE', 'CYLINDERS'):
    df[ordinal_column] = df[ordinal_column].astype(
        pd.CategoricalDtype(ordered=True)
    )

## Разделение данных на `X` и `y`

In [None]:
from sklearn.model_selection import train_test_split

# Target: the CO2EMISSIONS column; features: every other column of df.
y = df['CO2EMISSIONS'].copy()
X = df.loc[:, df.columns != 'CO2EMISSIONS'].copy()

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore every score computed below) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Удаление выбросов

In [None]:
def remove_outliers_iqr(data, col_name):
    """Return `data` without the rows that are IQR outliers in `col_name`.

    A row is dropped when its value in `col_name` is strictly below
    Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of that column.

    Side effects: prints the column name and the number of removed rows, and
    draws a box plot and a histogram of the remaining values on a new
    two-panel figure.

    Args:
        data: DataFrame to filter (not modified).
        col_name: name of the column the fences are computed on.

    Returns:
        The filtered DataFrame with its index reset to 0..n-1.
    """
    print(f"column name: {col_name}\n")

    values = data[col_name]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    # Keep every row that is flagged by neither comparison.
    kept = data[~too_low & ~too_high]

    print(f"removed: {len(data) - len(kept)}")

    _, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 2))
    sns.boxplot(data=kept[col_name], ax=box_ax)
    sns.histplot(data=kept[col_name], ax=hist_ax)

    return kept.reset_index(drop=True)

In [None]:
# Outliers are removed from the training data only: features and target are
# joined into one frame so that a dropped row disappears from both.
df_tmp = X_train.join(y_train).reset_index(drop=True)
print('len(df_tmp) BEFORE:', len(df_tmp))

# Take the numeric columns from the frame that is actually being filtered
# (not from the global df), so this cell cannot reference a column that the
# training frame does not have.
numeric_columns = df_tmp.select_dtypes(include=np.number).columns
for col_name in numeric_columns:
    # Each pass works on the result of the previous one, so the filters are
    # applied cumulatively.
    df_tmp = remove_outliers_iqr(df_tmp, col_name)
    print("-"*30, '\n')
print('len(df_tmp) AFTER:', len(df_tmp))

# Split the cleaned frame back into features and target.
X_train = df_tmp.loc[:, df_tmp.columns != 'CO2EMISSIONS'].copy()
y_train = df_tmp['CO2EMISSIONS'].copy()

## Pipeline

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [None]:
# Numeric features go through a single step: scaling with RobustScaler.
numeric_transformer = Pipeline([('scaler', RobustScaler())])

In [None]:
# Cylinder counts 3 and 5 are replaced by 4, and 10 and 12 by 8, before the
# column is ordinal-encoded.
CYLINDERS_REPLACEMENTS = {3: 4, 5: 4, 10: 8, 12: 8}

CYLINDERS_transformer = Pipeline(steps=[
    ('replace', FunctionTransformer(
        lambda x: x.replace(to_replace=CYLINDERS_REPLACEMENTS))),
    ('encoder_ord', OrdinalEncoder()),
])
CYLINDERS_transformer

In [None]:
# Engine sizes are cast to float and rounded to whole numbers, some of the
# rounded values are replaced according to the mapping below, and the result
# is ordinal-encoded.
# NOTE(review): 7 is mapped to 8 while 8 itself is mapped to 6 — confirm that
# this mapping is intended.
ENGINESIZE_REPLACEMENTS = {1: 2, 7: 8, 8: 6}

ENGINESIZE_transformer = Pipeline(steps=[
    ('round', FunctionTransformer(lambda x: x.astype(float).round(0))),
    ('replace', FunctionTransformer(
        lambda x: x.replace(to_replace=ENGINESIZE_REPLACEMENTS))),
    ('encoder', OrdinalEncoder()),
])
ENGINESIZE_transformer

In [None]:
# Fuel type 'D' is replaced by 'X', then the column is one-hot encoded.
FUELTYPE_transformer = Pipeline(steps=[
    ('replace', FunctionTransformer(
        lambda x: x.replace(to_replace='D', value='X'))),
    ('encoder_oh', OneHotEncoder()),
])
FUELTYPE_transformer

In [None]:
# Columns that must not end up in the explicit 'drop' list: the four model
# features plus the target (CO2EMISSIONS).
kept_columns = ['FUELCONSUMPTION_COMB_MPG', 'CYLINDERS', 'ENGINESIZE', 'FUELTYPE', 'CO2EMISSIONS']
dropped_columns = df.columns.difference(kept_columns).tolist()

# Route each feature to its own transformer; every other column of df is
# dropped explicitly.
preprocessor = ColumnTransformer(transformers=[
    ('categorical_CYLINDERS', CYLINDERS_transformer, ['CYLINDERS']),
    ('categorical_ENGINESIZE', ENGINESIZE_transformer, ['ENGINESIZE']),
    ('categorical_FUELTYPE', FUELTYPE_transformer, ['FUELTYPE']),
    ('numeric_FUELCONSUMPTION_COMB_MPG', numeric_transformer, ['FUELCONSUMPTION_COMB_MPG']),
    ('drop', 'drop', dropped_columns),
])

In [None]:
# Full model: preprocessing -> polynomial feature expansion -> linear
# regression. The step names are used to address parameters in the grid
# search.
preprocess_step = ('preprocessor', preprocessor)
poly_step = ('poly', PolynomialFeatures())
model_step = ('model', LinearRegression())

pipeline = Pipeline(steps=[preprocess_step, poly_step, model_step])

In [None]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter>".
# Here the polynomial degree of the 'poly' step is searched over 1, 2 and 3.
param_grid = dict(poly__degree=[1, 2, 3])

model_gcv = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1)
model_gcv.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "Best parameter (CV score=%0.3f):" % model_gcv.best_score_,
    model_gcv.best_params_,
    sep="\n",
)

In [None]:
from pathlib import Path

from sklearn import set_config                      # to change the display
from sklearn.utils import estimator_html_repr       # to save the diagram into HTML format

# Render pipelines / composite estimators as diagrams.
set_config(display='diagram')

# Save the best pipeline as HTML. The target directory is created first so
# the cell also works on a fresh checkout, and the encoding is fixed to UTF-8
# so the output does not depend on the platform default.
html_path = Path('./tmp/pipeline.html')
html_path.parent.mkdir(parents=True, exist_ok=True)
html_path.write_text(estimator_html_repr(model_gcv.best_estimator_), encoding='utf-8')

# Display the best estimator found by the grid search.
model_gcv.best_estimator_

In [None]:
# Number of features entering and leaving the polynomial expansion step.
# `n_features_in_` is used instead of `n_input_features_`, which was
# deprecated and later removed from PolynomialFeatures.
poly_step_fitted = model_gcv.best_estimator_['poly']
poly_step_fitted.n_features_in_, poly_step_fitted.n_output_features_

In [None]:
# Show the `powers_` attribute of the fitted polynomial step.
model_gcv.best_estimator_.named_steps['poly'].powers_

In [None]:
# Evaluate the tuned model on the held-out test set.
y_predicted = model_gcv.predict(X_test)
residuals = y_predicted - y_test

print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The value is the mean of the squared residuals, so it is labelled as MSE
# rather than as a residual *sum* of squares.
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
print("R2-score: %.2f" % r2_score(y_true=y_test, y_pred=y_predicted))

## Cross-validation

[sklearn.model_selection.cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the best pipeline. No `scoring` is passed, so
# the estimator's own `score` method is used; for a pipeline ending in
# LinearRegression that is R^2, not accuracy.
# NOTE(review): the folds are built from the test split (X_test, y_test), so
# each clone is re-fitted on part of the test data — confirm this is intended
# rather than cross-validating on the training data.
scores = cross_val_score(model_gcv.best_estimator_, X_test, y_test, cv=5)
print(f"scores: {scores}")
print("%0.2f R2 score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))