In [23]:
#загрузим основные библиотеки
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

import joblib

In [2]:
df_wine = pd.read_csv('data/Red.csv')

In [3]:
df_wine.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [4]:
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


Создание pipeline

In [7]:
ct = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']))
print(ct)

ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                 ['Region']),
                                ('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])


In [8]:
pipeline = Pipeline([('ct', ct), ('rf', RandomForestRegressor())])

In [9]:
X = df_wine[['Region', 'Country', 'Price']]
y = df_wine['Rating']

In [10]:
pipeline.fit(X, y)

In [11]:
df_wine_test = pd.read_csv('data/Red_test.csv')

In [12]:
X_test = df_wine_test[['Region', 'Country', 'Price']]
y_test = df_wine_test['Rating']

In [13]:
def rmse(y_hat, y):
    return mean_squared_error(y_hat, y, squared = False)

In [14]:
y_pred = pipeline.predict(X_test)
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по RSME: 0.0779


In [15]:
joblib.dump(pipeline, 'pipeline_vine.pkl')

['pipeline_vine.pkl']

In [16]:
pipeline.get_params()

{'memory': None,
 'steps': [('ct',
   ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                    ['Region']),
                                   ('standardscaler', StandardScaler(), ['Price']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['Country'])])),
  ('rf', RandomForestRegressor())],
 'verbose': False,
 'ct': ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                  ['Region']),
                                 ('standardscaler', StandardScaler(), ['Price']),
                                 ('onehotencoder', OneHotEncoder(),
                                  ['Country'])]),
 'rf': RandomForestRegressor(),
 'ct__n_jobs': None,
 'ct__remainder': 'drop',
 'ct__sparse_threshold': 0.3,
 'ct__transformer_weights': None,
 'ct__transformers': [('ordinalencoder', OrdinalEncoder(), ['Region']),
  ('standardscaler', StandardScaler(), ['Pri

Новые параметры

In [17]:
pipeline.set_params(rf__n_estimators=200)

In [18]:
pipeline.fit(X, y)

In [19]:
y_pred = pipeline.predict(X_test)
print(f'Качество по RSME: {round(rmse(y_test, y_pred),4)}')

Качество по RSME: 0.0782


Добавление stacking

In [25]:
# Создаем список кортежей вида: (наименование модели, модель)
estimators = [
    ('lr', RidgeCV()),
    ('dt',  DecisionTreeRegressor(random_state=42))
]

# Создаем объект класса стекинг
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10,
                                          random_state=42)
)

In [26]:
mod_pipeline = Pipeline([('ct', ct), ('sr', reg)])

In [27]:
mod_pipeline.fit(X, y)

In [29]:
y_pred = mod_pipeline.predict(X_test)
print(f'Качество по RSME: {round(rmse(y_test, y_pred),2)}')

Качество по RSME: 0.18
