In [11]:
import time

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the Boston housing data directly as a (features, target) pair.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases - confirm the pinned version.
X, y = load_boston(return_X_y=True)

In [4]:
# Hold out 30% of the samples for validation. The fixed random_state makes the
# split itself reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

#### Создаю список словарей с параметрами наших моделей. Параметры подбирал методом «научного тыка»: читал документацию, выбирал два интересных параметра и добавлял разброс вокруг значения по умолчанию.

In [12]:
# Seed passed to every estimator below that accepts `random_state`, so that
# repeated runs of the grid search fit identical models.
RANDOM_STATE = 42

# One entry per candidate regressor:
#   'name'      - label used in printed reports
#   'regressor' - unfitted estimator, used as the 'fit' step of the pipeline
#   'params'    - GridSearchCV grid; the 'fit__' prefix routes a key to that step
#   'grid'      - placeholder for the fitted GridSearchCV (None until trained)
models = [
    {
        'name': 'PLSRegression',
        'regressor': PLSRegression(),
        'params': {'fit__n_components': list(range(2, 10)), 'fit__scale': [True, False]},
        'grid': None,
    },
    {
        'name': 'RandomForestRegressor',
        'regressor': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {'fit__n_estimators': list(range(8, 18)), 'fit__max_depth': list(range(5, 10))},
        'grid': None,
    },
    {
        'name': 'GradientBoostingRegressor',
        'regressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
        # NOTE(review): 'ls' is the legacy name of the squared-error loss; newer
        # scikit-learn releases call it 'squared_error' - update when upgrading.
        'params': {'fit__loss': ['ls', 'huber'], 'fit__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]},
        'grid': None,
    },
    {
        'name': 'LinearRegression',
        'regressor': LinearRegression(),
        'params': {'fit__fit_intercept': [True, False]},
        'grid': None,
    },
    {
        'name': 'ARDRegression',
        'regressor': ARDRegression(),
        'params': {'fit__alpha_1': [1e-6, 2e-6, 3e-6], 'fit__alpha_2': [1e-6, 2e-6, 3e-6]},
        'grid': None,
    },
    {
        'name': 'HuberRegressor',
        'regressor': HuberRegressor(),
        'params': {'fit__epsilon': [1.2, 1.3, 1.4, 1.5], 'fit__alpha': [0.0001, 0.0002, 0.0003, 0.0004]},
        'grid': None,
    },
    {
        'name': 'PassiveAggressiveRegressor',
        # validation_fraction is only consulted when early_stopping=True; with
        # the default (False) the second grid axis below would have no effect.
        'regressor': PassiveAggressiveRegressor(early_stopping=True, random_state=RANDOM_STATE),
        'params': {'fit__C': [1, 1.1, 1.2, 1.3, 2], 'fit__validation_fraction': [0.01, 0.1, 0.2, 0.3, 0.4]},
        'grid': None,
    },
    {
        'name': 'KNeighborsRegressor',
        'regressor': KNeighborsRegressor(),
        'params': {'fit__n_neighbors': list(range(3, 10)), 'fit__weights': ['uniform', 'distance']},
        'grid': None,
    },
    {
        'name': 'LinearSVR',
        'regressor': LinearSVR(random_state=RANDOM_STATE),
        'params': {'fit__epsilon': [0.05, 0.1, 0.2, 0.5, 1.0], 'fit__C': [1, 1.1, 1.2, 1.3, 2]},
        'grid': None,
    },
]

#### Обучаем модель из каждого словаря, добавив перед обучением шаг стандартизации данных (StandardScaler).

In [14]:
# Run one grid search per candidate regressor. Each regressor is wrapped in a
# pipeline behind a StandardScaler step, and the fitted search is stored back
# into the candidate's dict under 'grid'.
for model in models:
    # perf_counter() is a monotonic, high-resolution clock; unlike time.time()
    # the measured duration cannot be skewed by system clock adjustments.
    start = time.perf_counter()

    pipe = Pipeline([('scale', StandardScaler()), ('fit', model['regressor'])])
    # 10-fold cross-validation, candidates ranked by R^2.
    grid = GridSearchCV(pipe, model['params'], scoring='r2', cv=10)
    grid.fit(X_train, y_train)
    model['grid'] = grid

    # Report the elapsed time in seconds, rounded to milliseconds.
    print('{} - {} seconds'.format(model['name'], round(time.perf_counter() - start, 3)))

PLSRegression - 2.602 seconds
RandomForestRegressor - 43.919 seconds
GradientBoostingRegressor - 53.191 seconds
LinearRegression - 0.242 seconds
ARDRegression - 50.306 seconds
HuberRegressor - 5.678 seconds
PassiveAggressiveRegressor - 2.44 seconds
KNeighborsRegressor - 2.828 seconds
LinearSVR - 3.49 seconds


#### Выводим оценку для всех обученных моделей.

In [15]:
# For every fitted search, print the winning hyper-parameters, the best mean
# cross-validated score and the score of the refitted best pipeline on the
# held-out validation split.
for model in models:
    grid = model['grid']
    print('------------------------')
    print(model['name'])
    print(grid.best_params_)
    # Both numbers are R^2 (the search uses scoring='r2', and a regressor's
    # .score() returns R^2), so they are labelled as such rather than "accuracy".
    print("CV R^2:", grid.best_score_, "Validation R^2:", grid.best_estimator_.score(X_valid, y_valid))

------------------------
PLSRegression
{'fit__n_components': 8, 'fit__scale': True}
CV Accuracy: 0.685943392008014 Validation Accuracy: 0.7671408854324728
------------------------
RandomForestRegressor
{'fit__max_depth': 7, 'fit__n_estimators': 16}
CV Accuracy: 0.8458435384843955 Validation Accuracy: 0.8860880217187883
------------------------
GradientBoostingRegressor
{'fit__learning_rate': 0.1, 'fit__loss': 'ls'}
CV Accuracy: 0.8549642566056996 Validation Accuracy: 0.9194986195499074
------------------------
LinearRegression
{'fit__fit_intercept': True}
CV Accuracy: 0.684843696213451 Validation Accuracy: 0.7669868767472325
------------------------
ARDRegression
{'fit__alpha_1': 3e-06, 'fit__alpha_2': 1e-06}
CV Accuracy: 0.6834654747827424 Validation Accuracy: 0.7678593983154912
------------------------
HuberRegressor
{'fit__alpha': 0.0004, 'fit__epsilon': 1.5}
CV Accuracy: 0.6764684560861053 Validation Accuracy: 0.7333557319939314
------------------------
PassiveAggressiveRegressor
{

#### Интересное наблюдение: почти все модели показали на валидационной выборке лучшие результаты, чем на кросс-валидации. Линейные модели имеют примерно одинаковое качество (R² ≈ 0,7). Но ансамблевые алгоритмы на основе деревьев решений (RandomForestRegressor, GradientBoostingRegressor) показали более высокие результаты.