
#### _ML продвинутые методы / ДЗ №1 / Практическая часть / Задача 2_

## 2. _LinearRegression vs Tree-based algorithms_ 

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
from sklearn.datasets import load_boston

In [4]:
data = load_boston()
X = data.data
y = data.target

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=241)

X_train.shape, X_test.shape

((406, 13), (100, 13))

In [6]:
cv = KFold(X_train.shape[0], shuffle=True, random_state=241)

cv.get_n_splits(X_train)

406

In [7]:
def print_metrics(regressor_name, dataset_name, y_test, y_pred):
    print('[{}] MSE ({}): {:.3f}'.format(regressor_name, dataset_name, mean_squared_error(y_test, y_pred)))
    print('[{}] R2 ({}): {:.3f}'.format(regressor_name, dataset_name, r2_score(y_test, y_pred)))
    print('*' * 50)

## 2.1 _DecisionTreeRegressor_ 

In [8]:
%%time

dtr = DecisionTreeRegressor()

dtr_grid_params = {
    'max_features' : [None, 'log2', 'sqrt'],
    'max_depth' : [2, 4, 6, 8, 10, 20, 50],
    'min_samples_leaf' : np.arange(1,20,1)
}

gs_dtr = GridSearchCV(dtr, dtr_grid_params, scoring='neg_mean_squared_error', cv=cv, iid=True, n_jobs=-1)
gs_dtr.fit(X_train, y_train)

best_dtr = gs_dtr.best_estimator_

Wall time: 2min 5s


In [9]:
best_dtr

DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=4,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [10]:
-gs_dtr.best_score_

18.80428547778836

In [11]:
print_metrics('DecisionTreeRegressor', 'train', y_train, best_dtr.predict(X_train))
print_metrics('DecisionTreeRegressor', 'test', y_test, best_dtr.predict(X_test))

[DecisionTreeRegressor] MSE (train): 5.509
[DecisionTreeRegressor] R2 (train): 0.935
**************************************************
[DecisionTreeRegressor] MSE (test): 21.405
[DecisionTreeRegressor] R2 (test): 0.737
**************************************************


## 2.2 _RandomForestRegressor_ 

Параметры для решающего дерева возьмём равными найденным выше для `DecisionTreeRegressor`.

In [12]:
%%time

rfr_params = {k:v for (k,v) in best_dtr.get_params().items() 
                  if k in ['max_depth', 'max_features', 'min_samples_leaf', 'min_samples_split']}

rfr = RandomForestRegressor(n_estimators=200, **rfr_params)

print( -cross_val_score(rfr, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean() )

14.078186417556498
Wall time: 2min 2s


In [13]:
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
print_metrics('RandomForestRegressor', 'train', y_train, rfr.predict(X_train))
print_metrics('RandomForestRegressor', 'test', y_test, rfr.predict(X_test))

[RandomForestRegressor] MSE (train): 5.798
[RandomForestRegressor] R2 (train): 0.932
**************************************************
[RandomForestRegressor] MSE (test): 13.249
[RandomForestRegressor] R2 (test): 0.837
**************************************************


## 2.3 _LinearRegression_ 

In [15]:
%%time

lr = LinearRegression(n_jobs=-1)

print(-cross_val_score(lr, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean())

23.103267524224805
Wall time: 383 ms


In [16]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [17]:
print_metrics('LinearRegression', 'train', y_train, lr.predict(X_train))
print_metrics('LinearRegression', 'test', y_test, lr.predict(X_test))

[LinearRegression] MSE (train): 20.864
[LinearRegression] R2 (train): 0.754
**************************************************
[LinearRegression] MSE (test): 27.780
[LinearRegression] R2 (test): 0.658
**************************************************


Если судить по **score** то качество предсказания линейной регрессии получается сопоставимым с `DecisionTreeRegressor`, но, конечно, проигрывает `RandomForestRegressor`.