
#### _ML продвинутые методы / ДЗ №1 / Практическая часть / Задача 3_

## 3. _Boosting vs Bagging_ 

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.datasets import load_diabetes

# Load the diabetes regression dataset that ships with scikit-learn.
data = load_diabetes()

# Feature matrix and target vector used throughout the notebook.
X, y = data.data, data.target

# Cell output: shapes of the feature matrix and of the target.
X.shape, y.shape

In [None]:
# Hold out 20% of the samples for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=223)

# Sanity check: sizes of the train and test feature matrices.
print(X_train.shape, X_test.shape)

### 3.1 _DecisionTreeRegressor_ 

In [None]:
%%time

# Base estimator. With max_features < 1.0 the tree samples candidate features
# at random on every split, so a fixed seed is needed for reproducible results.
dtr = DecisionTreeRegressor(random_state=223)

# Hyper-parameter grid explored exhaustively by the search below.
dtr_grid_params = {
    'max_depth' : list(range(2, 10)),
    'max_features' : [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0],
    'min_samples_leaf' : list(range(1, 15))
}

# 4-fold CV on the training split only; the score is negated MSE (higher is better).
# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError, so it is not passed here.
gs_dtr = GridSearchCV(dtr, dtr_grid_params, scoring='neg_mean_squared_error', cv=4, n_jobs=-1)
gs_dtr.fit(X_train, y_train)

# Estimator refit on the whole training split with the best parameter combination.
best_dtr = gs_dtr.best_estimator_

In [None]:
# Show the tuned tree (its repr lists the hyper-parameters chosen by the grid search).
best_dtr

In [None]:
# The search was scored with 'neg_mean_squared_error'; negate to read it as a plain MSE.
-gs_dtr.best_score_

In [None]:
def print_scores(regr_name, dataset_name, y_test, y_pred):
    """Print R2 and MSE of a set of predictions, followed by a separator.

    Parameters
    ----------
    regr_name : str
        Label of the model, shown in square brackets.
    dataset_name : str
        Label of the data split, shown in parentheses.
    y_test : array-like
        Ground-truth targets (passed as the first argument to both metrics).
    y_pred : array-like
        Predicted targets.

    Returns
    -------
    None
        The function only writes to stdout.
    """
    r2 = r2_score(y_test, y_pred)
    print('[{}] R2 ({}): {:.3f}'.format(regr_name, dataset_name, r2))

    mse = mean_squared_error(y_test, y_pred)
    print('[{}] MSE ({}): {:.3f}'.format(regr_name, dataset_name, mse))

    # Blank line plus a 50-character rule to separate consecutive reports.
    print()
    print('*' * 50)

In [None]:
# Predictions of the tuned decision tree on the hold-out test split.
y_dtr_pred = best_dtr.predict(X_test)

In [None]:
# Report R2 / MSE of the tuned tree on the training split and on the hold-out split.
print_scores('DecisionTreeRegressor', 'train', y_train, best_dtr.predict(X_train))
print_scores('DecisionTreeRegressor', 'test', y_test, y_dtr_pred)

### 3.2 _RandomForestRegressor_

Параметры для решающего дерева возьмём равными найденным выше для `DecisionTreeRegressor`.

In [None]:
%%time

# Reuse the tree-level hyper-parameters of the tuned DecisionTreeRegressor.
rfr_params = {k: v for (k, v) in best_dtr.get_params().items()
              if k in ['max_depth', 'max_features', 'min_samples_leaf', 'min_samples_split']}

# A forest is stochastic (bootstrap samples and random feature subsets), so the
# seed is fixed to make the CV score and the later test metrics reproducible.
rfr = RandomForestRegressor(n_estimators=50, random_state=223, **rfr_params)

# Mean 4-fold CV MSE on the training split (the scorer returns negated MSE).
print( -cross_val_score(rfr, X_train, y_train, cv=4, scoring='neg_mean_squared_error').mean() )

In [None]:
# Fit the forest on the training split; `rfr` has not been fitted by any earlier cell.
rfr.fit(X_train, y_train)

In [None]:
# Predictions of the fitted random forest on the hold-out test split.
y_rfr_pred = rfr.predict(X_test) 

In [None]:
# Report R2 / MSE of the random forest on the training split and on the hold-out split.
print_scores('RandomForestRegressor', 'train', y_train, rfr.predict(X_train))
print_scores('RandomForestRegressor', 'test', y_test, y_rfr_pred)

#### 3.2.1 _Number of trees in RandomForest_

In [None]:
from tqdm import tqdm

In [None]:
# Forest sizes to evaluate: 5, 15, ..., 395.
n_trees = np.arange(5,400,10)
scores = []

# Tree-level hyper-parameters copied from the forest configured in section 3.2.
rfr_params = {k: v for (k, v) in rfr.get_params().items()
              if k in ['max_depth', 'max_features', 'min_samples_leaf',
                       'min_samples_split', 'min_weight_fraction_leaf']}

# NOTE(review): CV here runs on the full X, y (hold-out rows included), whereas
# section 3.2 cross-validates on X_train only -- confirm this is intended.
for n in tqdm(n_trees):
    # A separate name keeps the fitted `rfr` from section 3.2 intact, so its
    # predict / report cells can still be re-run after this sweep.
    # The fixed seed makes the curve reproducible between runs.
    rfr_n = RandomForestRegressor(n_estimators=n, n_jobs=-1, random_state=223, **rfr_params)
    scores.append(-cross_val_score(rfr_n, X, y, cv=4, scoring='neg_mean_squared_error').mean())

In [None]:
# CV MSE as a function of the forest size; points are evenly spaced by index
# and labelled with the corresponding number of trees.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(scores)

tick_positions = np.arange(len(n_trees))
ax.set_xticks(tick_positions)
ax.set_xticklabels(n_trees, rotation='vertical')

ax.grid(axis='y')
ax.set_ylabel('MSE')
ax.set_xlabel('n_trees')
ax.set_title('RandomForestRegressor');

### 3.3 _GradientBoostingRegressor_

In [None]:
%%time

# Small ensemble (10 stages) so the exhaustive search stays affordable.
# With max_features < 1.0 each split samples features at random, hence the fixed seed.
gbr = GradientBoostingRegressor(n_estimators=10, random_state=223)

# Hyper-parameter grid explored exhaustively by the search below.
gbr_grid_params = {
    'max_depth' : list(range(1, 6)),
    'learning_rate' : np.linspace(0.1, 1, 10),
    'max_features' : [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0],
    'min_samples_leaf' : list(range(1, 10))
}

# 4-fold CV on the training split only; the score is negated MSE (higher is better).
# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError, so it is not passed here.
gs_gbr = GridSearchCV(gbr, gbr_grid_params, scoring='neg_mean_squared_error', cv=4, n_jobs=-1)
gs_gbr.fit(X_train, y_train)

# Estimator refit on the whole training split with the best parameter combination.
best_gbr = gs_gbr.best_estimator_

In [None]:
# Show the tuned booster (its repr lists the hyper-parameters chosen by the grid search).
best_gbr

In [None]:
# Predictions of the tuned gradient boosting model on the hold-out test split.
y_gbr_pred = best_gbr.predict(X_test) 

In [None]:
# Report R2 / MSE of the tuned booster on the training split and on the hold-out split.
print_scores('GradientBoostingRegressor', 'train', y_train, best_gbr.predict(X_train))
print_scores('GradientBoostingRegressor', 'test', y_test, y_gbr_pred)

#### 3.3.1 _Number of trees in GradientBoosting_

In [None]:
# Ensemble sizes to evaluate: 1, then 5, 10, ..., 200.
n_trees = np.concatenate((np.array([1]), np.arange(5, 205, 5)))
scores = []

# Constructor parameters of the tuned booster, read once outside the loop.
gbr_params = best_gbr.get_params(deep=False)

# NOTE(review): CV here runs on the full X, y (hold-out rows included), whereas
# the grid search used X_train only -- confirm this is intended.
for n in tqdm(n_trees):
    # Build a fresh, unfitted estimator for every size instead of assigning to
    # best_gbr.n_estimators: mutating the fitted model would leave it reporting
    # 200 estimators while it actually holds the stages it was fitted with.
    gbr_params['n_estimators'] = n
    gbr_n = GradientBoostingRegressor(**gbr_params)
    scores.append(-cross_val_score(gbr_n, X, y, cv=4, scoring='neg_mean_squared_error').mean())

In [None]:
# CV MSE as a function of the number of boosting stages; points are evenly
# spaced by index and labelled with the corresponding number of trees.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(scores)

tick_positions = np.arange(len(n_trees))
ax.set_xticks(tick_positions)
ax.set_xticklabels(n_trees, rotation='vertical')

ax.grid(axis='y')
ax.set_ylabel('MSE')
ax.set_xlabel('n_trees')
ax.set_title('GradientBoostingRegressor');