<a href="https://colab.research.google.com/github/psaw/hse-ai24-ml/blob/main/Boostings_screencast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Загрузка данных и импорт библиотек

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import r2_score

In [None]:
# Seed for the notebook's randomised steps (e.g. the train/test split),
# so that reruns produce the same partition.
RANDOM_STATE = 42

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset; as_frame=True yields pandas objects.
data = fetch_california_housing(as_frame=True)

# Feature table and regression target used by every model below.
X, y = data.data, data.target

In [None]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## Сравнение моделей с гиперпараметрами по умолчанию

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline: scikit-learn gradient boosting with default hyperparameters.
# The seed is fixed explicitly: without random_state the estimator draws from
# the global NumPy RNG, so the score below could change between runs.
gbm = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Mean R^2 over 3 cross-validation folds on the full dataset.
cross_val_score(gbm, X, y, cv=3, scoring='r2').mean()

0.680035098017535

In [None]:
!pip install xgboost -q

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor left at its library defaults.
xgb = XGBRegressor()

# Per-fold R^2 from 3-fold cross-validation on the full dataset, then averaged.
xgb_cv_r2 = cross_val_score(estimator=xgb, X=X, y=y, scoring='r2', cv=3)
xgb_cv_r2.mean()

0.6600054075406734

In [None]:
!pip install catboost -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor with default hyperparameters; verbose=0 keeps the
# training log out of the cell output.
cb = CatBoostRegressor(verbose=0)

# Per-fold R^2 from 3-fold cross-validation on the full dataset, then averaged.
cb_cv_r2 = cross_val_score(estimator=cb, X=X, y=y, scoring='r2', cv=3)
cb_cv_r2.mean()

0.7142210654701769

In [None]:
!pip install lightgbm -q

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor left at its library defaults.
lgbm = LGBMRegressor()

# Per-fold R^2 from 3-fold cross-validation on the full dataset, then averaged.
lgbm_cv_r2 = cross_val_score(estimator=lgbm, X=X, y=y, scoring='r2', cv=3)
lgbm_cv_r2.mean()

0.7016238052098068

## Подбор гиперпараметров

Разобьем данные на тренировочную и тестовую части. На тренировочной части по кросс-валидации подберем гиперпараметры моделей, а затем проверим качество на тестовой части.

In [None]:
# Hold out a quarter of the rows for the final check; the seed keeps the
# partition identical between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

# Search grid shared by all models: only the tree depth is tuned.
params = {
    'max_depth': [2, 5, 8, 11],
}

In [None]:
%%time

# Exhaustive search over `params` for XGBoost, scored by R^2 with 3-fold CV.
gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='r2',
    cv=3,
    verbose=2,
)

# Only the training split takes part in the search.
gs_xgb.fit(Xtrain, ytrain)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................max_depth=2; total time=   3.2s
[CV] END ........................................max_depth=2; total time=   0.5s
[CV] END ........................................max_depth=2; total time=   2.2s
[CV] END ........................................max_depth=5; total time=   1.2s
[CV] END ........................................max_depth=5; total time=   1.2s
[CV] END ........................................max_depth=5; total time=   1.2s
[CV] END ........................................max_depth=8; total time=   2.0s
[CV] END ........................................max_depth=8; total time=   2.0s
[CV] END ........................................max_depth=8; total time=   2.0s
[CV] END .......................................max_depth=11; total time=   4.7s
[CV] END .......................................max_depth=11; total time=   2.9s
[CV] END .......................................m

In [None]:
# Score the best XGBoost configuration found by the search on the held-out rows.
best_xgb = gs_xgb.best_estimator_
pred_xgb = best_xgb.predict(Xtest)

r2_score(y_true=ytest, y_pred=pred_xgb)

0.8291256023699493

In [None]:
%%time

# Tune max_depth for CatBoost: grid from `params`, 3-fold CV, R^2 scoring.
gs_cb = GridSearchCV(cb, params, cv=3, scoring='r2', verbose=2)

# Fit on the training split only. Searching (and refitting the best model) on
# the full X, y would let the model see the rows in Xtest, which makes the
# hold-out R^2 computed afterwards optimistically biased.
gs_cb.fit(Xtrain, ytrain)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................max_depth=2; total time=   1.8s
[CV] END ........................................max_depth=2; total time=   1.7s
[CV] END ........................................max_depth=2; total time=   3.5s
[CV] END ........................................max_depth=5; total time=   3.0s
[CV] END ........................................max_depth=5; total time=   3.0s
[CV] END ........................................max_depth=5; total time=   3.0s
[CV] END ........................................max_depth=8; total time=  10.3s
[CV] END ........................................max_depth=8; total time=  10.1s
[CV] END ........................................max_depth=8; total time=  10.3s
[CV] END .......................................max_depth=11; total time=  51.1s
[CV] END .......................................max_depth=11; total time=  49.7s
[CV] END .......................................m

In [None]:
# Score the best CatBoost configuration found by the search on the held-out rows.
best_cb = gs_cb.best_estimator_
pred_cb = best_cb.predict(Xtest)

r2_score(y_true=ytest, y_pred=pred_cb)

0.8911533719179447

In [None]:
%%time

# Tune max_depth for LightGBM: grid from `params`, 3-fold CV, R^2 scoring.
gs_lgbm = GridSearchCV(lgbm, params, cv=3, scoring='r2', verbose=2)

# Fit on the training split only. Searching (and refitting the best model) on
# the full X, y would let the model see the rows in Xtest, which makes the
# hold-out R^2 computed afterwards optimistically biased.
gs_lgbm.fit(Xtrain, ytrain)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................max_depth=2; total time=   0.1s
[CV] END ........................................max_depth=2; total time=   0.1s
[CV] END ........................................max_depth=2; total time=   0.1s
[CV] END ........................................max_depth=5; total time=   0.2s
[CV] END ........................................max_depth=5; total time=   0.2s
[CV] END ........................................max_depth=5; total time=   0.2s
[CV] END ........................................max_depth=8; total time=   0.3s
[CV] END ........................................max_depth=8; total time=   0.3s
[CV] END ........................................max_depth=8; total time=   0.3s
[CV] END .......................................max_depth=11; total time=   1.2s
[CV] END .......................................max_depth=11; total time=   1.6s
[CV] END .......................................m

In [None]:
# Score the best LightGBM configuration found by the search on the held-out rows.
best_lgbm = gs_lgbm.best_estimator_
pred_lgbm = best_lgbm.predict(Xtest)

r2_score(y_true=ytest, y_pred=pred_lgbm)

0.876891981387784

Мы видим, что даже на маленьком датасете и при подборе одного гиперпараметра приходится подождать результатов. А если датасет больше? И гиперпараметров много, и их для достижения оптимального результата нужно подбирать одновременно!

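При этом подбор гиперпараметров сильно улучшает качество моделей! Учтите, однако, что числа из двух разделов получены по-разному: в первом — кросс-валидация без перемешивания на всех данных, во втором — отложенная тестовая выборка после случайного разбиения, поэтому часть разницы может объясняться способом оценки, а не только подбором гиперпараметров.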

Что же делать, чтобы не ждать вечность, пока ищутся гиперпараметры? Узнаете в следующем уроке :)