## Практика по оптимизации гиперпараметров

### Базовая оптимизация

In [57]:
#импорт библиотек
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации
import hyperopt
import optuna

from sklearn import linear_model #линейные моделиё
from sklearn import tree #деревья решений
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики
from sklearn import preprocessing #предобработка
from sklearn.model_selection import train_test_split #сплитование выборки
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

%matplotlib inline
plt.style.use('seaborn')

  plt.style.use('seaborn')


### Описание задачи

Необходимо предсказать биологический ответ молекул (столбец 'Activity') по их химическому составу (столбцы D1-D1776).

Данные представлены в формате CSV.  Каждая строка представляет молекулу. 

+ Первый столбец Activity содержит экспериментальные данные, описывающие фактический биологический ответ [0, 1]; 
+ Остальные столбцы D1-D1776 представляют собой молекулярные дескрипторы — это вычисляемые свойства, которые могут фиксировать некоторые характеристики молекулы, например размер, форму или состав элементов.
Предварительная обработка не требуется, данные уже закодированы и нормализованы.

В качестве метрики будем использовать F1-score.

Необходимо обучить две модели: логистическую регрессию и случайный лес. Далее нужно сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. Важно использовать все четыре метода (GridSeachCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

### Знакомство с данными и их исследование

In [58]:
data = pd.read_csv('data/_train_sem09.csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


Создаем матрицу наблюдений $X$ и вектор ответов $y$

In [59]:
X = data.drop(['Activity'], axis=1)
y = data['Activity']

Разделяем выборку на тренировочную и тестовую в соотношении 80/20. Для сохранения соотношений целевого признака используем параметр stratify (стратифицированное разбиение). 

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 1, test_size = 0.2)

### Оптимизация гиперпараметров модели

#### **Логистическая регрессия**

Зафиксируем только метрики, которые были получены без дополнительной настройки, т.е со значениями гиперпараметров, установленных по умолчанию:

In [61]:
#Создаем объект класса логистическая регрессия
log_reg = linear_model.LogisticRegression(max_iter = 50)
#Обучаем модель, минимизируя logloss
log_reg.fit(X_train, y_train)

y_test_pred = log_reg.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.79


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### **Случайный лес**

In [62]:
#Создаем объект класса случайный лес
rf = ensemble.RandomForestClassifier(random_state=42)

#Обучаем модель
rf.fit(X_train, y_train)
#Выводим значения метрики 
y_test_pred = rf.predict(X_test)
print('f1_score на тестовом наборе случ лес: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе случ лес: 0.81


Модели с значением гиперпараметров по умолчанию показали следующие значения метрики f1:
- логистическая регрессия - 0,79
- случайный лес - 0,81

### <center> **GridSearchCV**

#### **Логистическая регрессия**

In [63]:
param_grid = {'penalty': ['l2', 'none'] ,#тип регурялизации
              'solver': ['lbfgs', 'saga'], #алгоритм оптимизации
              }
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42, #генератор случайных чисел
        max_iter=50 #количество итераций на сходимость
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search.fit(X_train, y_train) 

y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

CPU times: total: 2.33 s
Wall time: 13 s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'penalty': 'l2', 'solver': 'saga'}




Значение метрики модели логистической регрессии немного ухудшилось(0,78), вероятно, за 50 итераций модель GridSearchCV не смогла подобрать параметры, хотя бы сопоставимые по эффективности с параметрами по умолчанию.

#### **Случайный лес**

In [64]:
param_grid = {'n_estimators': list(range(80, 200, 30)),
              'min_samples_leaf': [5],
              'max_depth': list(np.linspace(20, 40, 10, dtype=int))
              }

grid_search = GridSearchCV(
        estimator=ensemble.RandomForestClassifier(
        random_state=42
    ), 
    param_grid=param_grid, 
    cv=5, # количество фолдов при кросс-валидации
    n_jobs = -1
)  
%time grid_search.fit(X_train, y_train)
y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

CPU times: total: 3.7 s
Wall time: 1min 20s
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 140}


Модель случайного леса с гиперпараметрами, подобранными с помощью GridSearchCV, немного улучшила метрику f1, показав на тестовом наборе значение 0,82

### <center> **RandomizedSearchCV**

#### **Логистическая регрессия**

In [65]:
param_distributions = {'penalty': ['l2', 'none'] ,
              'solver': ['lbfgs', 'saga'],
}
            
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42), 
    param_distributions=param_distributions, 
    cv=5, # количество фолдов при кросс-валидации
    n_iter = 50, 
    n_jobs = -1
)  
%time random_search.fit(X_train, y_train) 
y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))



CPU times: total: 4.3 s
Wall time: 15.8 s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'solver': 'saga', 'penalty': 'l2'}




Подбор гиперпараметров с помощью RandomizedSearchCV в модели логистической регрессии дал такие же результаты(0,78), что и с подбором по сетке.

#### **Случайный лес**

In [72]:
param_distributions = {'n_estimators': list(range(80, 200, 30)),
              'min_samples_leaf': [5],
              'max_depth': list(np.linspace(20, 40, 10, dtype=int))
}
            
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_distributions=param_distributions, 
    cv=5, # количество фолдов при кросс-валидации
    n_iter = 50, 
    n_jobs = -1
)  
%time random_search_forest.fit(X_train, y_train) 
y_test_pred = random_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))



CPU times: total: 3.72 s
Wall time: 1min 31s
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'n_estimators': 140, 'min_samples_leaf': 5, 'max_depth': 20}


Подбор гиперпараметров с помощью RandomizedSearchCV в модели случайного леса дал такие же результаты(0,82), что и с подбором по сетке.

### <center> Hyperopt

#### **Логистическая регрессия**

In [74]:
space={'penalty': hp.choice('penalty', ['l2', 'none']),
              'solver': hp.choice('solver', ['lbfgs', 'sag']),
              'C': hp.uniform('C', 0.01, 1)
}
random_state = 42
def hyperopt_lr(space, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    space = {'penalty': space['penalty'], 
             'solver': space['solver'], 
             'C': float(space['C'])
    }
  
    # используем эту комбинацию для построения модели
    model = linear_model.LogisticRegression(**space, random_state=random_state)

    # обучаем модель
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()
    return {'loss': -score, 'params': space, 'status': STATUS_OK}
 
# начинаем подбор гиперпараметров

trials = Trials()

best=fmin(hyperopt_lr, 
          space=space, 
          algo=tpe.suggest,
          max_evals=50,
          trials=trials,
          rstate=np.random.default_rng(random_state)
         )
print("Наилучшие значения гиперпараметров {}".format(trials.best_trial['result']['params']))

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



  2%|▏         | 1/50 [00:04<03:27,  4.24s/trial, best loss: -0.7475536741581407]




  4%|▍         | 2/50 [00:13<05:47,  7.24s/trial, best loss: -0.7737602789410036]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



  6%|▌         | 3/50 [00:16<03:59,  5.09s/trial, best loss: -0.7829442049757983]




  8%|▊         | 4/50 [00:24<04:57,  6.46s/trial, best loss: -0.7829442049757983]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 10%|█         | 5/50 [00:27<03:44,  4.99s/trial, best loss: -0.7829442049757983]






 12%|█▏        | 6/50 [00:36<04:42,  6.42s/trial, best loss: -0.7829442049757983]






 14%|█▍        | 7/50 [00:44<05:06,  7.12s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 16%|█▌        | 8/50 [00:47<03:56,  5.62s/trial, best loss: -0.7829442049757983]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 18%|█▊        | 9/50 [00:49<03:09,  4.63s/trial, best loss: -0.7829442049757983]




 20%|██        | 10/50 [00:58<03:50,  5.77s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 22%|██▏       | 11/50 [01:00<03:06,  4.79s/trial, best loss: -0.7829442049757983]






 24%|██▍       | 12/50 [01:08<03:40,  5.80s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 26%|██▌       | 13/50 [01:10<02:54,  4.73s/trial, best loss: -0.7829442049757983]




 28%|██▊       | 14/50 [01:19<03:30,  5.84s/trial, best loss: -0.7829442049757983]




 30%|███       | 15/50 [01:27<03:50,  6.59s/trial, best loss: -0.7829442049757983]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 32%|███▏      | 16/50 [01:30<03:00,  5.32s/trial, best loss: -0.7829442049757983]






 34%|███▍      | 17/50 [01:38<03:24,  6.19s/trial, best loss: -0.7829442049757983]




 36%|███▌      | 18/50 [01:46<03:39,  6.85s/trial, best loss: -0.7829442049757983]




 38%|███▊      | 19/50 [01:54<03:45,  7.27s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 40%|████      | 20/50 [01:57<02:53,  5.78s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 42%|████▏     | 21/50 [01:59<02:17,  4.76s/trial, best loss: -0.7829442049757983]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 46%|████▌     | 23/50 [02:03<01:31,  3.38s/trial, best loss: -0.7834946820433192]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 52%|█████▏    | 26/50 [02:09<00:59,  2.46s/trial, best loss: -0.7836127308271907]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 56%|█████▌    | 28/50 [02:14<00:49,  2.24s/trial, best loss: -0.7836127308271907]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 58%|█████▊    | 29/50 [02:16<00:47,  2.24s/trial, best loss: -0.7836127308271907]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 60%|██████    | 30/50 [02:18<00:45,  2.26s/trial, best loss: -0.7836127308271907]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 62%|██████▏   | 31/50 [02:20<00:43,  2.27s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 64%|██████▍   | 32/50 [02:23<00:41,  2.28s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 66%|██████▌   | 33/50 [02:25<00:39,  2.33s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 68%|██████▊   | 34/50 [02:27<00:37,  2.32s/trial, best loss: -0.7837412865947291]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 70%|███████   | 35/50 [02:30<00:34,  2.31s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 72%|███████▏  | 36/50 [02:32<00:32,  2.34s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 74%|███████▍  | 37/50 [02:35<00:31,  2.42s/trial, best loss: -0.7837412865947291]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 76%|███████▌  | 38/50 [02:37<00:29,  2.47s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 78%|███████▊  | 39/50 [02:40<00:27,  2.47s/trial, best loss: -0.7837412865947291]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 80%|████████  | 40/50 [02:42<00:23,  2.39s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 82%|████████▏ | 41/50 [02:44<00:21,  2.36s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 84%|████████▍ | 42/50 [02:47<00:19,  2.41s/trial, best loss: -0.7837412865947291]






 86%|████████▌ | 43/50 [02:55<00:28,  4.13s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 88%|████████▊ | 44/50 [02:57<00:21,  3.57s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 90%|█████████ | 45/50 [03:00<00:15,  3.19s/trial, best loss: -0.7837412865947291]






 92%|█████████▏| 46/50 [03:08<00:19,  4.75s/trial, best loss: -0.7837412865947291]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 94%|█████████▍| 47/50 [03:11<00:12,  4.11s/trial, best loss: -0.7837412865947291]




 96%|█████████▌| 48/50 [03:19<00:10,  5.42s/trial, best loss: -0.7837412865947291]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 98%|█████████▊| 49/50 [03:22<00:04,  4.54s/trial, best loss: -0.7837412865947291]




100%|██████████| 50/50 [03:30<00:00,  4.20s/trial, best loss: -0.7837412865947291]
Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.15442360012496834}


In [75]:
# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(
    random_state=random_state, 
    penalty=trials.best_trial['result']['params']['penalty'],
    solver=trials.best_trial['result']['params']['solver'],
    C=float(trials.best_trial['result']['params']['C'])
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.78


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Подбор гиперпараметров с помощью Hyperopt в модели логистической регрессии дал такие же результаты(0,78), что и ранее.

#### **Случайный лес**

In [76]:
space={'n_estimators': hp.quniform('n_estimators', 80, 200, 1),
       'max_depth' : hp.quniform('max_depth', 15, 40, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
      }
random_state = 42
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf'])
              }
  
    # используем эту комбинацию для построения модели
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    # обучаем модель
    model.fit(X, y)
    score = metrics.f1_score(y, model.predict(X))
    
    # кросс-валидация
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score
# начинаем подбор гиперпараметров

trials = Trials()

best=fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=50, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [05:34<00:00,  6.70s/trial, best loss: -0.81382291398683]  
Наилучшие значения гиперпараметров {'max_depth': 23.0, 'min_samples_leaf': 2.0, 'n_estimators': 196.0}


In [77]:
# рассчитаем метрику для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.82


Подбор гиперпараметров с помощью Hyperopt в модели случайного леса дал такие же результаты(0,82), что и с подбором по сетке.

## <center> Optuna

#### **Логистическая регрессия**

In [70]:
random_state = 42
def optuna_lr(trial):
  # задаем пространства поиска гиперпараметров
  penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
  solver = trial.suggest_categorical('solver', ['lbfgs', 'sag']) 
  C = trial.suggest_float('C',0.01,1)

  # создаем модель
  model = linear_model.LogisticRegression(penalty=penalty,
                                          solver=solver,
                                          C=C,
                                          random_state=random_state
  )
  # обучаем модель
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_test, model.predict(X_test))

  return score

# cоздаем объект исследования
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lr, n_trials=20)


# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(**study.best_params,random_state=random_state)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

[I 2023-08-11 14:11:11,631] A new study created in memory with name: LogisticRegression
[I 2023-08-11 14:11:15,344] Trial 0 finished with value: 0.7788461538461539 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.7465116026018345}. Best is trial 0 with value: 0.7788461538461539.
[I 2023-08-11 14:11:18,992] Trial 1 finished with value: 0.7788461538461539 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.1431410528263954}. Best is trial 0 with value: 0.7788461538461539.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2023-08-11 14:11:19,469] Trial 2 finished with value: 0.7545344619105199 and parameters: {'penalty': 'none', 'solver': 'lbfgs', 

f1_score на тестовом наборе: 0.80


Подбор гиперпараметров с помощью Optuna в модели логистической регрессии дал наилучшее значение метрики f1 - 0.80

#### **Случайный лес**

In [71]:
random_state = 42
def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
  max_depth = trial.suggest_int('max_depth', 10, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

  # создаем модель
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state
  )
  # обучаем модель
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_test, model.predict(X_test))

  return score
# cоздаем объект исследования
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params,random_state=random_state)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

[I 2023-08-11 14:11:37,357] A new study created in memory with name: RandomForestClassifier
[I 2023-08-11 14:11:41,967] Trial 0 finished with value: 0.8271752085816448 and parameters: {'n_estimators': 193, 'max_depth': 21, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8271752085816448.
[I 2023-08-11 14:11:45,648] Trial 1 finished with value: 0.824228028503563 and parameters: {'n_estimators': 196, 'max_depth': 22, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8271752085816448.
[I 2023-08-11 14:11:48,219] Trial 2 finished with value: 0.8152562574493444 and parameters: {'n_estimators': 170, 'max_depth': 30, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8271752085816448.
[I 2023-08-11 14:11:51,639] Trial 3 finished with value: 0.8275862068965516 and parameters: {'n_estimators': 182, 'max_depth': 30, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.8275862068965516.
[I 2023-08-11 14:11:54,270] Trial 4 finished with value: 0.8254716981132076 and parameters: {'n_

f1_score на тестовом наборе: 0.83


Подбор гиперпараметров с помощью Optuna в модели случайного леса дал наилучшее значение метрики f1 - 0.83