In [1]:
#импорт библиотек
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import linear_model #линейные моделиё
from sklearn import tree #деревья решений
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики
from sklearn.model_selection import train_test_split #сплитование выборки

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('data/_train_sem09.csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X = data.drop(['Activity'], axis=1)
y = data['Activity']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 1, test_size = 0.2)

# Логичиская регрессия

In [None]:
random_state = 42

In [5]:
#создадим и обучим логистическую регрессию без подбора гиперпараметров
log_reg = linear_model.LogisticRegression(random_state=random_state, max_iter=1000)
log_reg.fit(X_train, y_train)
y_test_pred = log_reg.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


## GridSearchCV

In [6]:
param_grid = [
              {'penalty': ['l2', 'none'] ,
              'solver': ['lbfgs', 'sag'],
               'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]},
              
              {'penalty': ['l1', 'l2'] ,
              'solver': ['liblinear', 'saga'],
               'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
]
grid_search_lg = GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=random_state, max_iter=8000), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search_lg.fit(X_train, y_train) 

y_test_pred = grid_search_lg.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f'Наилучшие значения гиперпараметров: {grid_search_lg.best_params_}')

CPU times: total: 2.03 s
Wall time: 1h 9min 56s
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


## RandomizedSearchCV

In [7]:
param_grid = {'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
               'C': list(np.linspace(0.01, 1, 10, dtype=float))},
            
random_search_lg = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=random_state, max_iter=8000), 
    param_distributions=param_grid, 
    cv=5, 
    n_iter = 30, 
    n_jobs = -1
)  
%time random_search_lg.fit(X_train, y_train) 
y_test_pred = random_search_lg.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f'Наилучшие значения гиперпараметров: {random_search_lg.best_params_}')

CPU times: total: 1.56 s
Wall time: 52min 13s
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.12}


## Hyperopt

In [8]:
space={'penalty': hp.choice('penalty', ['l2', 'none']),
       'solver' : hp.choice('solver', ['saga', 'sag', 'lbfgs'])
      }  

In [9]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {'penalty': params['penalty'], 
              'solver': params['solver']
              }
  
    model = linear_model.LogisticRegression(**params, random_state=random_state, max_iter=8000)

    #обучаем модель с помощью кросс-валидации
    #применяем cross validation с тем же количеством фолдов
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    #минимизируем метрику
    return -score

In [19]:
#начинаем подбор гиперпараметров
trials = Trials()

best=fmin(hyperopt_rf,
          space=space,
          algo=tpe.suggest,
          max_evals=30,
          trials=trials,
          rstate=np.random.default_rng(random_state)
         )
print(f'Наилучшие значения гиперпараметров {best}')

100%|██████████| 30/30 [49:42<00:00, 99.42s/trial, best loss: -0.7700109914680013]  
Наилучшие значения гиперпараметров {'penalty': 0, 'solver': 1}


In [20]:
#рассчитаем f1_score для тестовой выборки

model = linear_model.LogisticRegression(penalty='l2', solver='sag', random_state=random_state, max_iter=1000)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print(f'f1_score на обучающем наборе: {metrics.f1_score(y_train, y_train_pred):.2f}')

y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на обучающем наборе: 0.89
f1_score на тестовом наборе: 0.78


## Optuna

In [21]:
def optuna_lr(trial):
  
  penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
  solver = trial.suggest_categorical('solver', ['lbfgs', 'sag'])
  C = trial.suggest_float('C', 0.01, 1)
  
  model = linear_model.LogisticRegression(penalty=penalty,
                                          solver=solver,
                                          C=C,
                                          random_state=random_state,
                                          max_iter=8000)

  #обучаем модель с помощью кросс-валидации
  #применяем cross validation с тем же количеством фолдов
  score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1", n_jobs=-1).mean()

  return score

In [22]:
%%time
#cоздаем объект исследования
#указываем, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
#ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lr, n_trials=30)

[I 2023-08-08 15:57:43,209] A new study created in memory with name: LogisticRegression
[I 2023-08-08 15:57:46,442] Trial 0 finished with value: 0.7822848471830538 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.12551137788607353}. Best is trial 0 with value: 0.7822848471830538.
[I 2023-08-08 16:05:49,932] Trial 1 finished with value: 0.7380334294277232 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.2848167907086396}. Best is trial 0 with value: 0.7822848471830538.
[I 2023-08-08 16:05:55,765] Trial 2 finished with value: 0.7713904580293399 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.6806158183220073}. Best is trial 0 with value: 0.7822848471830538.
[I 2023-08-08 16:13:49,568] Trial 3 finished with value: 0.7380334294277232 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.7902304412863672}. Best is trial 0 with value: 0.7822848471830538.
[I 2023-08-08 16:13:55,654] Trial 4 finished with value: 0.7719428419613066 and parameters: {'penal

CPU times: total: 1.05 s
Wall time: 32min 27s


In [23]:
#выводим результаты на обучающей выборке
print(f'Наилучшие значения гиперпараметров {study.best_params}')
print(f'f1_score на обучающем наборе: {study.best_value:.2f}')

Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'sag', 'C': 0.13280793650076905}
f1_score на обучающем наборе: 0.78


In [24]:
#рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(**study.best_params, random_state=random_state, max_iter=1000)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.79


# Случайны лес

In [25]:

#Создаем и обучаем случайный лес без подбора гиперпараметров
rf = ensemble.RandomForestClassifier(random_state=random_state)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
print(f'Train: {metrics.f1_score(y_train, y_train_pred):.2f}')
y_test_pred = rf.predict(X_test)
print(f'Test: {metrics.f1_score(y_test, y_test_pred):.2f}')

Train: 1.00
Test: 0.81


## GridSearchCV

In [26]:
param_grid = {'n_estimators': list(range(100, 300, 30)),
              'min_samples_leaf': list(np.linspace(2, 10, 1, dtype=int)),
              'max_depth': list(np.linspace(20, 40, 5, dtype=int))
              }
            
grid_search_forest = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search_forest.fit(X_train, y_train) 
y_train_pred = grid_search_forest.predict(X_train)
print(f'f1_score на обучающем наборе: {metrics.f1_score(y_train, y_train_pred):.2f}')
y_test_pred = grid_search_forest.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f'Наилучшие значения гиперпараметров: {grid_search_forest.best_params_}')

CPU times: total: 6.06 s
Wall time: 2min 28s
f1_score на обучающем наборе: 0.99
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'max_depth': 25, 'min_samples_leaf': 2, 'n_estimators': 280}


## RandomizedSearchCV

In [27]:
param_grid = {'n_estimators': list(range(100, 300, 30)),
              'min_samples_leaf': list(np.linspace(2, 10, 1, dtype=int)),
              'max_depth': list(np.linspace(20, 40, 10, dtype=int))
              }
            
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_distributions=param_grid, 
    cv=5,
    n_iter = 20, 
    n_jobs = -1
)  
%time random_search_forest.fit(X_train, y_train) 
y_train_pred = random_search_forest.predict(X_train)
print(f'f1_score на обучающем наборе: {metrics.f1_score(y_train, y_train_pred):.2f}')
y_test_pred = random_search_forest.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f'Наилучшие значения гиперпараметров: {random_search_forest.best_params_}')

CPU times: total: 5.67 s
Wall time: 1min 24s
f1_score на обучающем наборе: 0.99
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'n_estimators': 280, 'min_samples_leaf': 2, 'max_depth': 28}


## Hyperopt

In [28]:
space={'n_estimators': hp.quniform('n_estimators', 100, 300, 30),
       'max_depth' : hp.quniform('max_depth', 20, 40, 10),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
      }

In [29]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf'])
              }
  
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    
    #обучаем модель с помощью кросс-валидации
    #применяем cross validation с тем же количеством фолдов
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

In [30]:
%%time
#подбор гиперпараметров

trials = Trials()

best=fmin(hyperopt_rf,
          space=space,
          algo=tpe.suggest,
          max_evals=20,
          trials=trials,
          rstate=np.random.default_rng(random_state)
         )
print(f'Наилучшие значения гиперпараметров {best}')

100%|██████████| 20/20 [01:26<00:00,  4.30s/trial, best loss: -0.8105395273906474]
Наилучшие значения гиперпараметров {'max_depth': 30.0, 'min_samples_leaf': 3.0, 'n_estimators': 240.0}
CPU times: total: 531 ms
Wall time: 1min 26s


In [31]:

#рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print(f'f1_score на обучающем наборе: {metrics.f1_score(y_train, y_train_pred):.2f}')
y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на обучающем наборе: 0.98
f1_score на тестовом наборе: 0.83


## Optuna

In [32]:
def optuna_rf(trial):
  
  n_estimators = trial.suggest_int('n_estimators', 100, 310, 30)
  max_depth = trial.suggest_int('max_depth', 20, 40, 10)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

  
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=42)
  #обучаем модель с помощью кросс-валидации
  #применяем cross validation с тем же количеством фолдов
  score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1", n_jobs=-1).mean()

  return score

In [33]:
%%time
#cоздаем объект исследования
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
#находим лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

[I 2023-08-08 16:35:55,509] A new study created in memory with name: RandomForestClassifier
[I 2023-08-08 16:35:59,488] Trial 0 finished with value: 0.8052954120143164 and parameters: {'n_estimators': 160, 'max_depth': 30, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8052954120143164.
[I 2023-08-08 16:36:05,766] Trial 1 finished with value: 0.8017452286706709 and parameters: {'n_estimators': 280, 'max_depth': 20, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8052954120143164.
[I 2023-08-08 16:36:09,487] Trial 2 finished with value: 0.7988903752121587 and parameters: {'n_estimators': 160, 'max_depth': 20, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8052954120143164.
[I 2023-08-08 16:36:14,328] Trial 3 finished with value: 0.8074122568066919 and parameters: {'n_estimators': 190, 'max_depth': 20, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.8074122568066919.
[I 2023-08-08 16:36:19,455] Trial 4 finished with value: 0.8024807470657234 and parameters: {'n_

CPU times: total: 844 ms
Wall time: 1min 50s


In [34]:
#выводим результаты на обучающей выборке
print(f'Наилучшие значения гиперпараметров {study.best_params}')
print(f'f1_score на обучающем наборе: {study.best_value:.2f}')

Наилучшие значения гиперпараметров {'n_estimators': 280, 'max_depth': 40, 'min_samples_leaf': 2}
f1_score на обучающем наборе: 0.81


In [35]:
#рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params,random_state=random_state)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print(f'f1_score на обучающем наборе: {metrics.f1_score(y_train, y_train_pred):.2f}')
y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на обучающем наборе: 0.99
f1_score на тестовом наборе: 0.82
