# Изучение процесса подбора гиперпараметров модели

## Импорт и подготовка окружения

In [1]:
import time
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset and separate the feature columns from the target column.
df = pd.read_csv("./Task/LabML_5.1/diabetes.csv")
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Shapes:", *(part.shape for part in (X_train, X_test)))

Shapes: (614, 8) (154, 8)


## Базовая модель

In [3]:
# Baseline: a random forest with default hyperparameters, fitted once on the
# training split and scored on the held-out test split.
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock and may jump or be too
# coarse for a fit that takes only a fraction of a second.
t0 = time.perf_counter()
rf_base.fit(X_train, y_train)
base_train_time = time.perf_counter() - t0

y_pred_base = rf_base.predict(X_test)
# zero_division=0 reports 0 instead of warning when the score's denominator is zero.
base_f1 = f1_score(y_test, y_pred_base, zero_division=0)

print(f"Base RF: F1={base_f1:.4f}, Train time={base_train_time:.3f}s")

Base RF: F1=0.6465, Train time=0.082s


## Randomized Search (Scikit-Learn) — настройка и запуск

In [4]:
from scipy.stats import randint

# Distributions sampled by RandomizedSearchCV. scipy's randint(low, high)
# draws from low..high-1, so each upper bound is one past the largest value.
param_dist = {
    "n_estimators": randint(50, 501),        # 50..500
    "max_depth": randint(2, 21),             # 2..20
    # the two named heuristics plus every explicit feature count 1..n_features
    "max_features": ["sqrt", "log2", *range(1, X.shape[1] + 1)],
    "min_samples_split": randint(2, 11),     # 2..10
    "min_samples_leaf": randint(1, 6),       # 1..5
    "bootstrap": [True, False],
}

# NOTE(review): both the forest and the search below request n_jobs=-1;
# confirm this nesting does not oversubscribe the CPU and distort the timing.
rf_for_search = RandomForestClassifier(random_state=42, n_jobs=-1)

n_iter_search = 55   # number of random parameter combinations to try
cv = 5               # number of cross-validation folds

rand_search = RandomizedSearchCV(
    estimator=rf_for_search,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring="f1",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Time the whole search with a monotonic clock (time.time() follows the wall
# clock and can jump if the system time is adjusted mid-run).
t0 = time.perf_counter()
rand_search.fit(X_train, y_train)
rand_time = time.perf_counter() - t0

# Results: best sampled configuration and its mean cross-validated F1.
best_params_rand = rand_search.best_params_
best_score_rand_cv = rand_search.best_score_
print(f"Random Search done: time = {rand_time:.2f}s, best CV F1 = {best_score_rand_cv:.4f}")
print("Best params (Random Search):", best_params_rand)

Fitting 5 folds for each of 55 candidates, totalling 275 fits
Random Search done: time = 14.95s, best CV F1 = 0.6484
Best params (Random Search): {'bootstrap': True, 'max_depth': 16, 'max_features': 6, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 152}


## Оценка лучшей модели из Random Search на тесте

In [5]:
# Score the estimator selected by the randomized search on the held-out test set.
best_rf_rand = rand_search.best_estimator_

# Prediction takes only milliseconds, so measure it with perf_counter(), the
# monotonic high-resolution clock, rather than the coarser wall-clock time.time().
t0 = time.perf_counter()
y_pred_rand = best_rf_rand.predict(X_test)
test_time_rand = time.perf_counter() - t0

# zero_division=0 makes the ratio metrics return 0 instead of warning when
# their denominator is zero.
metrics_rand_test = {
    "F1": f1_score(y_test, y_pred_rand, zero_division=0),
    "Accuracy": accuracy_score(y_test, y_pred_rand),
    "Precision": precision_score(y_test, y_pred_rand, zero_division=0),
    "Recall": recall_score(y_test, y_pred_rand, zero_division=0),
    "Predict_time_s": test_time_rand,
}
metrics_rand_test

{'F1': 0.6336633663366337,
 'Accuracy': 0.7597402597402597,
 'Precision': 0.6808510638297872,
 'Recall': 0.5925925925925926,
 'Predict_time_s': 0.01634502410888672}

## Hyperopt TPE — определение пространства и objective

In [6]:
def _unit_step(label, low, high):
    """Build a hyperopt quniform node on [low, high] with a step of 1."""
    return hp.quniform(label, low, high, 1)


# Hyperopt search space for the random forest.
# max_features is encoded in two parts: "max_features_choice" picks the mode,
# and "max_features_int" supplies the feature count used when the mode is "int".
# The order of the hp.choice option lists matters: parse_hyperopt_result
# decodes fmin's result by indexing into the same lists.
space = {
    "n_estimators": _unit_step("n_estimators", 50, 500),
    "max_depth": _unit_step("max_depth", 2, 20),
    "max_features_choice": hp.choice(
        "max_features_choice", ["sqrt", "log2", "auto", "none", "int"]
    ),
    "max_features_int": _unit_step("max_features_int", 1, X.shape[1]),
    "min_samples_split": _unit_step("min_samples_split", 2, 10),
    "min_samples_leaf": _unit_step("min_samples_leaf", 1, 5),
    "bootstrap": hp.choice("bootstrap", [True, False]),
}

def hp_objective(params):
    """Hyperopt objective: negated mean 5-fold CV F1 of a random forest.

    Args:
        params: one sample drawn from ``space``; numeric entries are cast
            to ``int`` before being handed to the classifier.

    Returns:
        dict with ``loss`` (the negated mean F1, because hyperopt minimises),
        ``status`` (``STATUS_OK``) and ``cv_f1`` (the mean F1 itself).

    NOTE(review): "auto" is treated exactly like "none" (both become
    ``max_features=None``), so two of the five options propose ``None``.
    Confirm this is intended.
    """
    # Cast the numeric hyperparameters to int before building the classifier.
    int_names = ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
    forest_kwargs = {name: int(params[name]) for name in int_names}
    forest_kwargs["bootstrap"] = params["bootstrap"]

    # Collapse the two-part max_features encoding into a single value.
    selector = params["max_features_choice"]
    if selector == "int":
        forest_kwargs["max_features"] = int(params["max_features_int"])
    elif selector in ("none", "auto"):
        forest_kwargs["max_features"] = None
    else:
        forest_kwargs["max_features"] = selector

    forest = RandomForestClassifier(**forest_kwargs, random_state=42, n_jobs=-1)

    # Mean F1 over 5 cross-validation folds of the training split.
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring="f1", n_jobs=-1
    )
    mean_cv_f1 = fold_scores.mean()

    return {"loss": -mean_cv_f1, "status": STATUS_OK, "cv_f1": mean_cv_f1}


## Запуск Hyperopt (TPE) и замер времени

In [7]:
# Trials records every evaluation so the best one can be looked up afterwards.
trials = Trials()
# Number of objective evaluations.
# NOTE(review): the randomized search above samples 55 candidates while this
# run gets 50 evaluations; confirm the unequal budget is intended.
max_evals = 50

# Time the whole optimisation with a monotonic clock (time.time() follows the
# wall clock and can jump if the system time is adjusted mid-run).
t0 = time.perf_counter()
best_hyperopt = fmin(
    fn=hp_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
    rstate=np.random.default_rng(42),  # fixed seed for a repeatable search
)
hyperopt_time = time.perf_counter() - t0

print(f"Hyperopt done: time = {hyperopt_time:.2f}s")
print("Best raw result (Hyperopt):", best_hyperopt)

100%|██████████| 50/50 [00:14<00:00,  3.41trial/s, best loss: -0.650744354418545] 
Hyperopt done: time = 14.67s
Best raw result (Hyperopt): {'bootstrap': np.int64(0), 'max_depth': np.float64(8.0), 'max_features_choice': np.int64(4), 'max_features_int': np.float64(4.0), 'min_samples_leaf': np.float64(4.0), 'min_samples_split': np.float64(7.0), 'n_estimators': np.float64(210.0)}


## Преобразование результатов Hyperopt в читаемые параметры

In [8]:
def _choice_index(raw_index):
    """Return ``raw_index`` as a plain ``int`` usable for sequence indexing.

    Accepts Python/NumPy integers and integral floats such as ``4.0``.
    Raises ``ValueError`` for a fractional value instead of truncating it.
    """
    index = int(raw_index)
    if index != raw_index:
        raise ValueError(f"choice index must be integral, got {raw_index!r}")
    return index


def parse_hyperopt_result(best):
    """Convert the raw dict returned by ``fmin`` into classifier keyword arguments.

    ``fmin`` reports ``hp.choice`` parameters as an index into the option list
    and ``hp.quniform`` parameters as floats. This function maps the indices
    back to their options and casts the numeric values to ``int``.

    Args:
        best: mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf``, ``bootstrap``,
            ``max_features_choice`` and ``max_features_int``.

    Returns:
        dict with the keys ``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf``, ``bootstrap`` and
        ``max_features``.

    Raises:
        ValueError: if a choice index is not an integral number.
        IndexError: if a choice index is outside its option list.
    """
    # Option lists in the order used by the hp.choice definitions of the space.
    bootstrap_options = [True, False]
    choice_list = ["sqrt", "log2", "auto", "none", "int"]

    n_estimators = int(best["n_estimators"])
    max_depth = int(best["max_depth"])
    min_samples_split = int(best["min_samples_split"])
    min_samples_leaf = int(best["min_samples_leaf"])

    # Coerce before indexing: a float-typed index (e.g. 4.0 after the result
    # passed through a float column) would make list indexing raise TypeError.
    bootstrap = bootstrap_options[_choice_index(best["bootstrap"])]
    choice = choice_list[_choice_index(best["max_features_choice"])]

    if choice == "int":
        max_features = int(best["max_features_int"])
    elif choice in ("none", "auto"):
        # Same mapping as the objective function: both mean "no limit".
        max_features = None
    else:
        max_features = choice

    return {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "bootstrap": bootstrap,
        "max_features": max_features,
    }

# Decode fmin's raw output into classifier keyword arguments.
best_hyperopt_parsed = parse_hyperopt_result(best_hyperopt)

# Locate the trial with the smallest loss and read back the mean CV F1 that
# the objective stored alongside it.
trial_losses = [trial["result"]["loss"] for trial in trials.trials]
best_trial_idx = np.argmin(trial_losses)
best_trial = trials.trials[best_trial_idx]
best_hyperopt_cv_f1 = best_trial["result"]["cv_f1"]

print(f"Parsed best Hyperopt params: {best_hyperopt_parsed}")
print("Best Hyperopt CV F1 = {:.4f}".format(best_hyperopt_cv_f1))

Parsed best Hyperopt params: {'n_estimators': 210, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 4}
Best Hyperopt CV F1 = 0.6507


## Оценка лучшей Hyperopt-модели на тесте

In [9]:
# Fit a forest with the best Hyperopt parameters on the training split and
# score it on the held-out test set.
best_hp = best_hyperopt_parsed
# The parsed dict's keys (n_estimators, max_depth, max_features,
# min_samples_split, min_samples_leaf, bootstrap) are the classifier's own
# keyword names, so it is unpacked directly instead of restating each one.
clf_hp = RandomForestClassifier(**best_hp, random_state=42, n_jobs=-1)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock and may jump or be too
# coarse for a fit that takes only a fraction of a second.
t0 = time.perf_counter()
clf_hp.fit(X_train, y_train)
train_time_hp = time.perf_counter() - t0

y_pred_hp = clf_hp.predict(X_test)
# zero_division=0 makes the ratio metrics return 0 instead of warning when
# their denominator is zero.
metrics_hp_test = {
    "F1": f1_score(y_test, y_pred_hp, zero_division=0),
    "Accuracy": accuracy_score(y_test, y_pred_hp),
    "Precision": precision_score(y_test, y_pred_hp, zero_division=0),
    "Recall": recall_score(y_test, y_pred_hp, zero_division=0),
    "Train_time_s": train_time_hp,
}
metrics_hp_test

{'F1': 0.6464646464646465,
 'Accuracy': 0.7727272727272727,
 'Precision': 0.7111111111111111,
 'Recall': 0.5925925925925926,
 'Train_time_s': 0.1719801425933838}

## Сравнительный анализ результатов

In [10]:
# Side-by-side summary of the three approaches, one row per method.
res = {
    "method": ["base_rf", "random_search", "hyperopt"],
    # NOTE: mixed column. The base model contributes its test F1, while the
    # two searches contribute their best mean cross-validated F1.
    "cv_f1_or_test_f1": [base_f1, best_score_rand_cv, best_hyperopt_cv_f1],
    "test_F1": [base_f1, metrics_rand_test["F1"], metrics_hp_test["F1"]],
    # NOTE: mixed column. A single fit for the base model versus the elapsed
    # time of the whole search for the other two.
    "train_time_s": [base_train_time, rand_time, hyperopt_time],
    "test_accuracy": [
        accuracy_score(y_test, y_pred_base),
        *(metrics["Accuracy"] for metrics in (metrics_rand_test, metrics_hp_test)),
    ],
}
df_compare = pd.DataFrame(res)
df_compare

Unnamed: 0,method,cv_f1_or_test_f1,test_F1,train_time_s,test_accuracy
0,base_rf,0.646465,0.646465,0.082493,0.772727
1,random_search,0.6484,0.633663,14.949423,0.75974
2,hyperopt,0.650744,0.646465,14.665665,0.772727


In [14]:
# Textual report assembled from the results computed in the cells above.
# Each section carries its own trailing blank line, so the sections are
# simply concatenated.
random_search_section = (
    f"1) RandomizedSearch: лучшая CV F1 = {best_score_rand_cv:.4f}, время подбора = {rand_time:.2f} s, "
    f"параметры = {best_params_rand}. "
    f"Тест F1 = {metrics_rand_test['F1']:.4f}.\n\n"
)
hyperopt_section = (
    f"2) Hyperopt (TPE): лучшая CV F1 = {best_hyperopt_cv_f1:.4f}, время подбора = {hyperopt_time:.2f} s, "
    f"параметры = {best_hyperopt_parsed}. "
    f"Тест F1 = {metrics_hp_test['F1']:.4f}.\n\n"
)
# The last two sections contain no placeholders, so plain literals suffice.
comparison_section = (
    "3) Сравнение: RandomSearch vs Hyperopt — сравниваем скорость, лучшие CV F1, лучшие тестовые метрики "
    "и различия в гиперпараметрах. Hyperopt часто эффективнее при том же бюджете, но требует аккуратного "
    "определения пространства параметров; RandomSearch проще и воспроизводим.\n\n"
)
recommendation_section = (
    "4) Рекомендация: использовать Hyperopt/TPE для целенаправленной оптимизации при ограниченном бюджете, "
    "а RandomizedSearch — для простоты и стабильности."
)

summary = "".join(
    (random_search_section, hyperopt_section, comparison_section, recommendation_section)
)
print(summary)

1) RandomizedSearch: лучшая CV F1 = 0.6484, время подбора = 14.95 s, параметры = {'bootstrap': True, 'max_depth': 16, 'max_features': 6, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 152}. Тест F1 = 0.6337.

2) Hyperopt (TPE): лучшая CV F1 = 0.6507, время подбора = 14.67 s, параметры = {'n_estimators': 210, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 4}. Тест F1 = 0.6465.

3) Сравнение: RandomSearch vs Hyperopt — сравниваем скорость, лучшие CV F1, лучшие тестовые метрики и различия в гиперпараметрах. Hyperopt часто эффективнее при том же бюджете, но требует аккуратного определения пространства параметров; RandomSearch проще и воспроизводим.

4) Рекомендация: использовать Hyperopt/TPE для целенаправленной оптимизации при ограниченном бюджете, а RandomizedSearch — для простоты и стабильности.
