In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDRegressor # Регрессия
from sklearn.linear_model import LogisticRegression # Классификация
from sklearn.tree import DecisionTreeClassifier # Классификация при помощи решающего дерева

from sklearn.metrics import accuracy_score # Метрика accuracy
from sklearn.metrics import mean_absolute_error # Метрика для регрессии
from sklearn.metrics import f1_score # Метрика f1 score

from sklearn.datasets import load_iris # Датасет для классификации и решающих деревьев
from sklearn.datasets import load_diabetes # Датасет для регрессии

from sklearn.model_selection import train_test_split # Разбиение на выборки

from sklearn.model_selection import GridSearchCV # Сеточный метод поиска параметров
from sklearn.model_selection import KFold # Разбиение на фолды

# Загрузка тестовых датасетов

In [None]:
# Classification dataset (Iris): features as a DataFrame, target as a Series.
# return_X_y=True hands back (data, target) from a single call, so the dataset
# is loaded once instead of once per field.
X_class, Y_class = load_iris(return_X_y=True, as_frame=True)

display(X_class)
display(Y_class)

# Regression dataset (Diabetes), loaded the same way.
X_reg, Y_reg = load_diabetes(return_X_y=True, as_frame=True)

display(X_reg)
display(Y_reg)

# Разбиение на выборки

In [None]:
# Classification: hold out 20% of the rows as the test set, then take 25% of
# the remaining 80% (i.e. 20% of all rows) as the validation set.
(X_train_class, X_test_class,
 Y_train_class, Y_test_class) = train_test_split(
    X_class, Y_class, test_size=0.2, random_state=13)
(X_train_class, X_val_class,
 Y_train_class, Y_val_class) = train_test_split(
    X_train_class, Y_train_class, test_size=0.25, random_state=13)

# Regression: same two-step split with the same proportions and seed.
(X_train_reg, X_test_reg,
 Y_train_reg, Y_test_reg) = train_test_split(
    X_reg, Y_reg, test_size=0.2, random_state=13)
(X_train_reg, X_val_reg,
 Y_train_reg, Y_val_reg) = train_test_split(
    X_train_reg, Y_train_reg, test_size=0.25, random_state=13)

# Просмотр документации

In [None]:
# Print the raw class docstring of DecisionTreeClassifier (parameter reference).
print(DecisionTreeClassifier.__doc__)

A decision tree classifier.

    Read more in the :ref:`User Guide <tree>`.

    Parameters
    ----------
    criterion : {"gini", "entropy"}, default="gini"
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "entropy" for the information gain.

    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_spli

# Обучение

In [None]:
# Fit the three baseline models on the training split only.
# The stochastic estimators get random_state=13 (the seed already used for the
# splits) so that re-running the notebook reproduces the same fitted models
# and therefore the same metrics.

# Multiclass classifier for Iris.
model_class = LogisticRegression()
model_class.fit(X_train_class, Y_train_class)

# Linear regressor trained with SGD for Diabetes. SGD shuffles the training
# samples between epochs, so an unseeded fit changes from run to run.
model_reg = SGDRegressor(random_state=13)
model_reg.fit(X_train_reg, Y_train_reg)

# Decision tree for Iris with depth capped at 20. The tree permutes features
# at every split, so tie-breaking is random unless the seed is fixed.
model_tree = DecisionTreeClassifier(max_depth=20, random_state=13)
model_tree.fit(X_train_class, Y_train_class)




DecisionTreeClassifier(max_depth=20)

# Предсказание

In [None]:
# Test-set predictions. Both Iris models are evaluated on the same held-out
# feature frame; the regressor uses the Diabetes test split.
pred_test_class, pred_test_tree = (
    model_class.predict(X_test_class),
    model_tree.predict(X_test_class),
)
pred_test_reg = model_reg.predict(X_test_reg)

# Расчет метрик

In [None]:
# Test-set metrics, one value per line, in this order:
#   1. accuracy of the logistic regression,
#   2. mean absolute error of the SGD regressor,
#   3. weighted F1 of the decision tree.
print(
    accuracy_score(Y_test_class, pred_test_class),
    mean_absolute_error(Y_test_reg, pred_test_reg),
    f1_score(Y_test_class, pred_test_tree, average='weighted'),
    sep='\n',
)

0.9333333333333333
46.96620469986649
1.0


# Подбор параметров моделей

In [None]:
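# NOTE: commented-out reference template. It cannot be run as-is from the cells
# above: time, ignore_warnings, ConvergenceWarning, make_pipeline,
# PolynomialFeatures, StandardScaler, DecisionTreeRegressor, inputs_train,
# labels_train and norm_labels are not imported or defined there.
# start_time = time.time()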

# with ignore_warnings(category=ConvergenceWarning):
#     model = make_pipeline(PolynomialFeatures(), StandardScaler(with_mean=False), DecisionTreeRegressor(random_state=43))

#     list_param = {'polynomialfeatures__degree': [i for i in range(1, 2)],
#                   'decisiontreeregressor__max_depth': [3,4,5,None],
#                   'decisiontreeregressor__max_features':  ['auto', None, 'log2'],
#                   'decisiontreeregressor__min_samples_leaf': range(1, 4),
#                   'decisiontreeregressor__min_samples_split': range(2, 5),
#                   'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse']}

#     gr_search = GridSearchCV(model, list_param , scoring='neg_mean_absolute_error',
#                              cv=KFold(n_splits=5, shuffle=True, random_state=13), verbose=4).fit(inputs_train,
#         (labels_train - norm_labels[0]) / norm_labels[1])
#     print("Best parameters\n")
#     for j in gr_search.best_params_:
#         print(f"{j}: {gr_search.best_params_[j]}")
#     print("\n")
#     model = gr_search.best_estimator_
#     model.fit(inputs_train,
#         (labels_train - norm_labels[0]) / norm_labels[1])
#     pred_train = model.predict(inputs_train)
#     metric_train = mean_absolute_error(labels_train, pred_train * norm_labels[1] + norm_labels[0])

#     print(f"MAE for train data: {metric_train:.5f}")
#     print(f"Time for train: {time.time() - start_time:.2f} seconds")

In [None]:
# Hyper-parameter grid for the decision tree: 3 * 3 * 3 * 3 * 2 = 162 candidates.
list_param = {'max_depth': [2, 10, 20],
              # 'sqrt' replaces the legacy 'auto' alias: for the classifier
              # 'auto' meant sqrt(n_features); it was deprecated in
              # scikit-learn 1.1 and is rejected from 1.3 onwards.
              'max_features': ['sqrt', 'log2', None],
              'min_samples_leaf': range(1, 4),
              'min_samples_split': range(2, 5),
              'criterion': ['gini', 'entropy']}

# Seed the estimator: with max_features below the feature count each split
# looks at a random feature subset, so an unseeded search is not reproducible.
model_tree = DecisionTreeClassifier(random_state=13)

# 5-fold shuffled CV on the training split. verbose=1 prints only the summary
# line instead of one line for each of the 810 individual fits.
gr_search = GridSearchCV(model_tree, list_param,
                         cv=KFold(n_splits=5, shuffle=True, random_state=13), verbose=1)

gr_search.fit(X_train_class, Y_train_class)

print("\nBest parameters\n")
for param_name, param_value in gr_search.best_params_.items():
    print(f"{param_name}: {param_value}")
print("\n")

# GridSearchCV uses refit=True by default, so best_estimator_ is already fitted
# on the whole training split with the best parameters; no extra fit is needed.
model_tree = gr_search.best_estimator_

pred_train_tree = model_tree.predict(X_train_class)
pred_val_tree = model_tree.predict(X_val_class)
pred_test_tree = model_tree.predict(X_test_class)


# Weighted F1 on the train, validation and test splits, in that order.
print(f1_score(Y_train_class, pred_train_tree, average='weighted'))
print(f1_score(Y_val_class, pred_val_tree, average='weighted'))
print(f1_score(Y_test_class, pred_test_tree, average='weighted'))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.778 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.833 total time=   0.0s

[CV 3/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2, max_features=None, m

[CV 4/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.778 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.889 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.889 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=10, max_features

[CV 4/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.889 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=20, max_features

[CV 3/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.778 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=20, max_features

[CV 5/5] END criterion=entropy, max_depth=2, max_features=auto, min_samples_leaf=3, min_samples_split=4;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth

[CV 5/5] END criterion=entropy, max_depth=2, max_features=None, min_samples_leaf=3, min_samples_split=4;, score=0.889 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.833 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max

[CV 5/5] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=3, min_samples_split=4;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, ma

[CV 1/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.889 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2;, score=0.944 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=0.944 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=3;, score=1.000 total time=   0.0s
[CV 3/5] END criterion=entropy, ma

[CV 5/5] END criterion=entropy, max_depth=20, max_features=None, min_samples_leaf=3, min_samples_split=4;, score=0.944 total time=   0.0s

Best parameters

criterion: entropy
max_depth: 2
max_features: log2
min_samples_leaf: 1
min_samples_split: 3


0.9551899725812768
0.9657687991021324
0.9013729977116705


# Практическое задание *

Применить make_pipeline, PolynomialFeatures, StandardScaler из sklearn