# Композиции классификаторов

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import base
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, BaggingClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.datasets import make_classification, load_wine

## DummyEnsemble

Самый простой вариант ансамблирования — это обучение нескольких моделей $b_t\bigr(x\bigr)$ и усреднение их ответов:
$$
f\bigr(x\bigr) = \frac{1}{T}\sum_{t=1}^{T}b_t\bigr(x\bigr)
$$

In [None]:
# Synthetic classification problem shared by the cross-validation cells below:
# 1000 samples, 20 features (15 informative + 5 redundant), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)

In [None]:
class DummyEnsemble(object):
    """Average the class probabilities of several copies of one base model.

    Every member is a clone of ``base_estimator`` and is fitted on exactly
    the same ``X, Y`` (no resampling, no reweighting), so the members differ
    only if the base estimator itself is randomised.

    Parameters
    ----------
    base_estimator : estimator or None, default=None
        Model to clone for every ensemble member. It must provide ``fit`` and
        ``predict_proba``. ``None`` selects ``DecisionTreeClassifier(max_depth=1)``.
    n_estimators : int, default=10
        Number of clones held in ``self.b``.
    """

    def __init__(self, base_estimator=None, n_estimators=10):
        self.n_estimators = n_estimators
        # Compare with None explicitly: truth-testing an estimator may invoke
        # its __len__ (some estimators define one), which can raise or be
        # falsy for a perfectly valid, not-yet-fitted model.
        if base_estimator is None:
            base_estimator = DecisionTreeClassifier(max_depth=1)
        self.base_estimator = base_estimator

        self.b = [base.clone(self.base_estimator) for _ in range(self.n_estimators)]

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is accepted but unused)."""
        return {'n_estimators': self.n_estimators,
                'base_estimator': self.base_estimator}

    def fit(self, X, Y):
        """Fit every ensemble member on the full ``X, Y`` and return ``self``."""
        # Remember the label values so predict() can return labels rather than
        # column indices. Assumes the members order their predict_proba columns
        # by sorted unique label (the scikit-learn ``classes_`` convention).
        self.classes_ = np.unique(Y)
        for b in self.b:
            b.fit(X, Y)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest mean probability.

        If the ensemble was not fitted through ``fit`` (so no ``classes_`` is
        stored), the column index of the most probable class is returned instead.
        """
        best_column = np.argmax(self.predict_proba(X), axis=1)
        classes = getattr(self, 'classes_', None)
        if classes is None:
            return best_column
        return classes[best_column]

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba(X)`` outputs."""
        return np.mean([elem.predict_proba(X) for elem in self.b], axis=0)

In [None]:
# DummyEnsemble with its defaults: 10 clones of a depth-1 decision tree.
model = DummyEnsemble()

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# DummyEnsemble averaging clones of a logistic regression.
model = DummyEnsemble(LogisticRegression())

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# DummyEnsemble averages predict_proba outputs, hence SVC is built with
# probability=True.
model = DummyEnsemble(SVC(probability=True))

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

## AdaBoost

Основная идея алгоритма состоит в том, чтобы итеративно дообучать модели на объектах, которые плохо классифицируются предыдущими моделями, при помощи их перевзвешивания.

$$
\sum_{i=1}^{l}\exp\bigr(-y_i\sum_{t=1}^{T-1}\alpha_tb_t\bigr(x_i\bigr)\bigr)\exp\bigr(-y_i\alpha_Tb_T\bigr(x_i\bigr)\bigr) \to \min_{\alpha_T, b_T}
$$

In [None]:
# AdaBoost with the library's default base learner and settings.
model = AdaBoostClassifier()

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# AdaBoost boosting logistic-regression base learners.
model = AdaBoostClassifier(LogisticRegression())

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# AdaBoost boosting SVC base learners (built with probability estimates enabled).
model = AdaBoostClassifier(SVC(probability=True))

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

## Градиентный бустинг (XGBoost)

$$
\sum_{i=1}^{l}\mathcal{L}\left(\sum_{t=1}^{T-1}\alpha_tb_t\bigr(x_i\bigr)+\alpha b\bigr(x_i\bigr),y_i\right) \to \min_{\alpha, b}
$$

In [None]:
# Rebuild the synthetic dataset (same arguments and seed as earlier in the
# notebook) so this cell can be run on its own.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)
# Gradient boosting with the library's default settings.
model = GradientBoostingClassifier()

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

## Bagging

In [None]:
# Bagging with the library's default base learner and settings.
model = BaggingClassifier()

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# Bagging over SVC base learners (default SVC settings).
model = BaggingClassifier(SVC())

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# Bagging over logistic-regression base learners.
model = BaggingClassifier(LogisticRegression())

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

## Случайный лес

В данном случае мы говорим о множестве решающих деревьев (отсюда и слово лес).
1. Для задачи классификации в листьях деревьев лежит класс.
2. Для задачи регрессии в листьях лежит среднее значение ответов для всех объектов.

Случайный лес является ансамблем деревьев:
1. Для каждого дерева выбирается произвольное подмножество объектов.
2. Каждое дерево строится независимо от остальных.

In [None]:
# Random forest with the library's default settings.
model = RandomForestClassifier()

# Stratified 10-fold CV repeated 3 times -> 30 accuracy values.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report mean accuracy with its standard deviation across the folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

## Подбор гиперпараметров

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import train_test_split

# Build the Hastie 10.2 benchmark problem with a fixed seed; X and y replace
# the synthetic dataset used in the cells above.
X, y = make_hastie_10_2(random_state=42)

# Hold out 20% of the rows for the final evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Notebook output: shapes of the train and test parts.
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search space: 2 * 3 * 3 = 18 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Base model whose hyperparameters are tuned; seeded for reproducibility.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive search scored by accuracy with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training part only; the test part stays untouched.
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

best_params, best_score

In [None]:
import pandas as pd

# Columns of GridSearchCV.cv_results_ worth showing: the three tuned
# parameters plus the score summary.
relevant_columns = [
    'param_learning_rate',
    'param_max_depth',
    'param_n_estimators',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# One pipeline: tabulate the results, keep the relevant columns, order them
# best-first, renumber the rows, and give the columns shorter names.
cv_results_df = (
    pd.DataFrame(grid_search.cv_results_)[relevant_columns]
    .sort_values(by='rank_test_score')
    .reset_index(drop=True)
    .rename(columns={
        'param_learning_rate': 'learning_rate',
        'param_max_depth': 'max_depth',
        'param_n_estimators': 'n_estimators',
        'mean_test_score': 'mean_accuracy',
        'std_test_score': 'std_accuracy',
        'rank_test_score': 'rank',
    })
)

cv_results_df

In [None]:
from sklearn.metrics import accuracy_score

# Take the estimator that the grid search exposes for its best parameter
# combination (no new model is constructed here).
best_gb_model = grid_search.best_estimator_

# Predict labels for the held-out rows.
y_pred = best_gb_model.predict(X_test)

# Share of held-out rows whose predicted label matches the true one.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

test_accuracy