In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

## Задание 6.1

In [35]:
# Default splitter for the out-of-fold meta-features. The fixed seed keeps the
# shuffled folds identical across kernel restarts.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [36]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    """Compute meta-features for the meta-classifier, averaging test predictions over folds.

    The meta-features are the class probabilities predicted for a multiclass
    classification problem. Training rows receive out-of-fold predictions; test
    rows receive the mean of the predictions made by every per-fold model.

    :arg clf: classifier to clone for each fold; the passed instance itself is not fitted
    :args X_train, y_train: training set, indexable by integer index arrays
    :arg X_test: test-set features
    :arg cv: fold generator exposing ``split(X, y)`` (e.g. KFold, StratifiedKFold)

    :returns X_meta_train, X_meta_test: new float32 features for the training and
        test sets, one column per class found in ``y_train``
    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)

    n_folds = 0
    # Labels are forwarded to the splitter: stratified splitters require them,
    # while plain KFold accepts and ignores them.
    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        # Out-of-fold probabilities for the held-out rows.
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        # Accumulate this fold's test predictions; averaged after the loop.
        X_meta_test += folded_clf.predict_proba(X_test)
        n_folds += 1

    # Divide by the number of folds actually produced instead of relying on a
    # splitter attribute. No model is fitted on the full training set here,
    # because the test features come solely from the per-fold models.
    if n_folds:
        X_meta_test /= n_folds

    return X_meta_train, X_meta_test

10

In [22]:
# Scratch check of the zero-initialised test meta-feature buffer.
# NOTE(review): `n_classes` is only assigned inside functions in the visible cells,
# so this line depends on leftover kernel state — confirm or drop the cell.
np.zeros((len(X_test), n_classes), dtype=np.float32)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [34]:
# Fit a standalone gradient boosting model (300 boosting stages) on the training set.
clf = GradientBoostingClassifier(n_estimators=300)
clf.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=300)

## Для следующих заданий использовать этот код:

In [38]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Compute class-probability meta-features from a single base classifier.

    Training rows get out-of-fold probabilities: each fold is predicted by a
    clone of ``clf`` fitted on the remaining folds. Test rows are predicted by
    one more clone fitted on the whole training set.

    :arg clf: classifier to clone; the passed instance itself is not fitted
    :args X_train, y_train: training set, indexable by integer index arrays
    :arg X_test: test-set features
    :arg cv: fold generator exposing ``split(X, y)`` (e.g. KFold, StratifiedKFold)

    :returns X_meta_train, X_meta_test: meta-features for the training set
        (float32, one column per class found in ``y_train``) and for the test
        set (whatever ``predict_proba`` returns)
    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    # Labels are forwarded to the splitter: stratified splitters require them,
    # while plain KFold accepts and ignores them.
    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        # Out-of-fold probabilities for the held-out rows.
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)

    # Test features come from a model trained on all of the training data.
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)

    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [43]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Build stacked meta-feature matrices from several base classifiers.

    Each classifier is passed to ``compute_meta_feature``; the resulting
    training and test blocks are concatenated column-wise in classifier order.

    :arg classifiers: iterable of base classifiers
    :args X_train, y_train: training set
    :arg X_test: test-set features
    :arg cv: fold generator handed on to ``compute_meta_feature``

    :returns stacked_features_train, stacked_features_test: horizontally
        stacked meta-features for the training and test sets
    """
    train_blocks = []
    test_blocks = []

    # tqdm only reports progress; every classifier is processed exactly once.
    for base_clf in tqdm(classifiers):
        meta_train, meta_test = compute_meta_feature(base_clf, X_train, X_test, y_train, cv)
        train_blocks.append(meta_train)
        test_blocks.append(meta_test)

    return np.hstack(train_blocks), np.hstack(test_blocks)

In [44]:
def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_true=None):
    """Fit ``clf`` and return its macro-averaged F1 score on the test set.

    The defaults of ``X_train``, ``y_train`` and ``X_test`` are captured from
    the notebook globals when this cell is executed (Python evaluates default
    values once, at definition time), so re-run the cell after changing the split.

    :arg clf: estimator exposing ``fit`` and ``predict``; it is fitted in place
    :args X_train, y_train: training features and labels
    :arg X_test: features to predict on
    :arg y_true: true labels matching ``X_test``; when omitted, the
        notebook-level ``y_test`` is looked up at call time, as before

    :returns: macro F1 rounded to 6 decimal places
    """
    # Fall back to the global test labels so existing calls keep working.
    if y_true is None:
        y_true = y_test
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_true, y_test_pred, average='macro'), 6)

## Задание 6.2

In [48]:
# Shuffled 10-fold splitter with a fixed seed; reused until `cv` is reassigned.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Base models: L1- and L2-penalised logistic regressions, a random forest and
# gradient boosting. Their class probabilities are stacked column-wise.
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    GradientBoostingClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)

100%|██████████| 4/4 [02:29<00:00, 37.45s/it]


In [50]:
# Seed NumPy's global RNG, then create the meta-classifier: an unpenalised
# logistic regression solved with lbfgs.
# NOTE(review): recent scikit-learn releases expect `penalty=None` instead of
# the string 'none' — check against the installed version.
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs')

In [51]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.981539

## Задание 6.3

In [54]:
from sklearn.ensemble import ExtraTreesClassifier

In [55]:
# Rebuild the stacked meta-features from a random forest (300 trees) and
# extra trees (200 trees), using the current `cv` splitter.
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=200, n_jobs=-1),
], X_train, X_test, y_train, cv)

100%|██████████| 2/2 [00:14<00:00,  7.00s/it]


In [56]:
# Train the current meta-classifier `clf` on the new stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.979795

## Задание 6.4

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
# Rebuild the stacked meta-features from k-nearest neighbours (default settings)
# and extra trees (300 trees), using the current `cv` splitter.
stacked_features_train, stacked_features_test = generate_meta_features([
    KNeighborsClassifier(n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
], X_train, X_test, y_train, cv)

100%|██████████| 2/2 [00:07<00:00,  3.52s/it]


In [59]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
# NOTE(review): the recorded output shows the solver hitting its iteration limit
# here — consider a larger `max_iter` on the meta-classifier.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.990302

## Задание 6.5

In [60]:
# Rebuild the stacked meta-features from four base models: a one-vs-rest
# L1 logistic regression, k-nearest neighbours, extra trees and AdaBoost.
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', multi_class='ovr', max_iter=2000),
    KNeighborsClassifier(n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
    AdaBoostClassifier(),
], X_train, X_test, y_train, cv)

100%|██████████| 4/4 [00:47<00:00, 11.75s/it]


In [61]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.987404

## Задание 6.6-6.7

In [78]:
# Stratified 20-fold splitter. The fixed seed keeps the shuffled folds
# identical across kernel restarts.
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [79]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Compute class-probability meta-features from a single base classifier.

    Training rows get out-of-fold probabilities: each fold is predicted by a
    clone of ``clf`` fitted on the remaining folds. Test rows are predicted by
    one more clone fitted on the whole training set. Labels are passed to
    ``cv.split`` so that stratified splitters can be used.

    :arg clf: classifier to clone; the passed instance itself is not fitted
    :args X_train, y_train: training set, indexable by integer index arrays
    :arg X_test: test-set features
    :arg cv: fold generator exposing ``split(X, y)``

    :returns X_meta_train, X_meta_test: meta-features for the training set
        (float32, one column per class found in ``y_train``) and for the test set
    """
    class_count = len(np.unique(y_train))
    oof_proba = np.zeros((len(y_train), class_count), dtype=np.float32)

    for fit_idx, holdout_idx in cv.split(X_train, y_train):
        # A fresh clone per fold, trained on everything outside the fold.
        fold_model = clone(clf)
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])
        oof_proba[holdout_idx] = fold_model.predict_proba(X_train[holdout_idx])

    # Test features come from a model trained on all of the training data.
    full_model = clone(clf)
    full_model.fit(X_train, y_train)

    return oof_proba, full_model.predict_proba(X_test)

In [80]:
# Rebuild the stacked meta-features from a random forest and extra trees
# (300 trees each), using the current `cv` splitter.
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
], X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:17<00:17, 17.69s/it][A
100%|██████████| 2/2 [00:30<00:00, 15.26s/it][A


In [81]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
# NOTE(review): the recorded output shows the solver hitting its iteration limit
# here — consider a larger `max_iter` on the meta-classifier.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.982435

## Задание 6.8

In [82]:
# Stratified 5-fold splitter. The fixed seed keeps the shuffled folds
# identical across kernel restarts.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [83]:
# Rebuild the stacked meta-features from a random forest and extra trees
# (300 trees each), using the current `cv` splitter.
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
], X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:05<00:05,  5.02s/it][A
100%|██████████| 2/2 [00:08<00:00,  4.38s/it][A


In [84]:
# Seed NumPy's global RNG, then switch the meta-classifier to a random forest
# with default hyper-parameters.
np.random.seed(42)
clf = RandomForestClassifier(n_jobs=-1)

In [85]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.981941

## Задание 6.9

In [86]:
# Seed NumPy's global RNG, then switch the meta-classifier to k-nearest
# neighbours with default hyper-parameters.
np.random.seed(42)
clf = KNeighborsClassifier(n_jobs=-1)

In [87]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.98417

## Задание 6.10

In [110]:
# Seed NumPy's global RNG, then switch the meta-classifier to gradient
# boosting with default hyper-parameters.
np.random.seed(42)
clf = GradientBoostingClassifier()

In [111]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.987056

## Задание 6.11

In [112]:
# Stratified 3-fold splitter. The fixed seed keeps the shuffled folds
# identical across kernel restarts.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [113]:
# Rebuild the stacked meta-features from a depth-limited random forest
# (max_depth=24) and extra trees, 300 trees each, using the current `cv` splitter.
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, max_depth=24, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1),
], X_train, X_test, y_train, cv)



  0%|          | 0/2 [00:00<?, ?it/s][A[A

 50%|█████     | 1/2 [00:03<00:03,  3.65s/it][A[A

100%|██████████| 2/2 [00:06<00:00,  3.23s/it][A[A


In [114]:
# Seed NumPy's global RNG, then switch the meta-classifier to extra trees with 100 trees.
np.random.seed(42)
clf = ExtraTreesClassifier(n_jobs=-1, n_estimators=100)

In [115]:
# Train the current meta-classifier `clf` on the stacked features and report its test macro-F1.
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.985029

## Задание 6.12

In [116]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    """Compute meta-features for the meta-classifier, averaging test predictions over folds.

    The meta-features are the class probabilities predicted for a multiclass
    classification problem. Training rows receive out-of-fold predictions; test
    rows receive the mean of the predictions made by every per-fold model.

    :arg clf: classifier to clone for each fold; the passed instance itself is not fitted
    :args X_train, y_train: training set, indexable by integer index arrays
    :arg X_test: test-set features
    :arg cv: fold generator exposing ``split(X, y)`` (e.g. KFold, StratifiedKFold)

    :returns X_meta_train, X_meta_test: new float32 features for the training and
        test sets, one column per class found in ``y_train``
    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)

    n_folds = 0
    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        # Out-of-fold probabilities for the held-out rows.
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        # Accumulate this fold's test predictions; averaged after the loop.
        X_meta_test += folded_clf.predict_proba(X_test)
        n_folds += 1

    # Divide by the number of folds actually produced instead of relying on a
    # splitter attribute. No model is fitted on the full training set here,
    # because the test features come solely from the per-fold models.
    if n_folds:
        X_meta_test /= n_folds

    return X_meta_train, X_meta_test

In [117]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Build stacked meta-feature matrices with fold-averaged test predictions.

    Each classifier is passed to ``compute_meta_feature_mean``; the resulting
    training and test blocks are concatenated column-wise in classifier order.

    :arg classifiers: iterable of base classifiers
    :args X_train, y_train: training set
    :arg X_test: test-set features
    :arg cv: fold generator handed on to ``compute_meta_feature_mean``

    :returns stacked_features_train, stacked_features_test: horizontally
        stacked meta-features for the training and test sets
    """
    train_blocks = []
    test_blocks = []

    # tqdm only reports progress; every classifier is processed exactly once.
    for base_clf in tqdm(classifiers):
        meta_train, meta_test = compute_meta_feature_mean(base_clf, X_train, X_test, y_train, cv)
        train_blocks.append(meta_train)
        test_blocks.append(meta_test)

    return np.hstack(train_blocks), np.hstack(test_blocks)

In [136]:
# Macro-F1 of the random forest on its own. Hard labels from predict() are
# scored because f1_score rejects continuous inputs such as a predict_proba
# column. Requires `rfc` to be created and fitted first (cells In [125] / In [152]).
f1_rfc = f1_score(y_test, rfc.predict(X_test), average='macro')

In [152]:
# Fit the three base models on the full training set.
rfc.fit(X_train, y_train)
etc.fit(X_train, y_train)
lg.fit(X_train, y_train)

# Keep the complete (n_samples, n_classes) probability matrices: the task is
# multiclass, so a single column such as `[:, 1]` cannot be averaged into a
# class prediction. Labels follow from `.argmax(axis=1)` mapped through `classes_`.
y_test_pred_rfc = rfc.predict_proba(X_test)
y_test_pred_etc = etc.predict_proba(X_test)
y_test_pred_lg = lg.predict_proba(X_test)

In [125]:
# Base models for the simple averaging ensemble. This cell was executed (In [125])
# before the cell that fits them (In [152]), so run it first on a fresh kernel.
rfc = RandomForestClassifier(n_estimators=300, max_depth=24, n_jobs=-1)
etc = ExtraTreesClassifier(n_estimators=300, n_jobs=-1)
lg = LogisticRegression(n_jobs=-1)

In [154]:
# Unweighted mean of the three stored prediction arrays.
# NOTE(review): `y` is never converted to class labels or scored in the
# visible cells — confirm the intended next step.
y = (y_test_pred_rfc + y_test_pred_etc + y_test_pred_lg) / 3

In [138]:
# Macro-F1 of the logistic regression on its own. Hard labels from predict()
# are scored because f1_score rejects continuous inputs such as a predict_proba
# column. Requires `lg` to be created and fitted first (cells In [125] / In [152]).
f1_lg = f1_score(y_test, lg.predict(X_test), average='macro')