# Day 09. Exercise 03
# Ensembles

### Запуск контейнера с нужными версиями

docker run -d \
  --platform linux/amd64 \
  -p 8888:8888 \
  -v $(pwd):/home/jovyan/work \
  --name sklearn \
  jupyter/scipy-notebook:python-3.8 \
  bash -c "pip install scikit-learn==0.23.1 tqdm==4.46.1 && start-notebook.sh --NotebookApp.token=''"

#### и выбираем правильный kernel в vscode на localhost (который отдает докер)

In [193]:
import sys
print("Python версия:", sys.version)

import sklearn
print("scikit-learn версия:", sklearn.__version__)

import pandas as pd
print("pandas версия:", pd.__version__)

import numpy as np
print("numpy версия:", np.__version__)

import tqdm
print("tqdm версия:", tqdm.__version__)

Python версия: 3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:10) 
[GCC 10.3.0]
scikit-learn версия: 0.23.1
pandas версия: 1.5.0
numpy версия: 1.23.3
tqdm версия: 4.64.1


## 0. Imports

In [194]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
import joblib
import json
import itertools

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [195]:
df = pd.read_csv('work/src/data/day-of-week-not-scaled.csv')
print(f"Размерность данных: {df.shape}")
print(f"Колонки в файле: {df.columns.tolist()}")

Размерность данных: (1686, 43)
Колонки в файле: ['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1']


In [196]:
df_target = pd.read_csv('work/src/data/dayofweek.csv')

df['dayofweek'] = df_target['dayofweek']

print(f"Новая размерность: {df.shape}")
print("Финальные колонки:", df.columns.tolist())

df.head()

Новая размерность: (1686, 44)
Финальные колонки: ['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1', 'dayofweek']


Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [197]:
print("Распределение целевой переменной:")
df['dayofweek'].value_counts().sort_index()

Распределение целевой переменной:


0    136
1    274
2    149
3    396
4    104
5    271
6    356
Name: dayofweek, dtype: int64

In [198]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=21, 
    stratify=y
)

print("Первое разделение:")
print("Размерность тренировочного набора:", X_train.shape)
print("Размерность тестового набора:", X_test.shape)

Первое разделение:
Размерность тренировочного набора: (1348, 43)
Размерность тестового набора: (338, 43)


In [199]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, 
    test_size=0.2,
    random_state=21, 
    stratify=y_train
)

print("Второе разделение:")
print("Финальная размерность тренировочного набора:", X_train.shape)
print("Размерность валидационного набора:", X_valid.shape)
print("Размерность тестового набора:", X_test.shape)

Второе разделение:
Финальная размерность тренировочного набора: (1078, 43)
Размерность валидационного набора: (270, 43)
Размерность тестового набора: (338, 43)


In [200]:
print("Распределение в тренировочной выборке:")
print(y_train.value_counts().sort_index())
print("Распределение в валидационной выборке:")
print(y_valid.value_counts().sort_index())
print("Распределение в тестовой выборке:")
print(y_test.value_counts().sort_index())

Распределение в тренировочной выборке:
0     87
1    175
2     95
3    253
4     66
5    174
6    228
Name: dayofweek, dtype: int64
Распределение в валидационной выборке:
0    22
1    44
2    24
3    63
4    17
5    43
6    57
Name: dayofweek, dtype: int64
Распределение в тестовой выборке:
0    27
1    55
2    30
3    80
4    21
5    54
6    71
Name: dayofweek, dtype: int64


## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [201]:
print("=== Обучение индивидуальных классификаторов с лучшими параметрами из ex01 ===\n")

svm_ex01_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', "probability": True, "random_state": 21}
svm_best = SVC(**svm_ex01_params)

dt_best = DecisionTreeClassifier(
    max_depth=21,
    criterion='gini',
    class_weight='balanced',
    random_state=21
)

rf_best = RandomForestClassifier(
    n_estimators=100,
    max_depth=24,
    criterion='entropy',
    class_weight='balanced',
    random_state=21
)

=== Обучение индивидуальных классификаторов с лучшими параметрами из ex01 ===



In [202]:
%%time

print("Обучение SVM...")
svm_best.fit(X_train, y_train)

svm_pred = svm_best.predict(X_valid)
svm_accuracy = accuracy_score(y_valid, svm_pred)
svm_precision = precision_score(y_valid, svm_pred, average='weighted')
svm_recall = recall_score(y_valid, svm_pred, average='weighted')

print(f"accuracy is {svm_accuracy:.5f}")
print(f"precision is {svm_precision:.5f}")
print(f"recall is {svm_recall:.5f}")

Обучение SVM...
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
CPU times: user 862 ms, sys: 6.36 ms, total: 868 ms
Wall time: 775 ms


In [203]:
%%time

print("Обучение Decision Tree...")
dt_best.fit(X_train, y_train)

dt_pred = dt_best.predict(X_valid)
dt_accuracy = accuracy_score(y_valid, dt_pred)
dt_precision = precision_score(y_valid, dt_pred, average='weighted')
dt_recall = recall_score(y_valid, dt_pred, average='weighted')

print(f"accuracy is {dt_accuracy:.5f}")
print(f"precision is {dt_precision:.5f}")
print(f"recall is {dt_recall:.5f}")

Обучение Decision Tree...
accuracy is 0.86667
precision is 0.87170
recall is 0.86667
CPU times: user 32.9 ms, sys: 9.05 ms, total: 42 ms
Wall time: 28 ms


In [204]:
%%time

print("Обучение Random Forest...")
rf_best.fit(X_train, y_train)

rf_pred = rf_best.predict(X_valid)
rf_accuracy = accuracy_score(y_valid, rf_pred)
rf_precision = precision_score(y_valid, rf_pred, average='weighted')
rf_recall = recall_score(y_valid, rf_pred, average='weighted')

print(f"accuracy is {rf_accuracy:.5f}")
print(f"precision is {rf_precision:.5f}")
print(f"recall is {rf_recall:.5f}")

Обучение Random Forest...
accuracy is 0.89630
precision is 0.89698
recall is 0.89630
CPU times: user 248 ms, sys: 5.02 ms, total: 253 ms
Wall time: 251 ms


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [205]:
%%time

voting_default = VotingClassifier(
    estimators=[
        ('svm', svm_best),
        ('dt', dt_best),
        ('rf', rf_best)
    ],
    voting='hard'
)

print("Обучение стандартного Voting Classifier...")
voting_default.fit(X_train, y_train)

voting_default_pred = voting_default.predict(X_valid)
voting_default_accuracy = accuracy_score(y_valid, voting_default_pred)
voting_default_precision = precision_score(y_valid, voting_default_pred, average='weighted')
voting_default_recall = recall_score(y_valid, voting_default_pred, average='weighted')

print(f"accuracy is {voting_default_accuracy:.5f}")
print(f"precision is {voting_default_precision:.5f}")
print(f"recall is {voting_default_recall:.5f}")

Обучение стандартного Voting Classifier...
accuracy is 0.90000
precision is 0.89993
recall is 0.90000
CPU times: user 956 ms, sys: 7.09 ms, total: 963 ms
Wall time: 960 ms


In [206]:
%%time

print("=== Эксперименты с различными параметрами голосования ===\n")

voting_experiments = []

voting_types = ['hard', 'soft']
weight_range = range(0, 6)

experiments = []

for voting_type in voting_types:
    for w1, w2, w3 in itertools.product(weight_range, repeat=3):
        if w1 == 0 and w2 == 0 and w3 == 0:
            continue
        
        if (w1 + w2 + w3) <= 15:
            experiments.append({
                'voting': voting_type, 
                'weights': [w1, w2, w3], 
                'name': f'{voting_type}_w{w1}{w2}{w3}'
            })

print(f"Всего экспериментов для запуска: {len(experiments)}")

for i, exp in enumerate(experiments):
    if i % 50 == 0:
        print(f"Прогресс: {i+1}/{len(experiments)}")
    
    voting_clf = VotingClassifier(
        estimators=[
            ('svm', svm_best),
            ('dt', dt_best),
            ('rf', rf_best)
        ],
        voting=exp['voting'],
        weights=exp['weights']
    )
    
    voting_clf.fit(X_train, y_train)
    pred = voting_clf.predict(X_valid)
    
    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='weighted')
    recall = recall_score(y_valid, pred, average='weighted')
    
    voting_experiments.append({
        'name': exp['name'],
        'voting': exp['voting'],
        'weights': exp['weights'],
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'model': voting_clf
    })

top_10 = sorted(voting_experiments, key=lambda x: (x['accuracy'], x['precision']), reverse=True)[:10]
print("\nТоп-10 лучших комбинаций голосования:")
for i, result in enumerate(top_10, 1):
    print(f"{i}. {result['name']}: accuracy={result['accuracy']:.5f}, precision={result['precision']:.5f}, weights={result['weights']}")

=== Эксперименты с различными параметрами голосования ===

Всего экспериментов для запуска: 430
Прогресс: 1/430
Прогресс: 51/430
Прогресс: 101/430
Прогресс: 151/430
Прогресс: 201/430
Прогресс: 251/430
Прогресс: 301/430
Прогресс: 351/430
Прогресс: 401/430

Топ-10 лучших комбинаций голосования:
1. soft_w102: accuracy=0.92593, precision=0.92943, weights=[1, 0, 2]
2. soft_w204: accuracy=0.92593, precision=0.92943, weights=[2, 0, 4]
3. soft_w305: accuracy=0.91481, precision=0.91748, weights=[3, 0, 5]
4. soft_w414: accuracy=0.91111, precision=0.91288, weights=[4, 1, 4]
5. soft_w103: accuracy=0.91111, precision=0.91155, weights=[1, 0, 3]
6. soft_w205: accuracy=0.91111, precision=0.91155, weights=[2, 0, 5]
7. soft_w415: accuracy=0.91111, precision=0.91144, weights=[4, 1, 5]
8. soft_w511: accuracy=0.90741, precision=0.91149, weights=[5, 1, 1]
9. soft_w512: accuracy=0.90741, precision=0.91149, weights=[5, 1, 2]
10. soft_w413: accuracy=0.90741, precision=0.91099, weights=[4, 1, 3]
CPU times: user

In [219]:
%%time

print("=== А теперь для чеклиста без нулей ===\n")

voting_experiments = []

voting_types = ['hard', 'soft']
weight_range = range(1, 5)

experiments = []

for voting_type in voting_types:
    for w1, w2, w3 in itertools.product(weight_range, repeat=3):
        if w1 == 0 and w2 == 0 and w3 == 0:
            continue
        
        experiments.append({
            'voting': voting_type, 
            'weights': [w1, w2, w3], 
            'name': f'{voting_type}_w{w1}{w2}{w3}'
        })

print(f"Всего экспериментов для запуска: {len(experiments)}")

for i, exp in enumerate(experiments):
    if i % 50 == 0:
        print(f"Прогресс: {i+1}/{len(experiments)}")
    
    voting_clf = VotingClassifier(
        estimators=[
            ('svm', svm_best),
            ('dt', dt_best),
            ('rf', rf_best)
        ],
        voting=exp['voting'],
        weights=exp['weights']
    )
    
    voting_clf.fit(X_train, y_train)
    pred = voting_clf.predict(X_valid)
    
    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='weighted')
    recall = recall_score(y_valid, pred, average='weighted')
    
    voting_experiments.append({
        'name': exp['name'],
        'voting': exp['voting'],
        'weights': exp['weights'],
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'model': voting_clf
    })

print(f"Завершены все {len(experiments)} экспериментов!")

top_10 = sorted(voting_experiments, key=lambda x: (x['accuracy'], x['precision']), reverse=True)[:10]
print("\nТоп-10 лучших комбинаций голосования:")
for i, result in enumerate(top_10, 1):
    print(f"{i}. {result['name']}: accuracy={result['accuracy']:.5f}, precision={result['precision']:.5f}, weights={result['weights']}")

=== А теперь для чеклиста без нулей ===

Всего экспериментов для запуска: 128
Прогресс: 1/128
Прогресс: 51/128
Прогресс: 101/128
Завершены все 128 экспериментов!

Топ-10 лучших комбинаций голосования:
1. soft_w414: accuracy=0.91111, precision=0.91288, weights=[4, 1, 4]
2. soft_w413: accuracy=0.90741, precision=0.91099, weights=[4, 1, 3]
3. soft_w411: accuracy=0.90741, precision=0.91026, weights=[4, 1, 1]
4. soft_w412: accuracy=0.90741, precision=0.91026, weights=[4, 1, 2]
5. soft_w324: accuracy=0.90741, precision=0.91012, weights=[3, 2, 4]
6. hard_w232: accuracy=0.90741, precision=0.90773, weights=[2, 3, 2]
7. hard_w243: accuracy=0.90741, precision=0.90773, weights=[2, 4, 3]
8. hard_w342: accuracy=0.90741, precision=0.90773, weights=[3, 4, 2]
9. hard_w343: accuracy=0.90741, precision=0.90773, weights=[3, 4, 3]
10. soft_w322: accuracy=0.90370, precision=0.90610, weights=[3, 2, 2]
CPU times: user 1min 46s, sys: 207 ms, total: 1min 46s
Wall time: 1min 46s


In [208]:
voting_experiments.append({
    'name': 'hard_equal',
    'accuracy': voting_default_accuracy,
    'precision': voting_default_precision,
    'recall': voting_default_recall,
    'model': voting_default
})

best_voting = max(voting_experiments, key=lambda x: (x['accuracy'], x['precision']))
print(f"Лучший voting classifier: {best_voting['name']}")
print(f"Точность на валидации: {best_voting['accuracy']:.5f}")
print(f"Precision на валидации: {best_voting['precision']:.5f}")

best_voting_model = best_voting['model']
voting_test_pred = best_voting_model.predict(X_test)
voting_test_accuracy = accuracy_score(y_test, voting_test_pred)
voting_test_precision = precision_score(y_test, voting_test_pred, average='weighted')
voting_test_recall = recall_score(y_test, voting_test_pred, average='weighted')

print(f"\nЛучший Voting Classifier - результаты на тестовом наборе:")
print(f"accuracy is {voting_test_accuracy:.5f}")
print(f"precision is {voting_test_precision:.5f}")
print(f"recall is {voting_test_recall:.5f}")

Лучший voting classifier: soft_w414
Точность на валидации: 0.91111
Precision на валидации: 0.91288

Лучший Voting Classifier - результаты на тестовом наборе:
accuracy is 0.90533
precision is 0.90881
recall is 0.90533


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [209]:
%%time

print("=== Шаг 1: Различные n_estimators ===")

n_estimators_values = [5, 10, 25, 50]
bagging_results = []

for n_est in n_estimators_values:
    print(f"\nТестируем n_estimators={n_est}")
    
    bagging_svm = BaggingClassifier(
        base_estimator=svm_best,
        n_estimators=n_est,
        bootstrap_features=True,
        random_state=21
    )
    
    bagging_svm.fit(X_train, y_train)
    pred = bagging_svm.predict(X_valid)
    
    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='weighted')
    recall = recall_score(y_valid, pred, average='weighted')
    
    bagging_results.append({
        'n_estimators': n_est,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'model': bagging_svm
    })
    
    print(f"accuracy is {accuracy:.5f}")
    print(f"precision is {precision:.5f}")
    print(f"recall is {recall:.5f}")

best_n_est_result = max(bagging_results, key=lambda x: (x['accuracy'], x['precision']))
print(f"\nЛучший n_estimators: {best_n_est_result['n_estimators']}")

=== Шаг 1: Различные n_estimators ===

Тестируем n_estimators=5
accuracy is 0.70370
precision is 0.73089
recall is 0.70370

Тестируем n_estimators=10
accuracy is 0.77778
precision is 0.81225
recall is 0.77778

Тестируем n_estimators=25
accuracy is 0.82222
precision is 0.84354
recall is 0.82222

Тестируем n_estimators=50
accuracy is 0.83333
precision is 0.85141
recall is 0.83333

Лучший n_estimators: 50
CPU times: user 21.4 s, sys: 11.6 ms, total: 21.4 s
Wall time: 21.4 s


In [210]:
%%time

print("=== Шаг 2: Другие параметры ===\n")

best_n_est = best_n_est_result['n_estimators']
print(f"Используем лучший n_estimators={best_n_est}")

other_params = [
    {'max_samples': 0.5, 'max_features': 1.0},
    {'max_samples': 0.7, 'max_features': 1.0},
    {'max_samples': 0.8, 'max_features': 1.0},
    {'max_samples': 1.0, 'max_features': 0.5},
    {'max_samples': 1.0, 'max_features': 0.7},
    {'max_samples': 1.0, 'max_features': 0.8},
    {'max_samples': 0.8, 'max_features': 0.8},
]

for params in other_params:
    print(f"\nТестируем max_samples={params['max_samples']}, max_features={params['max_features']}")
    
    bagging_svm = BaggingClassifier(
        base_estimator=svm_best,
        n_estimators=50,
        max_samples=params['max_samples'],
        max_features=params['max_features'],
        random_state=21
    )
    
    bagging_svm.fit(X_train, y_train)
    pred = bagging_svm.predict(X_valid)
    
    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='weighted')
    recall = recall_score(y_valid, pred, average='weighted')
    
    bagging_results.append({
        'n_estimators': best_n_est,
        'max_samples': params['max_samples'],
        'max_features': params['max_features'],
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'model': bagging_svm
    })
    
    print(f"accuracy is {accuracy:.5f}")
    print(f"precision is {precision:.5f}")
    print(f"recall is {recall:.5f}")

=== Шаг 2: Другие параметры ===

Используем лучший n_estimators=50

Тестируем max_samples=0.5, max_features=1.0
accuracy is 0.83333
precision is 0.84955
recall is 0.83333

Тестируем max_samples=0.7, max_features=1.0
accuracy is 0.85556
precision is 0.86899
recall is 0.85556

Тестируем max_samples=0.8, max_features=1.0
accuracy is 0.87037
precision is 0.88092
recall is 0.87037

Тестируем max_samples=1.0, max_features=0.5
accuracy is 0.74815
precision is 0.78443
recall is 0.74815

Тестируем max_samples=1.0, max_features=0.7
accuracy is 0.84444
precision is 0.85762
recall is 0.84444

Тестируем max_samples=1.0, max_features=0.8
accuracy is 0.87037
precision is 0.87699
recall is 0.87037

Тестируем max_samples=0.8, max_features=0.8
accuracy is 0.85926
precision is 0.86804
recall is 0.85926
CPU times: user 1min 3s, sys: 40.1 ms, total: 1min 3s
Wall time: 1min 3s


In [211]:
final_best_bagging = max(bagging_results, key=lambda x: (x['accuracy'], x['precision']))

print(f"Лучшие параметры bagging:")
for key, value in final_best_bagging.items():
    if key != 'model':
        print(f"{key}: {value}")

best_bagging_model = final_best_bagging['model']
test_pred = best_bagging_model.predict(X_test)

bagging_test_accuracy = accuracy_score(y_test, test_pred)
bagging_test_precision = precision_score(y_test, test_pred, average='weighted')
bagging_test_recall = recall_score(y_test, test_pred, average='weighted')

print(f"\nЛучший Bagging - результаты на тестовом наборе:")
print(f"accuracy is {bagging_test_accuracy:.5f}")
print(f"precision is {bagging_test_precision:.5f}")
print(f"recall is {bagging_test_recall:.5f}")

Лучшие параметры bagging:
n_estimators: 50
max_samples: 0.8
max_features: 1.0
accuracy: 0.8703703703703703
precision: 0.8809178986483951
recall: 0.8703703703703703

Лучший Bagging - результаты на тестовом наборе:
accuracy is 0.85503
precision is 0.86157
recall is 0.85503


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [212]:
%%time

print("=== Эксперименты с Stacking Classifier ===\n")

n_splits_values = [2, 3, 4, 5, 6, 7]
passthrough_values = [True, False]
stacking_experiments = []

for n_splits in n_splits_values:
    for passthrough in passthrough_values:
        print(f"\nТестируем n_splits={n_splits}, passthrough={passthrough}")
        
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        
        stacking_clf = StackingClassifier(
            estimators=[
                ('svm', svm_best),
                ('dt', dt_best),
                ('rf', rf_best)
            ],
            final_estimator=LogisticRegression(solver='liblinear'),
            cv=cv,
            passthrough=passthrough
        )
        
        stacking_clf.fit(X_train, y_train)
        pred = stacking_clf.predict(X_valid)
        
        accuracy = accuracy_score(y_valid, pred)
        precision = precision_score(y_valid, pred, average='weighted')
        recall = recall_score(y_valid, pred, average='weighted')
        
        stacking_experiments.append({
            'n_splits': n_splits,
            'passthrough': passthrough,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'model': stacking_clf
        })
        
        print(f"accuracy is {accuracy:.5f}")
        print(f"precision is {precision:.5f}")
        print(f"recall is {recall:.5f}")

=== Эксперименты с Stacking Classifier ===


Тестируем n_splits=2, passthrough=True
accuracy is 0.90370
precision is 0.90619
recall is 0.90370

Тестируем n_splits=2, passthrough=False
accuracy is 0.89630
precision is 0.89678
recall is 0.89630

Тестируем n_splits=3, passthrough=True
accuracy is 0.90370
precision is 0.90632
recall is 0.90370

Тестируем n_splits=3, passthrough=False
accuracy is 0.89630
precision is 0.89759
recall is 0.89630

Тестируем n_splits=4, passthrough=True
accuracy is 0.91111
precision is 0.91327
recall is 0.91111

Тестируем n_splits=4, passthrough=False
accuracy is 0.90370
precision is 0.90570
recall is 0.90370

Тестируем n_splits=5, passthrough=True
accuracy is 0.90000
precision is 0.90217
recall is 0.90000

Тестируем n_splits=5, passthrough=False
accuracy is 0.90000
precision is 0.90056
recall is 0.90000

Тестируем n_splits=6, passthrough=True
accuracy is 0.90370
precision is 0.90450
recall is 0.90370

Тестируем n_splits=6, passthrough=False
accuracy is 0.90370


In [213]:
best_stacking = max(stacking_experiments, key=lambda x: (x['accuracy'], x['precision']))
print(f"Лучшие параметры stacking classifier:")
print(f"n_splits: {best_stacking['n_splits']}")
print(f"passthrough: {best_stacking['passthrough']}")
print(f"Точность на валидации: {best_stacking['accuracy']:.5f}")
print(f"Precision на валидации: {best_stacking['precision']:.5f}")

best_stacking_model = best_stacking['model']
stacking_test_pred = best_stacking_model.predict(X_test)
stacking_test_accuracy = accuracy_score(y_test, stacking_test_pred)
stacking_test_precision = precision_score(y_test, stacking_test_pred, average='weighted')
stacking_test_recall = recall_score(y_test, stacking_test_pred, average='weighted')

print(f"\nЛучший Stacking Classifier - результаты на тестовом наборе:")
print(f"accuracy is {stacking_test_accuracy:.5f}")
print(f"precision is {stacking_test_precision:.5f}")
print(f"recall is {stacking_test_recall:.5f}")

Лучшие параметры stacking classifier:
n_splits: 4
passthrough: True
Точность на валидации: 0.91111
Precision на валидации: 0.91327

Лучший Stacking Classifier - результаты на тестовом наборе:
accuracy is 0.90533
precision is 0.90844
recall is 0.90533


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [214]:
print("=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===\n")

all_models = {
    'SVM': {'accuracy': svm_accuracy, 'precision': svm_precision, 'model': svm_best},
    'Decision Tree': {'accuracy': dt_accuracy, 'precision': dt_precision, 'model': dt_best},
    'Random Forest': {'accuracy': rf_accuracy, 'precision': rf_precision, 'model': rf_best},
    'Best Voting': {'accuracy': voting_test_accuracy, 'precision': voting_test_precision, 'model': best_voting_model},
    'Best Bagging': {'accuracy': bagging_test_accuracy, 'precision': bagging_test_precision, 'model': best_bagging_model},
    'Best Stacking': {'accuracy': stacking_test_accuracy, 'precision': stacking_test_precision, 'model': best_stacking_model}
}

print("Производительность всех моделей на тестовом наборе:")
for model_name, metrics in all_models.items():
    print(f"{model_name}: accuracy={metrics['accuracy']:.5f}, precision={metrics['precision']:.5f}")

best_model_name = max(all_models.keys(), key=lambda x: (all_models[x]['accuracy'], all_models[x]['precision']))
best_final_model = all_models[best_model_name]['model']
best_final_accuracy = all_models[best_model_name]['accuracy']
best_final_precision = all_models[best_model_name]['precision']

print(f"\nЛучшая модель: {best_model_name}")
print(f"Лучшая точность на тесте: {best_final_accuracy:.5f}")
print(f"Лучший precision на тесте: {best_final_precision:.5f}")

=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===

Производительность всех моделей на тестовом наборе:
SVM: accuracy=0.87778, precision=0.88162
Decision Tree: accuracy=0.86667, precision=0.87170
Random Forest: accuracy=0.89630, precision=0.89698
Best Voting: accuracy=0.90533, precision=0.90881
Best Bagging: accuracy=0.85503, precision=0.86157
Best Stacking: accuracy=0.90533, precision=0.90844

Лучшая модель: Best Voting
Лучшая точность на тесте: 0.90533
Лучший precision на тесте: 0.90881


In [215]:
print("=== Анализ ошибок ===\n")

full_predictions = best_final_model.predict(X)

cm_full = confusion_matrix(y, full_predictions)
print("Матрица ошибок (полный датасет):")
print(cm_full)

weekdays = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'Пятница', 'Суббота', 'Воскресенье']
weekday_errors = {}

print("\nПроцент ошибок по дням недели (% от общего количества образцов этого класса):")
for i, day in enumerate(weekdays):
    total_samples = sum(cm_full[i, :])
    correct_predictions = cm_full[i, i]
    if total_samples > 0:
        error_rate = (total_samples - correct_predictions) / total_samples * 100
        weekday_errors[day] = error_rate
        print(f"{day} (класс {i}): {error_rate:.2f}% ошибок ({total_samples - correct_predictions}/{total_samples} ошибок)")

worst_weekday = max(weekday_errors, key=weekday_errors.get) if weekday_errors else 'Нет'
worst_weekday_rate = weekday_errors[worst_weekday] if weekday_errors else 0
print(f"\nДень недели с наибольшими ошибками: {worst_weekday} ({worst_weekday_rate:.2f}% ошибок)")

=== Анализ ошибок ===

Матрица ошибок (полный датасет):
[[123   2   0   1   0   1   9]
 [  1 263   2   3   0   4   1]
 [  0   2 141   5   0   1   0]
 [  0   2   0 389   0   2   3]
 [  0   0   0   0 101   3   0]
 [  0   1   0   4   0 258   8]
 [  1   0   0   4   0   6 345]]

Процент ошибок по дням недели (% от общего количества образцов этого класса):
Понедельник (класс 0): 9.56% ошибок (13/136 ошибок)
Вторник (класс 1): 4.01% ошибок (11/274 ошибок)
Среда (класс 2): 5.37% ошибок (8/149 ошибок)
Четверг (класс 3): 1.77% ошибок (7/396 ошибок)
Пятница (класс 4): 2.88% ошибок (3/104 ошибок)
Суббота (класс 5): 4.80% ошибок (13/271 ошибок)
Воскресенье (класс 6): 3.09% ошибок (11/356 ошибок)

День недели с наибольшими ошибками: Понедельник (9.56% ошибок)


In [216]:
print("=== Анализ ошибок по Labname ===\n")

error_df = df.copy()
error_df['predicted'] = full_predictions
error_df['is_error'] = (error_df['dayofweek'] != error_df['predicted'])

labname_cols = [col for col in df.columns if col.startswith('labname_')]

labname_errors = {}
print("Процент ошибок по labname:")
for col in labname_cols:
    lab_name = col.replace('labname_', '')
    lab_samples = error_df[error_df[col] == 1.0]
    if len(lab_samples) > 0:
        error_count = lab_samples['is_error'].sum()
        error_rate = lab_samples['is_error'].mean() * 100
        labname_errors[lab_name] = error_rate
        print(f"{lab_name}: {error_rate:.2f}% ошибок ({error_count}/{len(lab_samples)} ошибок)")

if labname_errors:
    worst_lab = max(labname_errors, key=labname_errors.get)
    worst_lab_rate = labname_errors[worst_lab]
    print(f"\nLabname с наибольшими ошибками: {worst_lab} ({worst_lab_rate:.2f}% ошибок)")
else:
    worst_lab = 'Нет'
    worst_lab_rate = 0

=== Анализ ошибок по Labname ===

Процент ошибок по labname:
code_rvw: 6.10% ошибок (5/82 ошибок)
lab02: 0.00% ошибок (0/2 ошибок)
lab03: 100.00% ошибок (1/1 ошибок)
lab03s: 0.00% ошибок (0/1 ошибок)
lab05s: 11.11% ошибок (4/36 ошибок)
laba04: 5.62% ошибок (10/178 ошибок)
laba04s: 9.62% ошибок (10/104 ошибок)
laba05: 1.35% ошибок (3/222 ошибок)
laba06: 6.25% ошибок (3/48 ошибок)
laba06s: 6.56% ошибок (4/61 ошибок)
project1: 2.73% ошибок (26/951 ошибок)

Labname с наибольшими ошибками: lab03 (100.00% ошибок)


In [217]:
print("=== Анализ ошибок по пользователям ===\n")

uid_cols = [col for col in df.columns if col.startswith('uid_')]

user_errors = {}
for col in uid_cols:
    user_name = col.replace('uid_', '')
    user_samples = error_df[error_df[col] == 1.0]
    if len(user_samples) > 5:
        error_count = user_samples['is_error'].sum()
        error_rate = user_samples['is_error'].mean() * 100
        user_errors[user_name] = {
            'error_rate': error_rate, 
            'total_samples': len(user_samples), 
            'errors': error_count
        }

print("Топ-10 пользователей с наибольшим процентом ошибок:")
sorted_users = sorted(user_errors.items(), key=lambda x: x[1]['error_rate'], reverse=True)[:10]
for user, stats in sorted_users:
    print(f"user_{user}: {stats['error_rate']:.2f}% ошибок ({int(stats['errors'])}/{stats['total_samples']} ошибок)")

if sorted_users:
    worst_user = f"user_{sorted_users[0][0]}"
    worst_user_rate = sorted_users[0][1]['error_rate']
    print(f"\nПользователь с наибольшими ошибками: {worst_user} ({worst_user_rate:.2f}% ошибок)")
else:
    worst_user = 'Нет'
    worst_user_rate = 0

=== Анализ ошибок по пользователям ===

Топ-10 пользователей с наибольшим процентом ошибок:
user_user_22: 28.57% ошибок (2/7 ошибок)
user_user_6: 25.00% ошибок (3/12 ошибок)
user_user_17: 14.71% ошибок (5/34 ошибок)
user_user_16: 6.25% ошибок (2/32 ошибок)
user_user_15: 5.88% ошибок (1/17 ошибок)
user_user_2: 5.79% ошибок (7/121 ошибок)
user_user_18: 5.71% ошибок (2/35 ошибок)
user_user_30: 5.13% ошибок (2/39 ошибок)
user_user_25: 5.00% ошибок (6/120 ошибок)
user_user_29: 4.69% ошибок (3/64 ошибок)

Пользователь с наибольшими ошибками: user_user_22 (28.57% ошибок)


In [None]:
model_filename = 'work/src/ex03/model/best_model_ensembles.joblib'
joblib.dump(best_final_model, model_filename)
print(f"Лучшая ансамблевая модель сохранена как: {model_filename}")

model_info = {
    'best_model_type': best_model_name,
    'test_accuracy': float(best_final_accuracy),
    'test_precision': float(best_final_precision),
    'error_analysis': {
        'worst_weekday': worst_weekday,
        'worst_weekday_error_rate': float(worst_weekday_rate),
        'worst_labname': worst_lab,
        'worst_labname_error_rate': float(worst_lab_rate),
        'worst_user': worst_user,
        'worst_user_error_rate': float(worst_user_rate)
    },
    'all_models_performance': {
        name: {'accuracy': float(metrics['accuracy']), 'precision': float(metrics['precision'])} 
        for name, metrics in all_models.items()
    }
}

info_filename = 'work/src/ex03/model/model_info_ensembles.json'
with open(info_filename, 'w') as f:
    json.dump(model_info, f, indent=2)

print(f"Информация о модели сохранена в файл: {info_filename}")

Лучшая ансамблевая модель сохранена как: work/src/ex03/model/best_model_ensembles.joblib
Информация о модели сохранена в: work/src/ex03/model/model_info_ensembles.json
