# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import confusion_matrix
from joblib import dump

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
# Load the not-scaled day-of-week dataset and display it.
DATASET_PATH = '../dataset/day-of-week-not-scaled.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1682,6,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1683,7,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1684,8,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# The target is the day of week; every remaining column is used as a feature.
y = df['dayofweek']
X = df.drop(columns='dayofweek')

# Hold out 20% of the rows as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

Здесь параметр stratify=y используется для того, чтобы сохранить пропорцию классов в целевой переменной при разбиении данных.

In [4]:
# Rebuild the full (pre-validation) training portion from X / y instead of
# reading X_train / y_train, because this cell overwrites those names below.
# Reading them would make the cell non-idempotent: running it a second time
# would split the already reduced training set again.
# The call repeats the first split with the same arguments and seed, so it
# yields the same rows; [0::2] keeps the (X_train, y_train) pair of the
# four returned pieces.
XX, yy = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)[0::2]

# Carve 20% of the training portion out as a stratified validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    XX,
    yy,
    test_size=0.2,
    random_state=21,
    stratify=yy,
)

После этого нужно дополнительно разделить тренировочный набор на две части: новую тренировочную часть и валидационную часть. Это делается для оценки модели на данных, которые модель еще не видела после обучения на основной тренировочной выборке.

В этом случае test_size=0.2 означает, что 20% от исходного тренировочного набора будет использовано для валидации.

Valid (валидация) – это процесс проверки качества модели на промежуточных этапах обучения. Валидационная выборка (X_valid) используется для контроля за тем, насколько хорошо модель обобщает данные, которых она ранее не видела. Это помогает избежать переобучения и недоучивания модели.

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [5]:
# RBF-kernel SVM, fitted on the training split and scored on the validation split.
svc = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=21)
svm_preds = svc.fit(X_train, y_train).predict(X_valid)

# Precision and recall are class-frequency-weighted averages.
svm_report = [
    f"accuracy is {accuracy_score(y_valid, svm_preds):.5f}",
    f"precision is {precision_score(y_valid, svm_preds, average='weighted'):.5f}",
    f"recall is {recall_score(y_valid, svm_preds, average='weighted'):.5f}\n",
]
print("SVM:", *svm_report, sep="\n")

SVM:
accuracy is 0.87778
precision is 0.88162
recall is 0.87778



In [6]:
# Decision tree with balanced class weights, fitted on the training split
# and scored on the validation split.
dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=22,
    class_weight='balanced',
    random_state=21,
)
dts_preds = dtc.fit(X_train, y_train).predict(X_valid)

tree_accuracy = accuracy_score(y_valid, dts_preds)
tree_precision = precision_score(y_valid, dts_preds, average='weighted')
tree_recall = recall_score(y_valid, dts_preds, average='weighted')

print("Tree:")
print(f"accuracy is {tree_accuracy:.5f}")
print(f"precision is {tree_precision:.5f}")
print(f"recall is {tree_recall:.5f}\n")

Tree:
accuracy is 0.86667
precision is 0.86984
recall is 0.86667



In [7]:
# Random forest of 50 trees, fitted on the training split and scored on the
# validation split.
rfc = RandomForestClassifier(
    criterion='gini',
    n_estimators=50,
    max_depth=28,
    random_state=21,
)
rfc_preds = rfc.fit(X_train, y_train).predict(X_valid)

print("RandomForestClassifier")
print(f"accuracy is {accuracy_score(y_valid, rfc_preds):.5f}")
# The two averaged metrics share the same call shape, so report them in a loop.
for metric_label, metric_fn in (("precision", precision_score), ("recall", recall_score)):
    line_end = "\n\n" if metric_label == "recall" else "\n"
    print(f"{metric_label} is {metric_fn(y_valid, rfc_preds, average='weighted'):.5f}", end=line_end)

RandomForestClassifier
accuracy is 0.89259
precision is 0.89361
recall is 0.89259



## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

VotingClassifier — это класс в библиотеке Scikit-learn, который реализует технику ансамблевого обучения, известную как голосование. Эта техника объединяет прогнозы нескольких базовых моделей для получения окончательного результата. Основное преимущество использования VotingClassifier заключается в том, что он может улучшить общую точность классификации путем комбинирования сильных сторон различных алгоритмов машинного обучения.

In [8]:
# Hard-voting ensemble over the three classifiers defined above;
# verbose=True logs the fitting time of each estimator.
vc = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('tree', dtc),
        ('forest', rfc),
    ],
    voting='hard',
    verbose=True,
)

In [9]:
# Fit the unweighted voting ensemble and score it on the validation split.
preds = vc.fit(X_train, y_train).predict(X_valid)

vc_report = (
    f"Accuracy: {accuracy_score(y_valid, preds):.5f}",
    f"Precision: {precision_score(y_valid, preds, average='weighted'):.5f}",
    f"Recall: {recall_score(y_valid, preds, average='weighted'):.5f}",
)
print(*vc_report, sep="\n")

[Voting] ...................... (1 of 3) Processing svc, total=   0.4s
[Voting] ..................... (2 of 3) Processing tree, total=   0.0s
[Voting] ................... (3 of 3) Processing forest, total=   0.1s
Accuracy: 0.89630
Precision: 0.89605
Recall: 0.89630


In [10]:
# Candidate vote weights, in the same order as the estimators in `models`.
weights = [(1, 2, 2), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
models = [('svm', svc), ('dt', dtc), ('rf', rfc)]

best_weights = None
best_acc, best_prec = 0, 0

for w in weights:
    # Fit a weighted hard-voting ensemble and score it on the validation split.
    voting_clf = VotingClassifier(estimators=models, voting='hard', weights=w)
    preds = voting_clf.fit(X_train, y_train).predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    prec = precision_score(y_valid, preds, average='macro')

    # Lexicographic comparison: higher accuracy wins, precision breaks ties.
    if (acc, prec) > (best_acc, best_prec):
        best_weights, best_acc, best_prec = w, acc, prec

print(f"Best weights found: {best_weights} with Accuracy: {best_acc:.5f}, Precision: {best_prec:.5f}")

Best weights found: (1, 2, 2) with Accuracy: 0.90370, Precision: 0.90802


In [11]:
# Rebuild the voting ensemble with the best weights, fit it on the training
# split and predict the held-out test set.
voting_clf = VotingClassifier(estimators=models, voting='hard', weights=best_weights)
final_preds = voting_clf.fit(X_train, y_train).predict(X_test)

# Final metrics on the test set (macro-averaged precision and recall).
final_acc = accuracy_score(y_test, final_preds)
final_prec, final_rec = (
    metric(y_test, final_preds, average='macro')
    for metric in (precision_score, recall_score)
)

print("Final Results on Test Set:")
print(f"Accuracy: {final_acc:.5f}, Precision: {final_prec:.5f}, Recall: {final_rec:.5f}")

Final Results on Test Set:
Accuracy: 0.91716, Precision: 0.91767, Recall: 0.90344


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [12]:
# Baseline bagging ensemble: 10 bootstrap-trained copies of the RBF SVM,
# fitted on the training split and scored on the test set.
svc = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=21)
bc = BaggingClassifier(estimator=svc, n_estimators=10, random_state=21)
y_pred = bc.fit(X_train, y_train).predict(X_test)

bagging_report = (
    f"Accuracy: {accuracy_score(y_test, y_pred):.5f}",
    f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}",
    f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}",
)
print(*bagging_report, sep="\n")

Accuracy: 0.86391
Precision: 0.86966
Recall: 0.86391


In [13]:
# Grid of BaggingClassifier settings to compare by cross-validated accuracy.
param_grid = dict(
    n_estimators=[10, 30, 50],
    warm_start=[True, False],
    bootstrap=[True, False],
)

gs = GridSearchCV(
    estimator=bc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_train, y_train)
print(f'лучшие параметры: {gs.best_params_}')
print(f'лучший score: {gs.best_score_}')

# Score the search object's predictions on the held-out test set.
y_pred = gs.predict(X_test)
for line in (
    f"Accuracy: {accuracy_score(y_test, y_pred):.5f}",
    f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}",
    f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}",
):
    print(line)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
лучшие параметры: {'bootstrap': False, 'n_estimators': 30, 'warm_start': True}
лучший score: 0.8431955211024977
Accuracy: 0.88757
Precision: 0.88906
Recall: 0.88757


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set. Try different values of `n_splits` (`[2, 3, 4, 5, 6, 7]`) in the cross-validation generator and of the `passthrough` parameter in the classifier itself.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [14]:
def evaluate_stacking_classifier(n_splits, passthrough):
    """Fit a stacking ensemble on the training split and score it on the validation split.

    Reads the notebook globals ``models``, ``X_train``, ``y_train``,
    ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    n_splits : int
        Number of folds of the ``StratifiedKFold(shuffle=True, random_state=21)``
        generator passed to the stacking classifier as ``cv``.
    passthrough : bool
        Forwarded unchanged to ``StackingClassifier(passthrough=...)``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)`` on the validation split; precision
        and recall are macro-averaged.
    """
    # Seeded cross-validation generator so the result is reproducible.
    cv_generator = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)

    # Stacking ensemble with a liblinear logistic regression as the final estimator.
    stacking_clf = StackingClassifier(
        estimators=models,
        final_estimator=LogisticRegression(solver='liblinear'),
        cv=cv_generator,
        passthrough=passthrough
    )

    # Train on the training split.
    stacking_clf.fit(X_train, y_train)

    # Predict the validation split.
    y_pred = stacking_clf.predict(X_valid)

    # Compute the metrics.
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    recall = recall_score(y_valid, y_pred, average='macro')

    return accuracy, precision, recall

# Try every combination of fold count and passthrough flag.
# Each record is (n_splits, passthrough, accuracy, precision, recall).
results = []
for n_splits in [2, 3, 4, 5, 6, 7]:
    for passthrough in [True, False]:
        accuracy, precision, recall = evaluate_stacking_classifier(n_splits, passthrough)
        results.append((n_splits, passthrough, accuracy, precision, recall))

# Pick the best configuration by accuracy; when accuracies tie, the higher
# precision wins (the key previously ignored precision, so a tie was
# resolved by iteration order instead).
best_result = max(results, key=lambda r: (r[2], r[3]))
print(f'Best params: n_splits={best_result[0]}, passthrough={best_result[1]}')
print(f'Accuracy: {best_result[2]:.4f}, Precision: {best_result[3]:.4f}, Recall: {best_result[4]:.4f}')

Best params: n_splits=5, passthrough=False
Accuracy: 0.9148, Precision: 0.9221, Recall: 0.9123


In [15]:
# Unpack the winning configuration: the first two fields of best_result are
# the fold count and the passthrough flag.
best_n_splits, best_passthrough = best_result[:2]

stacking_clf_best = StackingClassifier(
    estimators=models,
    final_estimator=LogisticRegression(solver='liblinear'),
    cv=StratifiedKFold(n_splits=best_n_splits, shuffle=True, random_state=21),
    passthrough=best_passthrough,
)

# Train on the training and validation data combined.
X_train_valid = pd.concat([X_train, X_valid])
y_train_valid = pd.concat([y_train, y_valid])
stacking_clf_best.fit(X_train_valid, y_train_valid)

# Predict the held-out test set.
y_pred_test = stacking_clf_best.predict(X_test)

# Test-set metrics (macro-averaged precision and recall).
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision, test_recall = (
    metric(y_test, y_pred_test, average='macro')
    for metric in (precision_score, recall_score)
)

print(f'Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}')

Test Accuracy: 0.9320, Test Precision: 0.9335, Test Recall: 0.9148


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [16]:
# Predict the test set with the best stacking model.
y_pred = stacking_clf_best.predict(X_test)

# Confusion matrix of true versus predicted labels on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[21,  0,  1,  0,  1,  1,  3],
       [ 0, 51,  1,  2,  0,  1,  0],
       [ 0,  0, 28,  2,  0,  0,  0],
       [ 1,  0,  0, 77,  0,  1,  1],
       [ 0,  0,  0,  0, 19,  2,  0],
       [ 0,  0,  0,  2,  1, 50,  1],
       [ 0,  0,  0,  1,  0,  1, 69]], dtype=int64)

In [17]:
# NOTE(review): assumes the confusion-matrix rows/columns are the class labels
# 0..6 in sorted order and that 0 is Monday - confirm against the dataset.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df_cm = pd.DataFrame(cm, index=day_names, columns=day_names)

# Error rate per class, in percent: the share of each row's samples that is
# off the diagonal.
total_samples_per_class = cm.sum(axis=1)
misclassified_per_class = total_samples_per_class - cm.diagonal()
error_rates = dict(zip(day_names, misclassified_per_class / total_samples_per_class * 100))

# Order the days from the highest error rate to the lowest.
sorted_error_rates = sorted(error_rates.items(), key=lambda item: item[1], reverse=True)

print("Days with highest error rates:")
print(*(f"{day}: {rate:.2f}%" for day, rate in sorted_error_rates), sep="\n")

Days with highest error rates:
Mon: 22.22%
Fri: 9.52%
Sat: 7.41%
Tue: 7.27%
Wed: 6.67%
Thu: 3.75%
Sun: 2.82%


In [18]:
# Persist the fitted best stacking model next to the notebook.
dump(value=stacking_clf_best, filename='best_model.joblib')

['best_model.joblib']