# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [35]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [8]:
def calc_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(f"accuracy is {accuracy:.5f}")
    print(f"precision is {precision:.5f}")
    print(f"recall is {recall:.5f}")

In [9]:
%%time
svm_best_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}

svm_model = SVC(**svm_best_params)
svm_model.fit(X_train, y_train)

calc_metrics(svm_model, X_valid, y_valid)

accuracy is 0.87870
precision is 0.88001
recall is 0.87870
CPU times: user 868 ms, sys: 0 ns, total: 868 ms
Wall time: 857 ms


In [10]:
%%time
tree_best_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}

dt_model = DecisionTreeClassifier(**tree_best_params)

dt_model.fit(X_train, y_train)

calc_metrics(dt_model, X_valid, y_valid)

accuracy is 0.86296
precision is 0.86657
recall is 0.86296
CPU times: user 68.6 ms, sys: 22.8 ms, total: 91.4 ms
Wall time: 85.5 ms


In [11]:
%%time
rf_best_params={'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}

rf = RandomForestClassifier(**rf_best_params)
rf.fit(X_train, y_train)

calc_metrics(rf, X_valid, y_valid)

accuracy is 0.89259
precision is 0.89360
recall is 0.89259
CPU times: user 420 ms, sys: 25.2 ms, total: 446 ms
Wall time: 441 ms


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [12]:
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt_model), ('svm', svm_model)], voting='soft')

voting_clf.fit(X_train, y_train)

In [13]:
calc_metrics(voting_clf, X_valid, y_valid)

accuracy is 0.87407
precision is 0.87687
recall is 0.87407


In [14]:
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt_model), ('svm', svm_model)], voting='hard')

voting_clf.fit(X_train, y_train)
calc_metrics(voting_clf, X_valid, y_valid)

accuracy is 0.91111
precision is 0.91116
recall is 0.91111


In [21]:
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt_model), ('svm', svm_model)], 
                              voting='soft', 
                              weights=[4,1,4])

voting_clf.fit(X_train, y_train)
calc_metrics(voting_clf, X_valid, y_valid)

accuracy is 0.90741
precision is 0.90919
recall is 0.90741


In [22]:
calc_metrics(voting_clf, X_test, y_test)

accuracy is 0.90533
precision is 0.90943
recall is 0.90533


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [24]:
best_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}

svm_model = SVC(**best_params)

bagging_clf = BaggingClassifier(estimator=svm_model, n_estimators=10, random_state=21)

bagging_clf.fit(X_train, y_train)

In [25]:
calc_metrics(bagging_clf, X_valid, y_valid)

accuracy is 0.88519
precision is 0.89427
recall is 0.88519


In [28]:
%%time

param_grid = {'n_estimators': [10, 30, 50]}

gs = GridSearchCV(bagging_clf, param_grid, scoring='accuracy')
gs.fit(X_train, y_train)
gs.best_params_

CPU times: user 2min 25s, sys: 49.7 ms, total: 2min 25s
Wall time: 2min 25s


{'n_estimators': 50}

In [32]:
best_params = {'n_estimators': 50}
calc_metrics(BaggingClassifier(**best_params).fit(X_train, y_train), 
             X_test, 
             y_test)


accuracy is 0.88462
precision is 0.88779
recall is 0.88462


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [33]:
n_splits_values = [2, 3, 4, 5, 6, 7]

skf = StratifiedKFold(n_splits=n_splits_values[0], shuffle=True, random_state=21)

In [36]:
stacking_clf = StackingClassifier(estimators=[('rf', rf), ('dt', dt_model), ('svm', svm_model)], final_estimator=LogisticRegression(solver='liblinear'), cv=skf, passthrough=True)

stacking_clf.fit(X_train, y_train)

In [37]:
calc_metrics(stacking_clf, X_valid, y_valid)

accuracy is 0.89630
precision is 0.89766
recall is 0.89630


In [38]:
calc_metrics(stacking_clf, X_test, y_test)

accuracy is 0.89941
precision is 0.90467
recall is 0.89941


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

BEST Voting Classifier

In [40]:
all = pd.DataFrame(y_test.value_counts()).sort_values(by='dayofweek')
err = pd.DataFrame(y_test[y_test != voting_clf.predict(X_test)].value_counts()).sort_values(by='dayofweek')
(err/all*100).sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count
dayofweek,Unnamed: 1_level_1
0,29.62963
5,11.111111
4,9.52381
1,9.090909
6,8.450704
2,6.666667
3,3.75


In [41]:
pd.DataFrame(X_test[y_test != voting_clf.predict(X_test)])[['numTrials']].value_counts()

numTrials
1            7
5            5
2            4
8            3
7            2
3            1
6            1
11           1
15           1
16           1
19           1
29           1
30           1
31           1
54           1
84           1
Name: count, dtype: int64

In [42]:
filename = 'voting_model.pkl'
joblib.dump(voting_clf, filename)

['voting_model.pkl']