# Day 09. Exercise 01
# Gridsearch

### Запуск контейнера с нужными версиями

docker run -d \
  --platform linux/amd64 \
  -p 8888:8888 \
  -v $(pwd):/home/jovyan/work \
  --name sklearn \
  jupyter/scipy-notebook:python-3.8 \
  bash -c "pip install scikit-learn==0.23.1 tqdm==4.46.1 && start-notebook.sh --NotebookApp.token=''"

#### и выбираем правильный kernel в vscode на localhost (который отдает докер)

In [44]:
import sys
print("Python версия:", sys.version)

import sklearn
print("scikit-learn версия:", sklearn.__version__)

import pandas as pd
print("pandas версия:", pd.__version__)

import numpy as np
print("numpy версия:", np.__version__)

import tqdm
print("tqdm версия:", tqdm.__version__)

Python версия: 3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:10) 
[GCC 10.3.0]
scikit-learn версия: 0.23.1
pandas версия: 1.5.0
numpy версия: 1.23.3
tqdm версия: 4.64.1


## 0. Imports

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.notebook import tqdm
import itertools

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [46]:
df = pd.read_csv('work/src/data/day-of-week-not-scaled.csv')
print(f"Размерность данных: {df.shape}")
print(f"Колонки в файле: {df.columns.tolist()}")

Размерность данных: (1686, 43)
Колонки в файле: ['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1']


In [47]:
df_target = pd.read_csv('work/src/data/dayofweek.csv')

df['dayofweek'] = df_target['dayofweek']

print(f"Новая размерность: {df.shape}")
print("Финальные колонки:", df.columns.tolist())

df.head()

Новая размерность: (1686, 44)
Финальные колонки: ['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1', 'dayofweek']


Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [48]:
print("Распределение целевой переменной:")
df['dayofweek'].value_counts().sort_index()

Распределение целевой переменной:


0    136
1    274
2    149
3    396
4    104
5    271
6    356
Name: dayofweek, dtype: int64

In [49]:
print(f"Общая информация о данных:")
print(f"- Количество образцов: {len(df)}")
print(f"- Количество признаков: {len(df.columns) - 1}")
print(f"- Количество классов: {df['dayofweek'].nunique()}")
print(f"- Классы: {sorted(df['dayofweek'].unique())}")

Общая информация о данных:
- Количество образцов: 1686
- Количество признаков: 43
- Количество классов: 7
- Классы: [0, 1, 2, 3, 4, 5, 6]


In [50]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=21, 
    stratify=y
)

print("Размерность тренировочного набора:", X_train.shape)
print("Размерность тестового набора:", X_test.shape)

Размерность тренировочного набора: (1348, 43)
Размерность тестового набора: (338, 43)


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [51]:
print("=== SVM GridSearch ===\n")

svm_param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

svm_model = SVC(random_state=21, probability=True)

svm_grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=svm_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

print(f"Общее количество комбинаций: {len(svm_param_grid['kernel']) * len(svm_param_grid['C']) * len(svm_param_grid['gamma']) * len(svm_param_grid['class_weight'])}")

=== SVM GridSearch ===

Общее количество комбинаций: 72


In [52]:
%%time

svm_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  3.6min finished


CPU times: user 1.59 s, sys: 246 ms, total: 1.83 s
Wall time: 3min 35s


GridSearchCV(cv=5, estimator=SVC(probability=True, random_state=21), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             scoring='accuracy', verbose=1)

In [53]:
print(f"Лучшие параметры SVM: {svm_grid_search.best_params_}")
print(f"Лучший CV score: {svm_grid_search.best_score_}")

Лучшие параметры SVM: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Лучший CV score: 0.8761090458488228


In [54]:
svm_results_df = pd.DataFrame(svm_grid_search.cv_results_)
svm_results_df_sorted = svm_results_df.sort_values('rank_test_score')

print("Топ-10 результатов SVM GridSearch:")
top_svm_results = svm_results_df_sorted[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)
for idx, row in top_svm_results.iterrows():
    row['params']['probability'] = True
    row['params']['random_state'] = 21
    print(f"Rank {int(row['rank_test_score'])}: {row['mean_test_score']:.5f} (±{row['std_test_score']:.5f}) - {row['params']}")

Топ-10 результатов SVM GridSearch:
Rank 1: 0.87611 (±0.01842) - {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Rank 2: 0.86350 (±0.01087) - {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Rank 3: 0.81602 (±0.00812) - {'C': 5, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Rank 4: 0.80861 (±0.02101) - {'C': 5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Rank 5: 0.72105 (±0.03444) - {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'linear', 'probability': True, 'random_state': 21}
Rank 5: 0.72105 (±0.03444) - {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 'random_state': 21}
Rank 7: 0.71959 (±0.01746) - {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 

In [55]:
top_5_scores = svm_results_df_sorted['mean_test_score'].head(5)
score_difference = top_5_scores.max() - top_5_scores.min()

print(f"Разница между лучшим и 5-м результатом: {score_difference:.5f}")

if score_difference < 0.01:
    print("Разница небольшая - можно выбрать более простую модель")
else:
    print("Разница существенная - стоит использовать лучшую модель")

Разница между лучшим и 5-м результатом: 0.15506
Разница существенная - стоит использовать лучшую модель


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [56]:
print("=== Decision Tree GridSearch ===\n")

dt_param_grid = {
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

dt_model = DecisionTreeClassifier(random_state=21)

dt_grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

print(f"Общее количество комбинаций: {len(dt_param_grid['max_depth']) * len(dt_param_grid['class_weight']) * len(dt_param_grid['criterion'])}")

=== Decision Tree GridSearch ===

Общее количество комбинаций: 196


In [57]:
%%time

dt_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.2s


CPU times: user 423 ms, sys: 82.9 ms, total: 506 ms
Wall time: 1.05 s


[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:    1.0s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=21), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...]},
             scoring='accuracy', verbose=1)

In [58]:
print(f"Лучшие параметры Decision Tree: {dt_grid_search.best_params_}")
print(f"Лучший CV score: {dt_grid_search.best_score_}")

Лучшие параметры Decision Tree: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}
Лучший CV score: 0.873864794162192


In [59]:
dt_results_df = pd.DataFrame(dt_grid_search.cv_results_)
dt_results_df_sorted = dt_results_df.sort_values('rank_test_score')

print("Топ-10 результатов Decision Tree GridSearch:")
top_dt_results = dt_results_df_sorted[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)
for idx, row in top_dt_results.iterrows():
    row['params']['random_state'] = 21
    print(f"Rank {int(row['rank_test_score'])}: {row['mean_test_score']:.5f} (±{row['std_test_score']:.5f}) - {row['params']}")

Топ-10 результатов Decision Tree GridSearch:
Rank 1: 0.87386 (±0.02507) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Rank 2: 0.87385 (±0.02502) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 25, 'random_state': 21}
Rank 3: 0.87238 (±0.02526) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 49, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 27, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 28, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 29, 'random_state': 21}
Rank 4: 0.87237 (±0.02518) 

In [60]:
print("Анализ по глубине дерева:\n")
depth_analysis = dt_results_df.groupby(dt_results_df['param_max_depth'])['mean_test_score'].agg(['mean', 'max', 'std']).round(5)
print("Лучшие результаты по глубинам (топ-10):")
print(depth_analysis.sort_values('max', ascending=False).head(10))

top_5_scores = dt_results_df_sorted['mean_test_score'].head(5)
score_difference = top_5_scores.max() - top_5_scores.min()
print(f"\nРазница между лучшим и 5-м результатом: {score_difference:.5f}")
if score_difference < 0.01:
    print("Разница небольшая - можно выбрать более простую модель (меньшая глубина)")
else:
    print("Разница существенная - стоит использовать лучшую модель")

Анализ по глубине дерева:

Лучшие результаты по глубинам (топ-10):
                    mean      max      std
param_max_depth                           
21               0.86812  0.87386  0.00425
25               0.86645  0.87385  0.00580
22               0.86627  0.87238  0.00515
37               0.86627  0.87237  0.00462
29               0.86627  0.87237  0.00462
30               0.86627  0.87237  0.00462
31               0.86627  0.87237  0.00462
32               0.86627  0.87237  0.00462
33               0.86627  0.87237  0.00462
34               0.86627  0.87237  0.00462

Разница между лучшим и 5-м результатом: 0.00149
Разница небольшая - можно выбрать более простую модель (меньшая глубина)


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [61]:
print("=== Random Forest GridSearch ===\n")

rf_param_grid = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 49)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

rf_model = RandomForestClassifier(random_state=21)

rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

total_combinations = len(rf_param_grid['n_estimators']) * len(rf_param_grid['max_depth']) * len(rf_param_grid['class_weight']) * len(rf_param_grid['criterion'])
print(f"Общее количество комбинаций: {total_combinations}")

=== Random Forest GridSearch ===

Общее количество комбинаций: 768


In [62]:
%%time

rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1100 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 2200 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 3600 tasks      | elapsed:   34.1s


CPU times: user 3 s, sys: 193 ms, total: 3.19 s
Wall time: 36.8 s


[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed:   36.6s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=21), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_estimators': [5, 10, 50, 100]},
             scoring='accuracy', verbose=1)

In [63]:
print(f"Лучшие параметры Random Forest: {rf_grid_search.best_params_}")
print(f"Лучший CV score: {rf_grid_search.best_score_}")

Лучшие параметры Random Forest: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100}
Лучший CV score: 0.9042929918766351


In [64]:
rf_results_df = pd.DataFrame(rf_grid_search.cv_results_)
rf_results_df_sorted = rf_results_df.sort_values('rank_test_score')

print("Топ-10 результатов Random Forest GridSearch:")
top_rf_results = rf_results_df_sorted[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)
for idx, row in top_rf_results.iterrows():
    row['params']['random_state'] = 21
    print(f"Rank {int(row['rank_test_score'])}: {row['mean_test_score']:.5f} (±{row['std_test_score']:.5f}) - {row['params']}")

Топ-10 результатов Random Forest GridSearch:
Rank 1: 0.90429 (±0.01236) - {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100, 'random_state': 21}
Rank 2: 0.90429 (±0.01096) - {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50, 'random_state': 21}
Rank 2: 0.90429 (±0.01216) - {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 29, 'n_estimators': 100, 'random_state': 21}
Rank 4: 0.90355 (±0.01206) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'n_estimators': 50, 'random_state': 21}
Rank 5: 0.90355 (±0.01438) - {'class_weight': None, 'criterion': 'gini', 'max_depth': 31, 'n_estimators': 100, 'random_state': 21}
Rank 6: 0.90281 (±0.01364) - {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 25, 'n_estimators': 100, 'random_state': 21}
Rank 7: 0.90281 (±0.01363) - {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 33, 'n_estimators': 50, 'random_state': 21}
Ra

In [65]:
print("Анализ по количеству деревьев:\n")
n_est_analysis = rf_results_df.groupby(rf_results_df['param_n_estimators'])['mean_test_score'].agg(['mean', 'max', 'std']).round(5)
print(n_est_analysis.sort_values('max', ascending=False))

top_5_scores = rf_results_df_sorted['mean_test_score'].head(5)
score_difference = top_5_scores.max() - top_5_scores.min()
print(f"\nРазница между лучшим и 5-м результатом: {score_difference:.5f}")
if score_difference < 0.01:
    print("Разница небольшая - можно выбрать более простую модель (меньше деревьев)")
else:
    print("Разница существенная - стоит использовать лучшую модель")

Анализ по количеству деревьев:

                       mean      max      std
param_n_estimators                           
50                  0.84266  0.90429  0.11907
100                 0.84466  0.90429  0.11936
10                  0.82603  0.89317  0.12446
5                   0.80737  0.88648  0.13683

Разница между лучшим и 5-м результатом: 0.00075
Разница небольшая - можно выбрать более простую модель (меньше деревьев)


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [66]:
print("=== Manual Random Forest GridSearch with Progress Bar ===\n")

manual_param_values = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 49)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

param_names = list(manual_param_values.keys())
param_combinations = list(itertools.product(*manual_param_values.values()))

print(f"Всего комбинаций параметров: {len(param_combinations)}")

=== Manual Random Forest GridSearch with Progress Bar ===

Всего комбинаций параметров: 768


In [67]:
%%time

manual_results = []

for params in tqdm(param_combinations, desc="Manual GridSearch Progress", unit="combination"):
    param_dict = dict(zip(param_names, params))
    
    rf_model = RandomForestClassifier(
        random_state=21,
        n_jobs=-1,
        **param_dict
    )
    
    scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    
    result = param_dict.copy()
    result['mean_accuracy'] = scores.mean()
    result['std_accuracy'] = scores.std()
    manual_results.append(result)

Manual GridSearch Progress:   0%|          | 0/768 [00:00<?, ?combination/s]

CPU times: user 8.5 s, sys: 1.15 s, total: 9.65 s
Wall time: 2min 13s


In [68]:
manual_results_df = pd.DataFrame(manual_results)
manual_results_df_sorted = manual_results_df.sort_values('mean_accuracy', ascending=False)

print("Топ-15 результатов Manual GridSearch:")
for idx, row in manual_results_df_sorted.head(15).iterrows():
    params_str = ", ".join([f"{k}={v}" for k, v in row.items() if k not in ['mean_accuracy', 'std_accuracy']])
    print(f"{row['mean_accuracy']:.5f} (±{row['std_accuracy']:.5f}) - {params_str}")

best_manual_score = manual_results_df_sorted['mean_accuracy'].iloc[0]
best_auto_score = rf_grid_search.best_score_

Топ-15 результатов Manual GridSearch:
0.90429 (±0.01236) - n_estimators=100, max_depth=24, class_weight=balanced, criterion=entropy
0.90429 (±0.01096) - n_estimators=50, max_depth=28, class_weight=None, criterion=gini
0.90429 (±0.01216) - n_estimators=100, max_depth=29, class_weight=balanced, criterion=entropy
0.90355 (±0.01206) - n_estimators=50, max_depth=30, class_weight=balanced, criterion=gini
0.90355 (±0.01438) - n_estimators=100, max_depth=31, class_weight=None, criterion=gini
0.90281 (±0.01364) - n_estimators=100, max_depth=25, class_weight=balanced, criterion=entropy
0.90281 (±0.01363) - n_estimators=50, max_depth=33, class_weight=balanced, criterion=gini
0.90281 (±0.01046) - n_estimators=100, max_depth=45, class_weight=None, criterion=gini
0.90281 (±0.01170) - n_estimators=50, max_depth=29, class_weight=None, criterion=gini
0.90281 (±0.01046) - n_estimators=100, max_depth=48, class_weight=None, criterion=gini
0.90281 (±0.01046) - n_estimators=100, max_depth=36, class_weight=N

In [69]:
print(f"=== Сравнение результатов ===\n")
print(f"Лучший результат Manual GridSearch: {best_manual_score:.5f}")
print(f"Лучший результат Auto GridSearch: {best_auto_score:.5f}")
print(f"Разница: {abs(best_manual_score - best_auto_score):.5f}")

top_5_manual_scores = manual_results_df_sorted['mean_accuracy'].head(5)
score_difference = top_5_manual_scores.max() - top_5_manual_scores.min()
print(f"\nРазница между лучшим и 5-м результатом (manual): {score_difference:.5f}")
if score_difference < 0.01:
    print("Разница небольшая - можно выбрать более простую модель")
else:
    print("Разница существенная - стоит использовать лучшую модель")

=== Сравнение результатов ===

Лучший результат Manual GridSearch: 0.90429
Лучший результат Auto GridSearch: 0.90429
Разница: 0.00000

Разница между лучшим и 5-м результатом (manual): 0.00075
Разница небольшая - можно выбрать более простую модель


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [70]:
print("=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===\n")

model_scores = {
    'SVM': svm_grid_search.best_score_,
    'Decision Tree': dt_grid_search.best_score_,
    'Random Forest (Auto)': rf_grid_search.best_score_,
    'Random Forest (Manual)': best_manual_score
}

for model_name, score in model_scores.items():
    print(f"{model_name}: {score:.5f}")

best_model_name = max(model_scores, key=model_scores.get)
best_cv_score = model_scores[best_model_name]

=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===

SVM: 0.87611
Decision Tree: 0.87386
Random Forest (Auto): 0.90429
Random Forest (Manual): 0.90429


In [71]:
print(f"Лучшая модель: {best_model_name} с CV score: {best_cv_score:.5f}")

if best_model_name == 'SVM':
    best_model = svm_grid_search.best_estimator_
    best_params = svm_grid_search.best_params_
elif best_model_name == 'Decision Tree':
    best_model = dt_grid_search.best_estimator_
    best_params = dt_grid_search.best_params_
elif 'Random Forest' in best_model_name:
    best_model = rf_grid_search.best_estimator_
    best_params = rf_grid_search.best_params_

print(f"Лучшие параметры: {best_params}")

Лучшая модель: Random Forest (Auto) с CV score: 0.90429
Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100, 'random_state': 21}


In [72]:
%%time

print("=== Финальные предсказания ===\n")

if not hasattr(best_model, 'classes_'):
    best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Финальная точность на тестовом наборе: {test_accuracy:.5f}")
print(f"CV Score: {best_cv_score:.5f}")
print(f"\nРазница (CV - Test): {best_cv_score - test_accuracy:.5f}")

if abs(best_cv_score - test_accuracy) > 0.05:
    print("⚠️  Большая разница между CV и Test - возможно переобучение")
else:
    print("✅ Разница между CV и Test приемлемая")

=== Финальные предсказания ===

Финальная точность на тестовом наборе: 0.92604
CV Score: 0.90429

Разница (CV - Test): -0.02174
✅ Разница между CV и Test приемлемая
CPU times: user 25.7 ms, sys: 3.9 ms, total: 29.6 ms
Wall time: 27.6 ms


In [73]:
print("=== Детальный анализ результатов ===\n")

print("Classification Report:")
weekdays = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'Пятница', 'Суббота', 'Воскресенье']
print(classification_report(y_test, y_pred, target_names=weekdays))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

=== Детальный анализ результатов ===

Classification Report:
              precision    recall  f1-score   support

 Понедельник       0.91      0.78      0.84        27
     Вторник       0.96      0.93      0.94        55
       Среда       0.93      0.93      0.93        30
     Четверг       0.95      0.96      0.96        80
     Пятница       1.00      0.86      0.92        21
     Суббота       0.86      0.91      0.88        54
 Воскресенье       0.91      0.97      0.94        71

    accuracy                           0.93       338
   macro avg       0.93      0.91      0.92       338
weighted avg       0.93      0.93      0.93       338


Confusion Matrix:
[[21  1  0  1  0  1  3]
 [ 1 51  2  0  0  1  0]
 [ 0  0 28  1  0  1  0]
 [ 1  1  0 77  0  0  1]
 [ 0  0  0  0 18  3  0]
 [ 0  0  0  2  0 49  3]
 [ 0  0  0  0  0  2 69]]


In [None]:
print("Анализ ошибок по дням недели\n")
error_rates = {}

for i, day in enumerate(weekdays):
    total_samples = sum(cm[i, :])
    correct_predictions = cm[i, i]
    if total_samples > 0:
        error_rate = (total_samples - correct_predictions) / total_samples * 100
        error_rates[day] = error_rate
        print(f"{day} (класс {i}): {error_rate:.2f}% ошибок ({total_samples - correct_predictions}/{total_samples} ошибок)")

if error_rates:
    worst_day = max(error_rates, key=error_rates.get)
    worst_error_rate = error_rates[worst_day]
    print(f"\nДень с наибольшим количеством ошибок: {worst_day} ({worst_error_rate:.2f}% ошибок)")
    
    best_day = min(error_rates, key=error_rates.get)
    best_error_rate = error_rates[best_day]
    print(f"День с наименьшим количеством ошибок: {best_day} ({best_error_rate:.2f}% ошибок)")

=== Анализ ошибок по дням недели ===

Понедельник (класс 0): 22.22% ошибок (6/27 ошибок)
Вторник (класс 1): 7.27% ошибок (4/55 ошибок)
Среда (класс 2): 6.67% ошибок (2/30 ошибок)
Четверг (класс 3): 3.75% ошибок (3/80 ошибок)
Пятница (класс 4): 14.29% ошибок (3/21 ошибок)
Суббота (класс 5): 9.26% ошибок (5/54 ошибок)
Воскресенье (класс 6): 2.82% ошибок (2/71 ошибок)

День с наибольшим количеством ошибок: Понедельник (22.22% ошибок)
День с наименьшим количеством ошибок: Воскресенье (2.82% ошибок)
