# Day 09. Exercise 00
# Regularization

### Запуск контейнера с нужными версиями

```bash
docker run -d \
  --platform linux/amd64 \
  -p 8888:8888 \
  -v $(pwd):/home/jovyan/work \
  --name sklearn \
  jupyter/scipy-notebook:python-3.8 \
  bash -c "pip install scikit-learn==0.23.1 && start-notebook.sh --NotebookApp.token=''"
```

#### Выбираем в VS Code kernel Jupyter-сервера на localhost (его отдаёт Docker-контейнер)

In [189]:
# Environment sanity check: print the interpreter and library versions of the
# running kernel. Each import is followed immediately by its print, so the
# versions reported so far stay visible even if a later import fails.
import sys
print("Python версия:", sys.version)

import sklearn
print("scikit-learn версия:", sklearn.__version__)

import pandas as pd
print("pandas версия:", pd.__version__)

import numpy as np
print("numpy версия:", np.__version__)

Python версия: 3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:10) 
[GCC 10.3.0]
scikit-learn версия: 0.23.1
pandas версия: 1.5.0
numpy версия: 1.23.3


## 0. Imports

In [190]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import json

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [191]:
# Load the dataset. The path is relative, so it depends on the kernel's working
# directory -- presumably the container home with the project mounted at ./work
# (TODO confirm against the docker run command).
df = pd.read_csv('work/src/data/dayofweek.csv')
print(f"Размерность данных: {df.shape}")
print(f"Колонки в файле: {df.columns.tolist()}")

# Bare last expression: rendered as the cell's output.
df.head()

Размерность данных: (1686, 44)
Колонки в файле: ['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1', 'dayofweek']


Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,-0.788667,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,-0.756764,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,-0.724861,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,-0.692958,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,-0.661055,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [192]:
# Number of rows per target class, ordered by class label (shown as the cell output).
print("Распределение целевой переменной:")
df['dayofweek'].value_counts().sort_index()


Распределение целевой переменной:


0    136
1    274
2    149
3    396
4    104
5    271
6    356
Name: dayofweek, dtype: int64

In [None]:
# Count of missing values in every column of the frame.
print("\nПропущенные значения:")
print(df.isnull().sum())


Пропущенные значения:
numTrials           0
hour                0
uid_user_0          0
uid_user_1          0
uid_user_10         0
uid_user_11         0
uid_user_12         0
uid_user_13         0
uid_user_14         0
uid_user_15         0
uid_user_16         0
uid_user_17         0
uid_user_18         0
uid_user_19         0
uid_user_2          0
uid_user_20         0
uid_user_21         0
uid_user_22         0
uid_user_23         0
uid_user_24         0
uid_user_25         0
uid_user_26         0
uid_user_27         0
uid_user_28         0
uid_user_29         0
uid_user_3          0
uid_user_30         0
uid_user_31         0
uid_user_4          0
uid_user_6          0
uid_user_7          0
uid_user_8          0
labname_code_rvw    0
labname_lab02       0
labname_lab03       0
labname_lab03s      0
labname_lab05s      0
labname_laba04      0
labname_laba04s     0
labname_laba05      0
labname_laba06      0
labname_laba06s     0
labname_project1    0
dayofweek           0
dtype: in

In [194]:
# Separate the target column from the feature columns.
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

print("Размерность признаков:", X.shape)
print("Размерность целевой переменной:", y.shape)
print("\nИнформация о признаках:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
X.info()

Размерность признаков: (1686, 43)
Размерность целевой переменной: (1686,)

Информация о признаках:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1686 non-null   float64
 1   hour              1686 non-null   float64
 2   uid_user_0        1686 non-null   float64
 3   uid_user_1        1686 non-null   float64
 4   uid_user_10       1686 non-null   float64
 5   uid_user_11       1686 non-null   float64
 6   uid_user_12       1686 non-null   float64
 7   uid_user_13       1686 non-null   float64
 8   uid_user_14       1686 non-null   float64
 9   uid_user_15       1686 non-null   float64
 10  uid_user_16       1686 non-null   float64
 11  uid_user_17       1686 non-null   float64
 12  uid_user_18       1686 non-null   float64
 13  uid_user_19       1686 non-null   float64
 14  uid_user_2        1686 non-null   flo

In [195]:
# Hold out 20% of the rows for testing, stratifying on the target and fixing
# the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)

# Report the shape of each part and its per-class row counts.
print("Размерность тренировочного набора:", X_train.shape)
print("Размерность тестового набора:", X_test.shape)

train_class_counts = y_train.value_counts().sort_index()
print("\nРаспределение в тренировочной выборке:")
print(train_class_counts)

test_class_counts = y_test.value_counts().sort_index()
print("Распределение в тестовой выборке:")
print(test_class_counts)

Размерность тренировочного набора: (1348, 43)
Размерность тестового набора: (338, 43)

Распределение в тренировочной выборке:
0    109
1    219
2    119
3    316
4     83
5    217
6    285
Name: dayofweek, dtype: int64
Распределение в тестовой выборке:
0    27
1    55
2    30
3    80
4    21
5    54
6    71
Name: dayofweek, dtype: int64


## Функция для оценки моделей на кросс-валидации

In [196]:
def evaluate_model_cv(model, X_train, y_train, cv_folds=10):
    """
    Evaluate ``model`` with stratified k-fold cross-validation.

    For every fold the estimator is re-fitted on the training part and scored
    (via ``model.score``) on both the training and the validation part; one
    line per fold is printed, followed by the mean and standard deviation of
    the validation scores.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place on each fold; after the call it holds the fit from
        the last fold.
    X_train, y_train : objects supporting positional ``.iloc`` indexing
        Features and target that are split into folds.
    cv_folds : int, default 10
        Number of splits passed to ``StratifiedKFold`` (created without
        shuffling, so the fold assignment is deterministic).

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold validation scores.
    """
    splitter = StratifiedKFold(n_splits=cv_folds)

    # One (train_score, validation_score) pair per fold.
    fold_results = []

    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

        model.fit(X_fit, y_fit)

        fit_acc = model.score(X_fit, y_fit)
        holdout_acc = model.score(X_holdout, y_holdout)
        fold_results.append((fit_acc, holdout_acc))

        print(f"train -  {fit_acc:.5f}   |   valid -  {holdout_acc:.5f}")

    # Only the validation scores feed the summary statistics.
    holdout_accs = [holdout for _, holdout in fold_results]
    avg_val_score = np.mean(holdout_accs)
    std_val_score = np.std(holdout_accs)

    print(f"Average accuracy on crossval is {avg_val_score:.5f}")
    print(f"Std is {std_val_score:.5f}\n")

    return avg_val_score, std_val_score

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [197]:
%%time
# Baseline logistic regression: only the two parameters prescribed by the task
# are set, everything else is left at the library defaults.
logreg_baseline = LogisticRegression(fit_intercept=False, random_state=21)

print("Logistic Regression - Базовая модель (стандартная регуляризация):")
logreg_baseline_score, logreg_baseline_std = evaluate_model_cv(
    logreg_baseline, X_train, y_train
)

Logistic Regression - Базовая модель (стандартная регуляризация):
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943

CPU times: user 3.94 s, sys: 6.27 s, total: 10.2 s
Wall time: 1.05 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [198]:
%%time

print("=== Logistic Regression с разными типами регуляризации ===\n")

# With the lbfgs solver only penalty='none' and penalty='l2' are evaluated;
# for l1 and elasticnet the cell just reports the combination as unsupported.
print("1. Без регуляризации (penalty='none', lbfgs):")
logreg_none = LogisticRegression(
    penalty='none',
    solver='lbfgs',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_none_score, _ = evaluate_model_cv(logreg_none, X_train, y_train)

print("2. L1 регуляризация (penalty='l1', lbfgs):")
print("НЕ ПОДДЕРЖИВАЕТСЯ\n")

print("3. L2 регуляризация (penalty='l2', lbfgs):")
logreg_l2 = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_l2_score, _ = evaluate_model_cv(logreg_l2, X_train, y_train)

print("4. Elastic Net регуляризация (penalty='elasticnet', lbfgs):")
print("НЕ ПОДДЕРЖИВАЕТСЯ\n")

=== Logistic Regression с разными типами регуляризации ===

1. Без регуляризации (penalty='none', lbfgs):
train -  0.66694   |   valid -  0.63704
train -  0.65787   |   valid -  0.65926
train -  0.66694   |   valid -  0.57778
train -  0.66529   |   valid -  0.62963
train -  0.66694   |   valid -  0.62222
train -  0.65952   |   valid -  0.57778
train -  0.65045   |   valid -  0.69630
train -  0.68425   |   valid -  0.61481
train -  0.66474   |   valid -  0.62687
train -  0.65651   |   valid -  0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379

2. L1 регуляризация (penalty='l1', lbfgs):
НЕ ПОДДЕРЖИВАЕТСЯ

3. L2 регуляризация (penalty='l2', lbfgs):
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64

In [199]:
# Full-precision values currently bound to the two score variables.
print(f"logreg_none_score: {logreg_none_score}")
print(f"logreg_l2_score:   {logreg_l2_score}")

logreg_none_score: 0.6246158098396905
logreg_l2_score:   0.6016473189607519


In [200]:
%%time

# With the liblinear solver only penalty='l1' and penalty='l2' are evaluated;
# 'none' and elasticnet are reported as unsupported.
print("1. Без регуляризации (penalty='none', liblinear):")
print("НЕ ПОДДЕРЖИВАЕТСЯ\n")

print("2. L1 регуляризация (penalty='l1', liblinear):")
logreg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_l1_score, _ = evaluate_model_cv(logreg_l1, X_train, y_train)

# NOTE: rebinds logreg_l2 / logreg_l2_score that the lbfgs cell also assigns.
print("3. L2 регуляризация (penalty='l2', liblinear):")
logreg_l2 = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_l2_score, _ = evaluate_model_cv(logreg_l2, X_train, y_train)

print("4. Elastic Net регуляризация (penalty='elasticnet', liblinear):")
print("НЕ ПОДДЕРЖИВАЕТСЯ\n")

1. Без регуляризации (penalty='none', liblinear):
НЕ ПОДДЕРЖИВАЕТСЯ

2. L1 регуляризация (penalty='l1', liblinear):
train -  0.61830   |   valid -  0.54815
train -  0.62737   |   valid -  0.62222
train -  0.60511   |   valid -  0.54074
train -  0.63644   |   valid -  0.62222
train -  0.62407   |   valid -  0.55556
train -  0.62325   |   valid -  0.58519
train -  0.61253   |   valid -  0.63704
train -  0.64716   |   valid -  0.58519
train -  0.63015   |   valid -  0.59701
train -  0.61367   |   valid -  0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129

3. L2 регуляризация (penalty='l2', liblinear):
train -  0.61006   |   valid -  0.56296
train -  0.61665   |   valid -  0.61481
train -  0.61336   |   valid -  0.59259
train -  0.62902   |   valid -  0.60741
train -  0.60923   |   valid -  0.55556
train -  0.61500   |   valid -  0.57778
train -  0.61665   |   valid -  0.61481
train -  0.64056   |   valid -  0.53333
train -  0.62109   |   valid -  0.58209
train -  0.61120   | 

In [201]:
# Full-precision values currently bound to the two score variables.
print(f"logreg_l1_score: {logreg_l1_score}")
print(f"logreg_l2_score: {logreg_l2_score}")

logreg_l1_score: 0.5890326147042565
logreg_l2_score: 0.5815975677169707


In [202]:
%%time

# The saga solver is run with all four penalty settings: none, l1, l2 and
# elasticnet (the latter with an equal l1/l2 mix, l1_ratio=0.5).
print("1. Без регуляризации (penalty='none', saga):")
logreg_none = LogisticRegression(
    penalty='none',
    solver='saga',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_none_score, _ = evaluate_model_cv(logreg_none, X_train, y_train)

print("2. L1 регуляризация (penalty='l1', saga):")
logreg_l1 = LogisticRegression(
    penalty='l1',
    solver='saga',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_l1_score, _ = evaluate_model_cv(logreg_l1, X_train, y_train)

print("3. L2 регуляризация (penalty='l2', saga):")
logreg_l2 = LogisticRegression(
    penalty='l2',
    solver='saga',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_l2_score, _ = evaluate_model_cv(logreg_l2, X_train, y_train)

print("4. Elastic Net регуляризация (penalty='elasticnet'):")
logreg_elastic = LogisticRegression(
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
    fit_intercept=False,
    max_iter=1000,
    random_state=21,
)
logreg_elastic_score, _ = evaluate_model_cv(logreg_elastic, X_train, y_train)

1. Без регуляризации (penalty='none', saga):




train -  0.66612   |   valid -  0.63704




train -  0.65787   |   valid -  0.65926




train -  0.66612   |   valid -  0.57778




train -  0.66529   |   valid -  0.62963




train -  0.66694   |   valid -  0.61481




train -  0.65952   |   valid -  0.57778




train -  0.65045   |   valid -  0.69630




train -  0.68425   |   valid -  0.61481




train -  0.66474   |   valid -  0.62687




train -  0.65651   |   valid -  0.60448
Average accuracy on crossval is 0.62388
Std is 0.03392

2. L1 регуляризация (penalty='l1', saga):
train -  0.63726   |   valid -  0.58519
train -  0.64221   |   valid -  0.61481
train -  0.62984   |   valid -  0.55556
train -  0.64386   |   valid -  0.60000
train -  0.63232   |   valid -  0.57778
train -  0.63644   |   valid -  0.57778
train -  0.63644   |   valid -  0.65926
train -  0.65622   |   valid -  0.57778
train -  0.64580   |   valid -  0.58955
train -  0.63839   |   valid -  0.62687
Average accuracy on crossval is 0.59646
Std is 0.02848

3. L2 регуляризация (penalty='l2', saga):
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64221   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
trai

### Предупреждение ConvergenceWarning означает, что алгоритм оптимизации не сошёлся за максимальное количество итераций.
SAGA - это итеративный алгоритм оптимизации, который пытается найти минимум функции потерь. 
В нашем случае он не успел "дойти" до оптимального решения за 1000 итераций (max_iter=1000).

### Почему только "No regularization" не сходится:
#### Без регуляризации:

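- Алгоритм подбирает коэффициенты без каких-либо ограничений на их величину
- Поэтому он может подстраиваться под шум тренировочных данных (переобучение)
- Функция потерь остаётся выпуклой, но почти плоской вдоль некоторых направлений: если часть объектов (почти) линейно разделима, минимум достигается только при очень больших весах
- Веса продолжают расти, градиент убывает медленно, и критерий остановки не срабатывает за `max_iter` итераций
- Результат: плохая сходимость, но высокая точность на train (0.66-0.68)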

#### С регуляризацией (L1, L2, Elastic Net):

- Добавляется штраф за большие веса коэффициентов
- У задачи появляется конечный и хорошо обусловленный минимум (с L2 функция потерь становится сильно выпуклой)
- Штраф удерживает веса небольшими, поэтому оптимизатор быстро доходит до решения
- Результат: быстрая сходимость и меньше переобучения (точность на train чуть ниже)

#### В машинном обучении алгоритм ищет лучшие коэффициенты для модели:
✅ Сошелся - алгоритм нашел оптимум и перестал улучшаться

❌ Не сошелся - время кончилось (max_iter), а он все еще ищет

In [203]:
# Full-precision values currently bound to the four score variables.
print(f"logreg_none_score: {logreg_none_score}")
print(f"logreg_l1_score: {logreg_l1_score}")
print(f"logreg_l2_score: {logreg_l2_score}")
print(f"logreg_elastic_score: {logreg_elastic_score}")

logreg_none_score: 0.6238750690989499
logreg_l1_score: 0.5964566058595908
logreg_l2_score: 0.6016473189607519
logreg_elastic_score: 0.5979325594250968


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [204]:
%%time

# Baseline linear-kernel SVM with only the parameters prescribed by the task.
svm_baseline = SVC(kernel='linear', probability=True, random_state=21)

print("SVM - Базовая модель (стандартная регуляризация):")
svm_baseline_score, svm_baseline_std = evaluate_model_cv(
    svm_baseline, X_train, y_train
)

SVM - Базовая модель (стандартная регуляризация):
train -  0.70486   |   valid -  0.65926
train -  0.69662   |   valid -  0.75556
train -  0.69415   |   valid -  0.62222
train -  0.70239   |   valid -  0.65185
train -  0.69085   |   valid -  0.65185
train -  0.68920   |   valid -  0.64444
train -  0.69250   |   valid -  0.72593
train -  0.70074   |   valid -  0.62222
train -  0.69605   |   valid -  0.61940
train -  0.71087   |   valid -  0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359

CPU times: user 4.22 s, sys: 731 ms, total: 4.95 s
Wall time: 3.8 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [205]:
%%time

print("=== SVM с разными значениями C (сила регуляризации) ===\n")

# Candidate values of C and the mean CV accuracy obtained for each of them.
c_values = [0.01, 0.1, 1.0, 10.0, 100.0]
svm_scores = {}

for c in c_values:
    print(f"SVM с C={c}:")
    # A fresh estimator per value; only C differs from the baseline settings.
    svm_model = SVC(probability=True, kernel='linear', random_state=21, C=c)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(svm_model, X_train, y_train)
    svm_scores[c] = score

=== SVM с разными значениями C (сила регуляризации) ===

SVM с C=0.01:
train -  0.37923   |   valid -  0.40000
train -  0.37923   |   valid -  0.40000
train -  0.38417   |   valid -  0.35556
train -  0.35449   |   valid -  0.36296
train -  0.38252   |   valid -  0.37037
train -  0.38087   |   valid -  0.38519
train -  0.37923   |   valid -  0.40000
train -  0.38252   |   valid -  0.37037
train -  0.38468   |   valid -  0.35075
train -  0.38386   |   valid -  0.35821
Average accuracy on crossval is 0.37534
Std is 0.01848

SVM с C=0.1:
train -  0.58120   |   valid -  0.55556
train -  0.57543   |   valid -  0.56296
train -  0.57378   |   valid -  0.57037
train -  0.59275   |   valid -  0.57037
train -  0.58120   |   valid -  0.54815
train -  0.57955   |   valid -  0.54815
train -  0.57296   |   valid -  0.61481
train -  0.59192   |   valid -  0.54815
train -  0.59967   |   valid -  0.52985
train -  0.57825   |   valid -  0.57463
Average accuracy on crossval is 0.56230
Std is 0.02177

SVM 

In [206]:
# Recap of the SVM experiments: baseline score, then one line per tried C.
print("=== Сводка результатов SVM ===\n")
print(f"Базовая модель (C=1.0): {svm_baseline_score:.5f}")
for c, score in svm_scores.items():
    print(f"C={c}: {score:.5f}")

# Key of svm_scores with the highest mean CV accuracy.
best_c = max(svm_scores, key=svm_scores.get)
print(f"\nЛучшее значение C: {best_c} с результатом: {svm_scores[best_c]:.5f}")

=== Сводка результатов SVM ===

Базовая модель (C=1.0): 0.65871
C=0.01: 0.37534
C=0.1: 0.56230
C=1.0: 0.65871
C=10.0: 0.72771
C=100.0: 0.75589

Лучшее значение C: 100.0 с результатом: 0.75589


In [207]:
# Build (but do not fit) an SVM with the selected C and look up its CV score.
svm_best = SVC(probability=True, kernel='linear', random_state=21, C=best_c)
svm_best_score = svm_scores[best_c]

# Bare last expression: shown as the cell output.
svm_best_score

0.7558872305140961

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [208]:
%%time

# Baseline decision tree with only the parameters prescribed by the task.
tree_baseline = DecisionTreeClassifier(random_state=21, max_depth=10)

print("Decision Tree - Базовая модель (max_depth=10):")
tree_baseline_score, tree_baseline_std = evaluate_model_cv(
    tree_baseline, X_train, y_train
)

Decision Tree - Базовая модель (max_depth=10):
train -  0.81039   |   valid -  0.74074
train -  0.77741   |   valid -  0.74074
train -  0.83347   |   valid -  0.70370
train -  0.79720   |   valid -  0.76296
train -  0.82440   |   valid -  0.75556
train -  0.80379   |   valid -  0.68889
train -  0.80709   |   valid -  0.76296
train -  0.80132   |   valid -  0.65926
train -  0.80807   |   valid -  0.75373
train -  0.80478   |   valid -  0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562

CPU times: user 64.9 ms, sys: 0 ns, total: 64.9 ms
Wall time: 64 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [209]:
%%time

print("=== Decision Tree с разными значениями max_depth ===\n")

# Candidate depths (None is passed through to the estimator as-is) and the
# mean CV accuracy obtained for each of them.
max_depth_values = [3, 5, 7, 10, 15, 20, None]
tree_scores = {}

for depth in max_depth_values:
    print(f"Decision Tree с max_depth={depth}:")
    tree_model = DecisionTreeClassifier(max_depth=depth, random_state=21)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(tree_model, X_train, y_train)
    tree_scores[depth] = score

=== Decision Tree с разными значениями max_depth ===

Decision Tree с max_depth=3:
train -  0.49382   |   valid -  0.43704


train -  0.48887   |   valid -  0.48148
train -  0.50206   |   valid -  0.44444
train -  0.49629   |   valid -  0.49630
train -  0.48475   |   valid -  0.48889
train -  0.48969   |   valid -  0.48889
train -  0.48392   |   valid -  0.48148
train -  0.49052   |   valid -  0.40741
train -  0.48517   |   valid -  0.46269
train -  0.49176   |   valid -  0.42537
Average accuracy on crossval is 0.46140
Std is 0.02938

Decision Tree с max_depth=5:
train -  0.59522   |   valid -  0.53333
train -  0.56307   |   valid -  0.53333
train -  0.60181   |   valid -  0.55556
train -  0.59604   |   valid -  0.57037
train -  0.60264   |   valid -  0.57778
train -  0.57955   |   valid -  0.53333
train -  0.58368   |   valid -  0.54815
train -  0.59275   |   valid -  0.51111
train -  0.58237   |   valid -  0.56716
train -  0.60132   |   valid -  0.50000
Average accuracy on crossval is 0.54301
Std is 0.02423

Decision Tree с max_depth=7:
train -  0.70322   |   valid -  0.64444
train -  0.67271   |   valid -

In [210]:
%%time

print("=== Decision Tree с другими параметрами регуляризации ===\n")

# Depth with the highest mean CV accuracy from the max_depth sweep (tree_scores).
best_depth = max(tree_scores, key=tree_scores.get)
print(f"Используем лучшую max_depth={best_depth}")

print("Тестируем разные значения min_samples_split:")
# Candidate min_samples_split values, each evaluated at the selected depth.
min_samples_split_values = [2, 5, 10, 20]
tree_min_split_scores = {}

for min_split in min_samples_split_values:
    print(f"\nDecision Tree с max_depth={best_depth}, min_samples_split={min_split}:")
    tree_model = DecisionTreeClassifier(max_depth=best_depth, min_samples_split=min_split, random_state=21)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(tree_model, X_train, y_train)
    tree_min_split_scores[min_split] = score

=== Decision Tree с другими параметрами регуляризации ===

Используем лучшую max_depth=20
Тестируем разные значения min_samples_split:

Decision Tree с max_depth=20, min_samples_split=2:
train -  0.98846   |   valid -  0.86667
train -  0.99011   |   valid -  0.91111
train -  0.98681   |   valid -  0.85926
train -  0.98763   |   valid -  0.91111
train -  0.98928   |   valid -  0.88148
train -  0.98186   |   valid -  0.85926
train -  0.98846   |   valid -  0.91852
train -  0.99176   |   valid -  0.89630
train -  0.99094   |   valid -  0.88060
train -  0.98847   |   valid -  0.88060
Average accuracy on crossval is 0.88649
Std is 0.02075


Decision Tree с max_depth=20, min_samples_split=5:
train -  0.97197   |   valid -  0.85926
train -  0.97197   |   valid -  0.88148
train -  0.97032   |   valid -  0.84444
train -  0.97279   |   valid -  0.89630
train -  0.97032   |   valid -  0.88148
train -  0.96538   |   valid -  0.85926
train -  0.96867   |   valid -  0.91111
train -  0.96867   |   va

In [211]:
# Recap of the decision-tree experiments: baseline, depth sweep, min_samples_split sweep.
print("=== Сводка результатов Decision Tree ===\n")
print(f"Базовая модель (max_depth=10): {tree_baseline_score:.5f}")
print("\nЭксперименты с max depth:")
for depth, score in tree_scores.items():
    print(f"max_depth={depth}: {score:.5f}")

print("\nЭксперименты с min samples split:")
for min_split, score in tree_min_split_scores.items():
    print(f"min_samples_split={min_split}: {score:.5f}")

# Each parameter is picked as the argmax of its own sweep.
best_depth = max(tree_scores, key=tree_scores.get)
best_min_split = max(tree_min_split_scores, key=tree_min_split_scores.get)
# Highest score seen across both sweeps.
# NOTE(review): this equals the score of (best_depth, best_min_split) only while
# the min_samples_split sweep is run at best_depth -- confirm if the grids change.
tree_best_score = max(max(tree_scores.values()), max(tree_min_split_scores.values()))

print(f"\nЛучшие параметры: max_depth={best_depth}, min_samples_split={best_min_split}")
print(f"Лучший результат: {tree_best_score:.5f}")

# Constructed with the selected parameters; not fitted in this cell.
tree_best = DecisionTreeClassifier(max_depth=best_depth, min_samples_split=best_min_split, random_state=21)

=== Сводка результатов Decision Tree ===

Базовая модель (max_depth=10): 0.72551

Эксперименты с max depth:
max_depth=3: 0.46140
max_depth=5: 0.54301
max_depth=7: 0.64989
max_depth=10: 0.72551
max_depth=15: 0.85459
max_depth=20: 0.88649
max_depth=None: 0.88575

Эксперименты с min samples split:
min_samples_split=2: 0.88649
min_samples_split=5: 0.87685
min_samples_split=10: 0.84716
min_samples_split=20: 0.78039

Лучшие параметры: max_depth=20, min_samples_split=2
Лучший результат: 0.88649


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [212]:
%%time

# Baseline random forest with only the parameters prescribed by the task.
rf_baseline = RandomForestClassifier(
    n_estimators=50,
    max_depth=14,
    random_state=21,
)

print("Random Forest - Базовая модель (n_estimators=50, max_depth=14):")
rf_baseline_score, rf_baseline_std = evaluate_model_cv(
    rf_baseline, X_train, y_train
)

Random Forest - Базовая модель (n_estimators=50, max_depth=14):
train -  0.96455   |   valid -  0.88148
train -  0.96208   |   valid -  0.91852
train -  0.96785   |   valid -  0.86667
train -  0.96455   |   valid -  0.89630
train -  0.96538   |   valid -  0.91111
train -  0.96538   |   valid -  0.88148
train -  0.97115   |   valid -  0.91852
train -  0.96867   |   valid -  0.85185
train -  0.97364   |   valid -  0.88060
train -  0.97941   |   valid -  0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204

CPU times: user 903 ms, sys: 0 ns, total: 903 ms
Wall time: 900 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [213]:
%%time

print("=== Random Forest с разными значениями max_depth ===\n")

# Candidate depths (None is passed through to the estimator as-is), evaluated
# with n_estimators fixed at 50; the dict stores the mean CV accuracy per depth.
rf_max_depth_values = [5, 10, 14, 20, None]
rf_depth_scores = {}

for depth in rf_max_depth_values:
    print(f"Random Forest с n_estimators=50, max_depth={depth}:")
    rf_model = RandomForestClassifier(n_estimators=50, max_depth=depth, random_state=21)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(rf_model, X_train, y_train)
    rf_depth_scores[depth] = score

=== Random Forest с разными значениями max_depth ===

Random Forest с n_estimators=50, max_depth=5:
train -  0.62984   |   valid -  0.58519
train -  0.57791   |   valid -  0.55556
train -  0.60181   |   valid -  0.57778
train -  0.59522   |   valid -  0.59259
train -  0.61748   |   valid -  0.57778
train -  0.59687   |   valid -  0.57778
train -  0.61500   |   valid -  0.60741
train -  0.59522   |   valid -  0.57037
train -  0.59967   |   valid -  0.57463
train -  0.60297   |   valid -  0.54478
Average accuracy on crossval is 0.57638
Std is 0.01668

Random Forest с n_estimators=50, max_depth=10:
train -  0.85408   |   valid -  0.75556
train -  0.85903   |   valid -  0.82963
train -  0.89777   |   valid -  0.80741
train -  0.90107   |   valid -  0.82222
train -  0.88376   |   valid -  0.85185
train -  0.87799   |   valid -  0.75556
train -  0.87222   |   valid -  0.82963
train -  0.86480   |   valid -  0.74074
train -  0.89456   |   valid -  0.82090
train -  0.87644   |   valid -  0.738

In [214]:
%%time

print("=== Random Forest с разными значениями n_estimators ===\n")

# Depth with the highest mean CV accuracy from the max_depth sweep (rf_depth_scores).
best_rf_depth = max(rf_depth_scores, key=rf_depth_scores.get)
print(f"Используем лучшую max_depth={best_rf_depth}")

# Candidate forest sizes, each evaluated at the selected depth.
n_estimators_values = [25, 50, 100, 200]
rf_n_est_scores = {}

for n_est in n_estimators_values:
    print(f"\nRandom Forest с n_estimators={n_est}, max_depth={best_rf_depth}:")
    rf_model = RandomForestClassifier(n_estimators=n_est, max_depth=best_rf_depth, random_state=21)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(rf_model, X_train, y_train)
    rf_n_est_scores[n_est] = score

=== Random Forest с разными значениями n_estimators ===

Используем лучшую max_depth=None

Random Forest с n_estimators=25, max_depth=None:
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.94815
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.93333
train -  0.99918   |   valid -  0.90370
train -  1.00000   |   valid -  0.89630
train -  1.00000   |   valid -  0.91852
train -  1.00000   |   valid -  0.88889
train -  1.00000   |   valid -  0.92537
train -  0.99918   |   valid -  0.88806
Average accuracy on crossval is 0.91097
Std is 0.01880


Random Forest с n_estimators=50, max_depth=None:
train -  1.00000   |   valid -  0.89630
train -  1.00000   |   valid -  0.94815
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.93333
train -  1.00000   |   valid -  0.91111
train -  1.00000   |   valid -  0.89630
train -  1.00000   |   valid -  0.91852
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.

In [215]:
%%time

print("=== Random Forest с другими параметрами регуляризации ===\n")

# Forest size with the highest mean CV accuracy from the n_estimators sweep;
# best_rf_depth is reused as currently bound in the kernel.
best_rf_n_est = max(rf_n_est_scores, key=rf_n_est_scores.get)
print(f"Используем лучшие n_estimators={best_rf_n_est}, max_depth={best_rf_depth}")

print("Тестируем разные значения min_samples_split:")
# Candidate min_samples_split values, evaluated at the selected size and depth.
rf_min_split_values = [2, 5, 10]
rf_min_split_scores = {}

for min_split in rf_min_split_values:
    print(f"\nRandom Forest с n_estimators={best_rf_n_est}, max_depth={best_rf_depth}, min_samples_split={min_split}:")
    rf_model = RandomForestClassifier(n_estimators=best_rf_n_est, max_depth=best_rf_depth, 
                                    min_samples_split=min_split, random_state=21)
    # The std returned by the helper is discarded; only the mean is kept.
    score, _ = evaluate_model_cv(rf_model, X_train, y_train)
    rf_min_split_scores[min_split] = score

=== Random Forest с другими параметрами регуляризации ===

Используем лучшие n_estimators=200, max_depth=None
Тестируем разные значения min_samples_split:

Random Forest с n_estimators=200, max_depth=None, min_samples_split=2:
train -  1.00000   |   valid -  0.89630
train -  1.00000   |   valid -  0.94815
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.93333
train -  1.00000   |   valid -  0.91111
train -  1.00000   |   valid -  0.88889
train -  1.00000   |   valid -  0.92593
train -  1.00000   |   valid -  0.90370
train -  1.00000   |   valid -  0.92537
train -  1.00000   |   valid -  0.91045
Average accuracy on crossval is 0.91469
Std is 0.01727


Random Forest с n_estimators=200, max_depth=None, min_samples_split=5:
train -  0.99011   |   valid -  0.89630
train -  0.98928   |   valid -  0.93333
train -  0.98928   |   valid -  0.88148
train -  0.99011   |   valid -  0.92593
train -  0.99258   |   valid -  0.91852
train -  0.99176   |   valid -  0.89630
train

In [216]:
# Summarise the three Random Forest experiment stages and build the final,
# not-yet-fitted estimator from the winning hyperparameters.
print("=== Сводка результатов Random Forest ===\n")
print(f"Базовая модель (n_estimators=50, max_depth=14): {rf_baseline_score:.5f}")

# (section title, parameter label, {parameter value: CV score}) for each stage,
# listed in the order the stages were run.
rf_experiments = (
    ("\nЭксперименты с max depth:", "max_depth", rf_depth_scores),
    ("\nЭксперименты с n estimators:", "n_estimators", rf_n_est_scores),
    ("\nЭксперименты с min samples split:", "min_samples_split", rf_min_split_scores),
)
for section_title, param_label, stage_scores in rf_experiments:
    print(section_title)
    for param_value, score in stage_scores.items():
        print(f"{param_label}={param_value}: {score:.5f}")

best_rf_depth = max(rf_depth_scores, key=rf_depth_scores.get)
best_rf_n_est = max(rf_n_est_scores, key=rf_n_est_scores.get)
best_rf_min_split = max(rf_min_split_scores, key=rf_min_split_scores.get)

# Report the score that was actually measured for the selected combination.
# The min_samples_split stage was run with the best n_estimators and max_depth,
# so its winning entry is the CV score of exactly these three parameters.
# Taking the maximum over all stages could instead report a score that belongs
# to a different configuration than the one printed and saved below.
rf_best_score = rf_min_split_scores[best_rf_min_split]

print(f"\nЛучшие параметры: n_estimators={best_rf_n_est}, max_depth={best_rf_depth}, min_samples_split={best_rf_min_split}")
print(f"Лучший результат: {rf_best_score:.5f}")

rf_best = RandomForestClassifier(
    n_estimators=best_rf_n_est,
    max_depth=best_rf_depth,
    min_samples_split=best_rf_min_split,
    random_state=21,
)

=== Сводка результатов Random Forest ===

Базовая модель (n_estimators=50, max_depth=14): 0.88722

Эксперименты с max depth:
max_depth=5: 0.57638
max_depth=10: 0.79523
max_depth=14: 0.88722
max_depth=20: 0.90874
max_depth=None: 0.91395

Эксперименты с n estimators:
n_estimators=25: 0.91097
n_estimators=50: 0.91395
n_estimators=100: 0.91468
n_estimators=200: 0.91469

Эксперименты с min samples split:
min_samples_split=2: 0.91469
min_samples_split=5: 0.90502
min_samples_split=10: 0.88724

Лучшие параметры: n_estimators=200, max_depth=None, min_samples_split=2
Лучший результат: 0.91469


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [217]:
# Compare the best cross-validation score of every model family and pick the winner.
print("=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===\n")

# Insertion order doubles as the print order of the comparison table.
model_scores = {
    'Logistic Regression': max(
        logreg_baseline_score,
        logreg_none_score,
        logreg_l1_score,
        logreg_l2_score,
        logreg_elastic_score,
    ),
    'SVM': svm_best_score,
    'Decision Tree': tree_best_score,
    'Random Forest': rf_best_score,
}

for family_name, family_score in model_scores.items():
    print(f"{family_name} (лучшая): {family_score:.5f}")

# On a tie, max() keeps the first entry in insertion order.
best_model_name, best_score = max(model_scores.items(), key=lambda item: item[1])

print(f"\nЛучшая модель: {best_model_name} с результатом кросс-валидации: {best_score:.5f}")

=== СРАВНЕНИЕ ФИНАЛЬНЫХ МОДЕЛЕЙ ===

Logistic Regression (лучшая): 0.62388
SVM (лучшая): 0.75589
Decision Tree (лучшая): 0.88649
Random Forest (лучшая): 0.91469

Лучшая модель: Random Forest с результатом кросс-валидации: 0.91469


In [218]:
# Rebuild the selected Random Forest configuration and fit it on X_train / y_train.
print(f"Используем Random Forest с n_estimators={best_rf_n_est}, max_depth={best_rf_depth}, min_samples_split={best_rf_min_split}")

best_model = RandomForestClassifier(
    n_estimators=best_rf_n_est,
    max_depth=best_rf_depth,
    min_samples_split=best_rf_min_split,
    random_state=21,
)

# Kept as the last expression so the cell displays the fitted estimator's repr.
best_model.fit(X_train, y_train)

Используем Random Forest с n_estimators=200, max_depth=None, min_samples_split=2


RandomForestClassifier(n_estimators=200, random_state=21)

In [219]:
# Score the fitted model on the test split and print the per-class report.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(f"Финальная точность на тестовом наборе: {test_accuracy}")
print("\nОтчет о классификации:")
print(test_report)

Финальная точность на тестовом наборе: 0.9378698224852071

Отчет о классификации:
              precision    recall  f1-score   support

           0       0.95      0.74      0.83        27
           1       0.98      0.95      0.96        55
           2       1.00      0.93      0.97        30
           3       0.92      0.97      0.95        80
           4       0.95      0.86      0.90        21
           5       0.89      0.94      0.92        54
           6       0.93      0.99      0.96        71

    accuracy                           0.94       338
   macro avg       0.95      0.91      0.93       338
weighted avg       0.94      0.94      0.94       338



### Classification Report (Отчет о классификации)
Здесь видны метрики для каждого дня недели (классы 0-6 = понедельник-воскресенье):

- Precision (точность) - из всех дней, которые модель предсказала как "понедельник", сколько действительно были понедельниками
- Recall (полнота) - из всех реальных понедельников, сколько модель смогла правильно найти

In [220]:
# Per-weekday error analysis: share of misclassified test samples for each true class.
print("=== Анализ ошибок по дням недели ===\n")

cm = confusion_matrix(y_test, y_pred)
print("Матрица ошибок:")
print(cm)
print()

weekdays = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'Пятница', 'Суббота', 'Воскресенье']

# Row i of the confusion matrix counts the samples whose true class is i, so the
# row sums are the per-class totals and the diagonal holds the correct predictions.
class_totals = cm.sum(axis=1)
class_errors = class_totals - cm.diagonal()
class_error_pct = class_errors / class_totals * 100

error_rates = {}
for i, day in enumerate(weekdays):
    error_rates[day] = class_error_pct[i]
    print(f"{day} (класс {i}): {class_error_pct[i]:.2f}% ошибок ({class_errors[i]}/{class_totals[i]} ошибок)")

# The weekday with the highest error percentage; the first one wins on a tie.
worst_day, worst_error_rate = max(error_rates.items(), key=lambda item: item[1])

print(f"\nДень недели с наибольшим количеством ошибок: {worst_day} ({worst_error_rate:.2f}% ошибок)")

=== Анализ ошибок по дням недели ===

Матрица ошибок:
[[20  1  0  1  0  1  4]
 [ 0 52  0  2  0  1  0]
 [ 0  0 28  2  0  0  0]
 [ 1  0  0 78  0  1  0]
 [ 0  0  0  0 18  2  1]
 [ 0  0  0  2  1 51  0]
 [ 0  0  0  0  0  1 70]]

Понедельник (класс 0): 25.93% ошибок (7/27 ошибок)
Вторник (класс 1): 5.45% ошибок (3/55 ошибок)
Среда (класс 2): 6.67% ошибок (2/30 ошибок)
Четверг (класс 3): 2.50% ошибок (2/80 ошибок)
Пятница (класс 4): 14.29% ошибок (3/21 ошибок)
Суббота (класс 5): 5.56% ошибок (3/54 ошибок)
Воскресенье (класс 6): 1.41% ошибок (1/71 ошибок)

День недели с наибольшим количеством ошибок: Понедельник (25.93% ошибок)


### Confusion Matrix (Матрица ошибок)
Показывает, как часто модель путает дни недели:

- Диагональ (20, 52, 28, 78, 18, 51, 70) - правильные предсказания
- Остальные числа - ошибки

Главная проблема: понедельник чаще всего путается с воскресеньем (4 из 7 ошибок для этого класса)

### Error Analysis (Анализ ошибок)
Понедельник - самый проблемный день (25.93% ошибок):

- Модели сложно отличить понедельник от других дней
- Возможно, паттерны активности в понедельник менее предсказуемы
- Или данных по понедельникам недостаточно (всего 27 образцов в тестовой выборке)

#### Лучше всего модель предсказывает:

- Воскресенье (1.41% ошибок)
- Четверг (2.50% ошибок)

In [221]:
# Persist the fitted model together with a small JSON summary of how it was chosen.
# Local import: pathlib is only needed here, to create the output directory.
from pathlib import Path

model_filename = 'work/src/ex00/model/best_model.joblib'
model_info_filename = 'work/src/ex00/model/model_info.json'

# Neither joblib.dump nor open() creates missing directories, so create the
# target folder first; exist_ok keeps the cell re-runnable.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_filename)
print(f"Лучшая модель сохранена как: {model_filename}")

model_info = {
    'model_type': best_model_name,
    'cv_score': best_score,
    'test_accuracy': test_accuracy,
    'worst_day': worst_day,
    'worst_error_rate': worst_error_rate,
    # Stored as a string, exactly as before, so the file layout stays unchanged.
    'parameters': str(best_model.get_params())
}

# worst_day is a Cyrillic weekday name: write it as readable UTF-8 text instead
# of \uXXXX escapes. The parsed JSON content is the same either way.
with open(model_info_filename, 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2, ensure_ascii=False)

Лучшая модель сохранена как: work/src/ex00/model/best_model.joblib
