<a href="https://colab.research.google.com/github/polina-minaeva/model-optimization-improving-quality/blob/main/14_%D0%A3%D0%BB%D1%83%D1%87%D1%88%D0%B5%D0%BD%D0%B8%D0%B5_%D0%BA%D0%B0%D1%87%D0%B5%D1%81%D1%82%D0%B2%D0%B0_%D0%BC%D0%BE%D0%B4%D0%B5%D0%BB%D0%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

У нас есть набор данных о пациентах, которые имеют или не имеют болезнь сердца (heart.csv). Построим модель классификации и предскажем наличие заболевания. Затем оптимизируем параметры с помощью GridSearchCV и RandomizedSearchCV и найдем лучший результат.

In [None]:
import warnings

# Silence every warning category for the rest of the session so that cell
# outputs stay uncluttered.
warnings.simplefilter('ignore')

1. Получение и загрузка данных

In [None]:
import pandas as pd

In [None]:
# Read the dataset from the working directory and display it.
data_pd = pd.read_csv(filepath_or_buffer='heart.csv')
data_pd

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


2. Подготовка датасета к обучению моделей. Перевод категориальных переменных в числовые значения

In [None]:
# Collect the names of the columns that hold categorical (object-dtype) values.
categorials = data_pd.select_dtypes(include='object').columns
categorials

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

In [None]:
# One-hot encode the categorical columns into 0/1 indicator columns.
from sklearn.preprocessing import OneHotEncoder

# `sparse_output` replaces the `sparse` argument, which scikit-learn deprecated
# in 1.2 and removed in 1.4. A dense array is required here because the result
# is assigned straight into DataFrame columns.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(data_pd[categorials])
# Append one indicator column per category, then drop the original text columns.
data_pd[ohe.get_feature_names_out()] = ohe.transform(data_pd[categorials])
data_pd = data_pd.drop(categorials, axis=1)

In [None]:
# Preview the first five rows of the encoded frame.
data_pd.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195,0,122,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


3. Разделение выборки на обучающее и тестовое подмножество

In [None]:
# Split into training and test sets (20% held out); the target column
# 'HeartDisease' is separated from the feature columns.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_pd.drop('HeartDisease', axis=1),
    data_pd['HeartDisease'],
    random_state=42,
    test_size=0.2,
)
(X_train.shape, X_test.shape)

((734, 20), (184, 20))

4. Обучение логистической регрессии с параметрами по умолчанию

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyperparameters.
lr = LogisticRegression(random_state=42)
fit_lr = lr.fit(X=X_train, y=y_train)
# Predicted classes for the first ten test rows.
fit_lr.predict(X_test.iloc[:10])

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1])

In [None]:
# Class probabilities for the same first ten test rows.
fit_lr.predict_proba(X_test.iloc[:10])

array([[0.98173735, 0.01826265],
       [0.8616908 , 0.1383092 ],
       [0.02584657, 0.97415343],
       [0.01679714, 0.98320286],
       [0.9628907 , 0.0371093 ],
       [0.13008396, 0.86991604],
       [0.1249428 , 0.8750572 ],
       [0.91480081, 0.08519919],
       [0.22197981, 0.77802019],
       [0.03968718, 0.96031282]])

In [None]:
from sklearn.metrics import recall_score

# Recall of the default logistic regression on the hold-out test set.
recall_score(y_true=y_test, y_pred=fit_lr.predict(X_test))

0.8411214953271028

In [None]:
# Look at the other metrics too, and use cross-validation for a more
# objective estimate.

from sklearn.model_selection import cross_validate

res = cross_validate(
    estimator=lr,
    X=X_train,
    y=y_train,
    cv=8,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)
res['test_recall']

array([0.94     , 0.84     , 0.9      , 0.9      , 0.9      , 0.8627451,
       0.9      , 0.9      ])

5. Оптимизация параметров модели. GridSearchCV

In [None]:
# Search space for tuning the logistic regression (used first with GridSearchCV).
#
# Each solver is paired only with penalties it supports. The previous full
# cross-product contained pairs such as 'lbfgs' + 'l1' or 'liblinear' + None,
# and 'elasticnet' without an l1_ratio; those fits fail and are scored as NaN
# instead of being evaluated, which wastes most of the search budget.

max_iter_values = list(range(70, 150))

parameters = [
    {'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
     'penalty': ['l2', None],
     'max_iter': max_iter_values},
    {'solver': ['liblinear'],
     'penalty': ['l1', 'l2'],
     'max_iter': max_iter_values},
    {'solver': ['saga'],
     'penalty': ['l1', 'l2', None],
     'max_iter': max_iter_values},
    # 'elasticnet' is only supported by 'saga' and needs an explicit l1_ratio.
    {'solver': ['saga'],
     'penalty': ['elasticnet'],
     'l1_ratio': [0.5],
     'max_iter': max_iter_values},
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameters`, scored by 8-fold cross-validated recall.
gs = GridSearchCV(estimator=lr, param_grid=parameters, scoring='recall', cv=8)
gs.fit(X_train, y_train)

In [None]:
# Best cross-validated score, then the estimator that achieved it.
print(gs.best_score_, gs.best_estimator_, sep='\n')

0.9002450980392156
LogisticRegression(max_iter=72, penalty=None, random_state=42)


In [None]:
print(gs.best_params_)  # best parameters found by the grid search

{'max_iter': 72, 'penalty': None, 'solver': 'lbfgs'}


In [None]:
# Build a model with the best parameters and see how the metrics change.
#
# The parameters come directly from `gs.best_params_` rather than being typed
# in by hand, so this model is always the one the grid search selected.
lr2 = LogisticRegression(random_state=42, **gs.best_params_)
fit_lr2 = lr2.fit(X_train, y_train)
# Predicted classes for the first ten test rows.
fit_lr2.predict(X_test[:10])

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1])

In [None]:
# Default model's predictions on the same ten rows, for comparison.
fit_lr.predict(X_test.iloc[:10])

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1])

In [None]:
# 8-fold cross-validation of lr2 with the same four metrics as the baseline.
res2 = cross_validate(
    estimator=lr2,
    X=X_train,
    y=y_train,
    cv=8,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)
res2['test_recall']

array([0.94     , 0.84     , 0.9      , 0.88     , 0.9      , 0.8627451,
       0.92     , 0.92     ])

In [None]:
# Per-fold recall of the default model, shown again for comparison.
res['test_recall']

array([0.94     , 0.84     , 0.9      , 0.9      , 0.9      , 0.8627451,
       0.9      , 0.9      ])

In [None]:
# Fold-by-fold exact equality of recall: default model vs. lr2.
res['test_recall'] == res2['test_recall']

array([ True,  True,  True, False,  True,  True, False, False])

In [None]:
# Fold-by-fold exact equality of accuracy: default model vs. lr2.
res['test_accuracy'] == res2['test_accuracy']

array([ True,  True,  True, False,  True, False, False, False])

6. Оптимизация параметров модели. RandomizedSearchCV

In [None]:
# Now run the same search space through RandomizedSearchCV.

from sklearn.model_selection import RandomizedSearchCV

# Seed the sampler so the same candidates are drawn on every run; without it
# best_params_ and best_score_ change between runs. 42 matches the seed used
# for the split and the models.
rs = RandomizedSearchCV(lr, parameters, cv=8, scoring='recall', random_state=42)

rs.fit(X_train, y_train)

In [None]:
# Best cross-validated score, then the estimator that achieved it.
print(rs.best_score_, rs.best_estimator_, sep='\n')

0.895343137254902
LogisticRegression(max_iter=118, random_state=42)


In [None]:
# Parameter combination selected by the randomized search.
print(rs.best_params_)

{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 118}


In [None]:
# Build a model with the best parameters and see how the metrics change.
#
# The parameters come directly from `rs.best_params_` rather than being typed
# in by hand, so this model is always the one the randomized search selected.
lr3 = LogisticRegression(random_state=42, **rs.best_params_)
fit_lr3 = lr3.fit(X_train, y_train)
# Predicted classes for the first ten test rows.
fit_lr3.predict(X_test[:10])

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1])

In [None]:
# 8-fold cross-validation of lr3 with the same four metrics as before.
res3 = cross_validate(
    estimator=lr3,
    X=X_train,
    y=y_train,
    cv=8,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)
res3['test_recall']

array([0.94     , 0.84     , 0.9      , 0.88     , 0.9      , 0.8627451,
       0.92     , 0.92     ])

In [None]:
# Fold-by-fold exact equality of recall: lr3 vs. lr2.
res3['test_recall'] == res2['test_recall']

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [None]:
# Fold-by-fold exact equality of accuracy: lr3 vs. lr2.
res3['test_accuracy'] == res2['test_accuracy']

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [None]:
rs.best_score_  # best score found by RandomizedSearchCV

0.895343137254902

In [None]:
# Recall of the model without parameter optimization, for reference.
# NOTE(review): this is measured on the hold-out test set, while the
# best_score_ values come from cross-validation on the training set.
recall_score(y_true=y_test, y_pred=fit_lr.predict(X_test))

0.8411214953271028

In [None]:
rs.best_score_ < gs.best_score_  # compare the RandomizedSearchCV score with the GridSearchCV score

True

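Метрика лучше у GridSearchCV (0.900), чем у RandomizedSearchCV (0.895). Но и в одном, и в другом случае метрика recall получилась лучше, чем без оптимизации параметров модели. При этом best_score_ — это средний recall на кросс-валидации по обучающей выборке, поэтому корректнее сравнивать его со средним recall базовой модели на кросс-валидации (≈ 0.893), а не со значением 0.841 на тестовой выборке; при таком сравнении выигрыш от оптимизации совсем небольшой.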

При этом оба оптимизатора справились с задачей лучше, чем дерево решений и случайный лес, но хуже, чем бэггинг моделей и стекинг (модели из прошлого задания).