## Задание


Нужно как можно точнее решить задачу бинарной классификации: предсказать наличие болезни сердца у пациента. Целевая переменная – наличие болезни сердца (HeartDisease). Она принимает значение 0 при отсутствии болезни и 1 при её наличии. Подробное описание признаков можно прочесть в описании датасета на сайте.

In [182]:
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
import pandas as pd

In [3]:
# Load the heart-disease dataset from the CSV file in the working directory.
df=pd.read_csv('heart.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [111]:
# Print the column names, non-null counts and dtypes of the dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [112]:
# Target vector: the HeartDisease label column.
y = df['HeartDisease']
# Feature matrix: every column except the target.
X = df.drop('HeartDisease', axis=1)

In [113]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [114]:
# One-hot encode the categorical (object) columns into indicator columns.
X_train = pd.get_dummies(X_train)
# The test split is encoded on its own, so a category that is absent from it
# (or present only in it) would give a different set or order of columns than
# the training data. Reindex onto the training layout: missing indicator
# columns are added filled with 0, and columns unseen in training are dropped.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

Обучим модель логистической регрессии с параметрами по умолчанию

In [115]:
# Baseline: standardize the features, then fit a logistic regression with its
# default hyperparameters. fit() returns the pipeline itself, so the call is
# chained and the fitted pipeline is displayed as the cell output.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=10),
).fit(X_train, y_train)
model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(random_state=10))])

In [116]:
# Accuracy of the baseline pipeline on the held-out test split.
model.score(X_test, y_test)

0.8532608695652174

In [117]:
# Accuracy on the training split, for comparison with the test score above.
model.score(X_train, y_train)

0.8719346049046321

In [118]:
from sklearn.model_selection import cross_validate

In [119]:
# 10-fold cross-validation of the baseline pipeline on the training split,
# recording four classification metrics for every fold.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1'],
)

In [130]:
# Display the per-fold timings and test metrics collected by cross_validate.
cv_results

{'fit_time': array([0.00999403, 0.00903749, 0.00596666, 0.00600243, 0.00600243,
        0.0049994 , 0.00600052, 0.00500035, 0.0059998 , 0.0059998 ]),
 'score_time': array([0.00400019, 0.00399852, 0.00400019, 0.00299788, 0.00299788,
        0.0030005 , 0.00299978, 0.00300002, 0.00300026, 0.00300002]),
 'test_accuracy': array([0.91891892, 0.87837838, 0.89189189, 0.86486486, 0.82191781,
        0.87671233, 0.79452055, 0.87671233, 0.89041096, 0.83561644]),
 'test_recall': array([0.925     , 0.875     , 0.875     , 0.90243902, 0.875     ,
        0.9       , 0.85      , 0.9       , 0.95      , 0.9       ]),
 'test_precision': array([0.925     , 0.8974359 , 0.92105263, 0.86046512, 0.81395349,
        0.87804878, 0.79069767, 0.87804878, 0.86363636, 0.81818182]),
 'test_f1': array([0.925     , 0.88607595, 0.8974359 , 0.88095238, 0.84337349,
        0.88888889, 0.81927711, 0.88888889, 0.9047619 , 0.85714286])}

In [120]:
# Mean accuracy across the cross-validation folds.
cv_results['test_accuracy'].mean()

0.8649944465012958

In [81]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any warnings raised while the
# models are being fitted (e.g. failed or non-converged fits, if there are
# any) - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

Оптимизируем параметры моделей

In [None]:
# Tune each candidate classifier with an exhaustive 10-fold cross-validated
# grid search on the training split.
#
# random_state is pinned (10, the seed used for the other models in this
# notebook) on the estimators that take one, so that re-running the search
# reproduces the same scores.
#
# NOTE(review): the estimators are fitted on X_train as-is, without the
# StandardScaler used by the baseline pipeline - confirm this is intended for
# the scale-sensitive models (Lr, SVC, KN).
models = [
    {
        'name': 'Lr',
        'model': LogisticRegression(random_state=10),
        # The default solver only supports the l2 penalty; with a single flat
        # grid every l1 candidate fails to fit and is scored as NaN. The l1
        # candidates therefore get their own sub-grid with a solver that
        # supports l1, while the l2 candidates keep the default solver.
        'params': [
            {'C': [0.1, 0.2, 0.3, 0.5, 0.7, 1], 'penalty': ['l2']},
            {'C': [0.1, 0.2, 0.3, 0.5, 0.7, 1], 'penalty': ['l1'], 'solver': ['liblinear']},
        ],
    },
    {
        'name': 'RF',
        'model': RandomForestClassifier(random_state=10),
        'params': {
            'n_estimators': [10, 100, 150, 200],
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 7, 9, 11],
        },
    },
    {
        'name': 'SVC',
        'model': SVC(),
        'params': {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        },
    },
    {
        'name': 'DT',
        'model': DecisionTreeClassifier(random_state=10),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 7, 9, 11],
        },
    },
    {
        'name': 'KN',
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': list(range(1, 30)),
            'weights': ['uniform', 'distance'],
            'p': [1, 2, 3],
        },
    },
]

# One (name, fitted GridSearchCV) pair per model; fit() returns the search
# object itself, so best_score_ / best_params_ can be read from it afterwards.
res = [
    (spec['name'], GridSearchCV(spec['model'], spec['params'], cv=10).fit(X_train, y_train))
    for spec in models
]

In [109]:
# Report the best cross-validated score and the winning hyperparameters for
# every tuned model.
for name, search in res:
    print(name, search.best_score_, search.best_params_)

Lr 0.8679192891521657 {'C': 0.2, 'penalty': 'l2'}
RF 0.8869677897075157 {'criterion': 'gini', 'max_depth': 9, 'n_estimators': 150}
SVC 0.8720103665309145 {'gamma': 'scale', 'kernel': 'linear'}
DT 0.8555349870418365 {'criterion': 'entropy', 'max_depth': 7}
KN 0.8843576453165495 {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}


In [139]:
# Second pass: narrow the grids of some models to values close to the best
# hyperparameters found by the previous search, again with 10-fold CV.
#
# random_state is pinned (10, the seed used for the other models in this
# notebook) so that re-running the search reproduces the same scores.
models = [
    {
        'name': 'Lr',
        'model': LogisticRegression(random_state=10),
        # The default solver only supports the l2 penalty; with a single flat
        # grid every l1 candidate fails to fit and is scored as NaN. The l1
        # candidates therefore get their own sub-grid with a solver that
        # supports l1, while the l2 candidates keep the default solver.
        'params': [
            {'C': [0.05, 0.15, 0.25, 0.35, 0.55, 0.75, 1], 'penalty': ['l2']},
            {'C': [0.05, 0.15, 0.25, 0.35, 0.55, 0.75, 1], 'penalty': ['l1'], 'solver': ['liblinear']},
        ],
    },
    {
        'name': 'RF',
        'model': RandomForestClassifier(random_state=10),
        'params': {
            'n_estimators': [150, 200, 250, 300],
            'criterion': ['gini', 'entropy'],
            'max_depth': [8, 9, 10, 11, 12],
        },
    },
    {
        'name': 'DT',
        'model': DecisionTreeClassifier(random_state=10),
        'params': {
            'criterion': ['entropy'],
            'max_depth': [6, 7, 8],
        },
    },
]

# One (name, fitted GridSearchCV) pair per model; fit() returns the search
# object itself, so best_score_ / best_params_ can be read from it afterwards.
res = [
    (spec['name'], GridSearchCV(spec['model'], spec['params'], cv=10).fit(X_train, y_train))
    for spec in models
]

In [140]:
# Report the best cross-validated score and the winning hyperparameters for
# every model in the refined search.
for name, search in res:
    print(name, search.best_score_, search.best_params_)

Lr 0.8718252499074415 {'C': 0.75, 'penalty': 'l2'}
RF 0.8772676786375415 {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 200}
DT 0.8146982599037393 {'criterion': 'entropy', 'max_depth': 6}


In [136]:
# Refit logistic regression with the hyperparameters picked by the refined
# grid search (C=0.75, l2 penalty). fit() returns the estimator, so the call
# is chained and the fitted model is displayed as the cell output.
Lr = LogisticRegression(C=0.75, penalty='l2', random_state=10).fit(X_train, y_train)
Lr

LogisticRegression(C=0.75, random_state=10)

In [137]:
# Accuracy of the tuned logistic regression on the held-out test split.
Lr.score(X_test, y_test)

0.8532608695652174

In [138]:
# Accuracy of the tuned logistic regression on the training split.
Lr.score(X_train, y_train)

0.8705722070844687

In [177]:
# Random forest configured with the best hyperparameters reported by the
# refined grid search; the seed makes the fitted forest reproducible.
rf = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    max_depth=10,
    min_samples_leaf=1,
    random_state=10,
)

In [178]:
# Train the random forest on the training split (the fitted model is shown
# as the cell output).
rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=200,
                       random_state=10)

In [179]:
# Accuracy of the random forest on the held-out test split.
rf.score(X_test, y_test)

0.8858695652173914

In [180]:
# Accuracy of the random forest on the training split; compare with the test
# score to gauge overfitting.
rf.score(X_train, y_train)

0.9863760217983651

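Лучшее качество на тестовой выборке показала модель Random Forest с подобранными гиперпараметрами: accuracy ≈ 0.886 против ≈ 0.853 у логистической регрессии. При этом accuracy Random Forest на обучающей выборке (≈ 0.986) заметно выше, чем на тестовой, то есть модель склонна к переобучению.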