# Предобработка

### Загружаем данные

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split

import seaborn as sns
sns.set()

df = pd.read_csv('brain_stroke.csv')
df.head()

### Проверяем наличие пропусков

In [2]:
print(df.isnull().sum())

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


### Проведем кодирование категориальных столбцов

In [3]:
df.describe(include=['object'])

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,4981,4981,4981,4981,4981
unique,2,2,4,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2907,3280,2860,2532,1838


In [4]:
onehotencoder = OneHotEncoder()

res = onehotencoder.fit_transform(df[['gender']])
df[onehotencoder.categories_[0]] = res.toarray()
df.drop('gender', axis=1, inplace=True)

res = onehotencoder.fit_transform(df[['Residence_type']])
df[onehotencoder.categories_[0]] = res.toarray()
df.drop('Residence_type', axis=1, inplace=True)

df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Rural,Urban
0,67.0,0,1,Yes,Private,228.69,36.6,formerly smoked,1,0.0,1.0,0.0,1.0
1,80.0,0,1,Yes,Private,105.92,32.5,never smoked,1,0.0,1.0,1.0,0.0
2,49.0,0,0,Yes,Private,171.23,34.4,smokes,1,1.0,0.0,0.0,1.0
3,79.0,1,0,Yes,Self-employed,174.12,24.0,never smoked,1,1.0,0.0,1.0,0.0
4,81.0,0,0,Yes,Private,186.21,29.0,formerly smoked,1,0.0,1.0,0.0,1.0


In [5]:
ordEnc = OrdinalEncoder()
df[["smoking_status", 'work_type', 'ever_married']] = ordEnc.fit_transform(
    df[["smoking_status", 'work_type', 'ever_married']])

df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Rural,Urban
0,67.0,0,1,1.0,1.0,228.69,36.6,1.0,1,0.0,1.0,0.0,1.0
1,80.0,0,1,1.0,1.0,105.92,32.5,2.0,1,0.0,1.0,1.0,0.0
2,49.0,0,0,1.0,1.0,171.23,34.4,3.0,1,1.0,0.0,0.0,1.0
3,79.0,1,0,1.0,2.0,174.12,24.0,2.0,1,1.0,0.0,1.0,0.0
4,81.0,0,0,1.0,1.0,186.21,29.0,1.0,1,0.0,1.0,0.0,1.0


### Разделим целевой и нецелевые признаки, разделим тренировочную и тестовую выборки

In [6]:
y = df['stroke']
X = df.drop(['stroke'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 43)

### Кодируем целевой признак

In [7]:
labEnc = LabelEncoder()
y_train = labEnc.fit_transform(y_train)
y_test = labEnc.transform(y_test)

## Балансировка
набор данных не сбалансированный по целевому признаку и если обучать модели на этом наборе получится хорошее качество для класса 0 и  плохое качество для класса 1. Поэтому надо применять методы для балансировки данных: уменьшите размер мажоритарного (преобладающий) класса  и при необходимости сгенерируйте данные для миноритарного (маленького) класса, например, с помощью метода smote

### Посмотрим на количество значений классов по целевому признаку.

In [9]:
pd.DataFrame(y_train).value_counts()


0
0    3780
1     204
Name: count, dtype: int64

### Проведем балансировку данных

In [10]:
from imblearn.combine import SMOTETomek

sampler = SMOTETomek()

X_train, y_train = sampler.fit_resample(X_train, y_train)

## Обучение
### KNN

### Нормализация нецелевых признаков

In [11]:
scaler_mm = StandardScaler()

X_train_1 = scaler_mm.fit_transform(X_train)
X_train_1 = pd.DataFrame(X_train_1, columns = scaler_mm.feature_names_in_)

X_test_1 = scaler_mm.transform(X_test)
X_test_1 = pd.DataFrame(X_test_1, columns =  scaler_mm.feature_names_in_)

X_train_1.describe()
X_test_1.describe()

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,Female,Male,Rural,Urban
count,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0,997.0
mean,-0.519237,0.028941,0.103443,-0.273929,0.10821,-0.23581,-0.159722,0.031209,0.005178,-0.005178,-0.011647,0.011647
std,1.03457,1.042235,1.202495,1.174509,1.179262,0.806551,1.177238,1.10618,1.081964,1.081964,1.086667,1.086667
min,-2.492564,-0.30518,-0.216567,-1.880613,-1.683171,-1.155711,-2.549503,-1.470875,-1.259748,-0.929715,-1.053986,-1.120097
25%,-1.362892,-0.30518,-0.216567,-1.880613,-0.305079,-0.756467,-0.999078,-1.470875,-1.259748,-0.929715,-1.053986,-1.120097
50%,-0.407855,-0.30518,-0.216567,0.583793,-0.305079,-0.463931,-0.215346,0.617797,0.929715,-0.929715,-1.053986,1.053986
75%,0.274315,-0.30518,-0.216567,0.583793,1.073012,-0.092997,0.517273,0.617797,0.929715,1.259748,1.120097,1.053986
max,1.229352,3.27675,4.617513,0.583793,2.451104,2.68193,3.345521,1.662134,0.929715,1.259748,1.120097,1.053986


### Сравнение производительности

In [12]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

print("Метрика Манхэттен")
print('k\tAccuracy\t\tPrecision')
score1 = []
for i in range(1,30):
    clf = KNeighborsClassifier(n_neighbors=i, metric='manhattan')
    clf.fit(X_train_1, y_train)
    y_pred = clf.predict(X_test_1)
    score1.append(clf.score(X_test_1, y_test))
    score1.append(metrics.f1_score(y_test, y_pred))
    print(i, score1[-2], score1[-1], sep='\t')

print("Метрика Евклида")
print('k\tAccuracy\t\tPrecision')
score2 = []
for i in range(1,30):
    clf = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    clf.fit(X_train_1, y_train)
    y_pred = clf.predict(X_test_1)
    score2.append(clf.score(X_test_1, y_test))
    score2.append(metrics.f1_score(y_test, y_pred))
    print(i, score2[-2], score2[-1], sep='\t')

Метрика Манхэттен
k	Accuracy		Precision
1	0.8986960882647944	0.10619469026548672
2	0.9207622868605817	0.04819277108433735
3	0.8996990972918756	0.09090909090909091
4	0.9087261785356068	0.042105263157894736
5	0.8866599799398195	0.13740458015267176
6	0.8996990972918756	0.12280701754385964
7	0.8756268806419257	0.1267605633802817
8	0.8876629889669007	0.0967741935483871
9	0.8736208625877633	0.1
10	0.8846539618856569	0.09448818897637795
11	0.8706118355065195	0.0979020979020979
12	0.8826479438314945	0.09302325581395349
13	0.8786359077231695	0.16551724137931034
14	0.8866599799398195	0.16296296296296298
15	0.8736208625877633	0.17105263157894737
16	0.8896690070210632	0.1791044776119403
17	0.8746238716148446	0.17218543046357615
18	0.8846539618856569	0.16058394160583941
19	0.872617853560682	0.16993464052287582
20	0.8826479438314945	0.18181818181818182
21	0.8686058174523571	0.1761006289308176
22	0.8816449348044132	0.1917808219178082
23	0.8696088264794383	0.17721518987341772
24	0.8786359077231695	0.1

### Наилучшее - Манхэттен k = 2

In [14]:
clf = KNeighborsClassifier(n_neighbors=2, metric='manhattan')
clf.fit(X_train_1, y_train)

from sklearn.metrics import accuracy_score, classification_report

predictions = clf.predict(X_test_1)
print('Accuracy:', accuracy_score(y_test, predictions) * 100)
print('Classification Report:\n', classification_report(y_test, predictions))

Accuracy: 92.07622868605817
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       953
           1       0.05      0.05      0.05        44

    accuracy                           0.92       997
   macro avg       0.50      0.50      0.50       997
weighted avg       0.92      0.92      0.92       997



## Дерево решений
### Нормализация нецелевых признаков

In [32]:
scaler_dt = StandardScaler()

X_train_2 = scaler_dt.fit_transform(X_train)
X_train_2 = pd.DataFrame(X_train_2, columns = scaler_dt.feature_names_in_)

X_test_2 = scaler_dt.transform(X_test)
X_test_2 = pd.DataFrame(X_test_2, columns =  scaler_dt.feature_names_in_)

### Обучаем дерево

In [33]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=15, random_state=10)
# обучаем
dt.fit(X_train_2, y_train)
# проверяем качество на тестовой выборке
dt.score(X_test_2, y_test)

0.8896690070210632

In [34]:
predictions = dt.predict(X_test_2)
print('Accuracy:', accuracy_score(y_test, predictions) * 100)
print('Classification Report:\n', classification_report(y_test, predictions))

Accuracy: 88.96690070210632
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94       953
           1       0.07      0.11      0.08        44

    accuracy                           0.89       997
   macro avg       0.51      0.52      0.51       997
weighted avg       0.92      0.89      0.90       997



## Случайный лес
### Нормализация нецелевых признаков

In [16]:
scaler_ss = StandardScaler()

X_train_3 = scaler_ss.fit_transform(X_train)
X_train_3 = pd.DataFrame(X_train_3, columns = scaler_ss.feature_names_in_)

X_test_3 = scaler_ss.transform(X_test)
X_test_3 = pd.DataFrame(X_test_3, columns =  scaler_ss.feature_names_in_)

### Обучаем лес

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf= RandomForestClassifier(n_estimators=100)
rf.fit(X_train_3, y_train)
rf.score(X_test_3, y_test)

0.9408224674022067

In [18]:
predictions = rf.predict(X_test_3)
print('Accuracy:', accuracy_score(y_test, predictions) * 100)
print('Classification Report:\n', classification_report(y_test, predictions))

Accuracy: 94.08224674022067
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       953
           1       0.11      0.05      0.06        44

    accuracy                           0.94       997
   macro avg       0.53      0.51      0.52       997
weighted avg       0.92      0.94      0.93       997



## Метод опорных векторов
### Нормализация

In [36]:
scaler_ma = MinMaxScaler(feature_range=(-1, 1))

X_train_4 = scaler_ma.fit_transform(X_train)
X_train_4 = pd.DataFrame(X_train_4, columns = scaler_mm.feature_names_in_)

X_test_4 = scaler_ma.transform(X_test)
X_test_4 = pd.DataFrame(X_test_4, columns =  scaler_mm.feature_names_in_)

### Обучаем

In [37]:
from sklearn.svm import SVC

svc=SVC()
svc.fit(X_train_4, y_train)
print(svc.score(X_test_4, y_test))

0.8716148445336008


In [39]:
predictions = svc.predict(X_test_4)
print('Accuracy:', accuracy_score(y_test, predictions) * 100)
print('Classification Report:\n', classification_report(y_test, predictions))

Accuracy: 87.16148445336009
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93       953
           1       0.14      0.36      0.20        44

    accuracy                           0.87       997
   macro avg       0.55      0.63      0.57       997
weighted avg       0.93      0.87      0.90       997

