In [None]:
# Turn off the jedi-based completer for this kernel's tab completion.
%config Completer.use_jedi = False

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Set the default figure size for later plots. The previous `plt.figure(...)`
# call opened an empty figure in the import cell, which only rendered a blank
# "0 Axes" figure as output and did not affect subsequent plots.
# NOTE(review): presumably a default size was the intent — confirm.
plt.rcParams['figure.figsize'] = (16, 9)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# Read the data and check basic information about it

In [None]:
# Load the stroke dataset with the `id` column as the row index, then preview it.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path, index_col='id')
data.head()

In [None]:
# Show pandas' descriptive summary of the loaded frame.
data.describe()

In [None]:
# Print the frame's per-column overview (dtypes and non-null counts).
data.info()

**Now let's look at the missing values**

In [None]:
# Count the missing entries in each column.
data.isna().sum()

In [None]:
# Share of rows with a missing `bmi`, as a percentage rounded to two decimals.
n_missing_bmi = data['bmi'].isna().sum()
percent = (n_missing_bmi / len(data) * 100).round(2)
percent

**As we can see, about 4% of the bmi values are missing**

**I don't want to drop the rows or the column with missing values, so I fill the missing entries with values instead**

In [None]:
# Fill each missing value from the next valid row (back-fill).
# `DataFrame.bfill()` replaces `fillna(method='bfill')`, which is deprecated
# in recent pandas releases. The trailing `.ffill()` covers gaps at the very
# end of the frame, which a back-fill alone cannot reach because there is no
# later row to copy from.
data_filled = data.bfill().ffill()
# Alternative strategies (restricted to numeric columns, since the frame also
# holds object columns that have no mean/median):
#data_filled = data.fillna(data.mean(numeric_only=True))
#data_filled = data.fillna(data.median(numeric_only=True))
data_filled.isnull().sum()

In [None]:
# Categorical columns: every column whose dtype is `object`.
cat_cols = [name for name, dtype in data_filled.dtypes.items() if dtype == 'object']

#num_cols = list(data_filled.select_dtypes(exclude=['object']))
# Numeric columns used for plots and outlier handling, listed by hand.
num_cols = ['age', 'bmi', 'avg_glucose_level']

cat_cols

In [None]:
# Pairwise scatter/histogram grid of the hand-picked numeric columns,
# drawn from the raw `data` frame (before missing values were filled).
sns.pairplot(data.loc[:, num_cols])

In [None]:
# Categorical columns with fewer than 10 distinct values.
unique_counts = data_filled[cat_cols].nunique()
low_cardinality_cols = unique_counts[unique_counts < 10].index.tolist()
low_cardinality_cols

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns.
# The encoder keeps its default (sparse) output and the result is densified
# with `.toarray()`. This avoids the `sparse=` keyword, which newer
# scikit-learn releases renamed to `sparse_output=` and then removed, so the
# cell runs on both old and new versions.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoded = OH_encoder.fit_transform(data_filled[cat_cols]).toarray()

# Give the dummy columns descriptive string names ("<column>_<category>").
# Integer names (0, 1, 2, ...) mixed with the string names of the numeric
# columns make recent scikit-learn estimators reject the frame with a
# TypeError about mixed feature-name types.
OH_col_names = [
    f'{col}_{category}'
    for col, categories in zip(cat_cols, OH_encoder.categories_)
    for category in categories
]
OH_cols_data = pd.DataFrame(OH_encoded, columns=OH_col_names, index=data_filled.index)

# Replace the original categorical columns with their one-hot encoding.
data_num = data_filled.drop(cat_cols, axis=1)
OH_data = pd.concat([data_num, OH_cols_data], axis=1)
OH_data

Check for outliers

In [None]:
# Box plots of the numeric columns, to inspect their spread before outlier removal.
fig, ax = plt.subplots(figsize=(16, 10))
sns.boxplot(data=OH_data[num_cols], ax=ax)

**Removing outliers**

In [None]:
# Per-column quartiles and interquartile range of the numeric features.
Q1 = OH_data[num_cols].quantile(0.25)
Q3 = OH_data[num_cols].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if any numeric feature lies outside the 1.5 * IQR fences.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((OH_data[num_cols] < lower_fence) | (OH_data[num_cols] > upper_fence)).any(axis=1)
data_out = OH_data[~is_outlier]

# Plot only the numeric features so this figure is directly comparable with
# the pre-removal box plot; plotting the whole frame would also draw the
# one-hot and other non-continuous columns.
plt.figure(figsize=(16, 10))
sns.boxplot(data=data_out[num_cols])

In [None]:
# Display the frame that remains after the outlier rows were removed.
data_out

**Splitting the data into a train set and a test set**

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `stroke` target from the features, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = data_out['stroke']
X = data_out.drop(columns='stroke')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

**Scaling**

In [None]:
# `Normalizer` stays imported in case later cells still reference it.
from sklearn.preprocessing import Normalizer, StandardScaler

# Standardise every feature column (zero mean, unit variance).
# `Normalizer` was used here before, but it rescales each *sample* (row) to
# unit norm rather than scaling features, so large-valued columns dominate
# each row and the 0/1 indicator columns are reduced to near zero.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

In the next step we use a for loop to find the best hyperparameter (the number of neighbours, K).
In the future, we could use a grid search (e.g. scikit-learn's `GridSearchCV`) for that.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def _knn_test_accuracy(n_neighbors):
    """Fit a KNN classifier with `n_neighbors` on the train split and return its test accuracy."""
    model = KNeighborsClassifier(n_neighbors)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


# Evaluate K = 1..25; scores_list[i] is the accuracy for k_range[i].
k_range = range(1, 26)
scores_list = [_knn_test_accuracy(k) for k in k_range]

In [None]:
# Show the accuracy recorded for each K, in the same order as k_range.
scores_list

In [None]:
# Line plot of test accuracy against the number of neighbours.
fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(k_range, scores_list)
ax.set_xlabel('Values of K')
ax.set_ylabel('Accuracy score')
plt.show()

In [None]:
# Highest accuracy reached across the tested K values.
max(scores_list)

In [None]:
# Zero-based position of the first occurrence of the best score in scores_list.
top_score = max(scores_list)
scores_list.index(top_score)

The best K value is 4: the best score sits at index 3 of `scores_list`, and we add 1 because `k_range` starts at 1 and not 0.

In [None]:
# Derive the best K from the sweep results instead of hard-coding the value
# read off a previous run: scores_list[i] holds the accuracy for k_range[i],
# so the index of the (first) best score maps straight back to its K.
best_k = k_range[scores_list.index(max(scores_list))]
knn = KNeighborsClassifier(best_k)
knn.fit(X_train, y_train)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree (fixed seed) and report its accuracy on the test split.
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
y_pred = tree.predict(X_test)
score = accuracy_score(y_test, y_pred)
score

Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 1000-tree random forest (fixed seed) and report its accuracy on the test split.
forest = RandomForestClassifier(random_state=1, n_estimators=1000).fit(X_train, y_train)
y_pred = forest.predict(X_test)
score = accuracy_score(y_test, y_pred)
score

Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings and report its accuracy on the test split.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
score