In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')

In [None]:
data = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Attribute Information
# 1) id: unique identifier
# 2) gender: "Male", "Female" or "Other"
# 3) age: age of the patient
# 4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
# 5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
# 6) ever_married: "No" or "Yes"
# 7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
# 8) Residence_type: "Rural" or "Urban"
# 9) avg_glucose_level: average glucose level in blood
# 10) bmi: body mass index
# 11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
# 12) stroke: 1 if the patient had a stroke or 0 if not
# *Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.isnull().sum()

Only bmi column has NaN values

In [None]:
data.info()

We have some missing values in 'bmi' column

In [None]:
# Show the largest and smallest value of every column, one report per reducer.
for label, reducer in (('max', np.max), ('min', np.min)):
    print('------' + label + ' values per columns-----')
    print(data.apply(reducer))

Good way to look for invalid values (like: 'space' or other symbols)

In [None]:
data.gender.unique()

A bit suspicious gender 'other'

In [None]:
data[data.gender == 'Other']

Just 1 id, I'll drop it

In [None]:
# Remove the rows whose gender is 'Other', then list the gender values that remain.
other_gender_index = data.loc[data.gender == 'Other'].index
data.drop(other_gender_index, inplace=True)
data.gender.unique()

In [None]:
data.stroke.value_counts().plot(kind='pie', autopct="%.2f", figsize=(6,6))

In [None]:
data.describe(include='float64')

In [None]:
# Three side-by-side box plots of the continuous features.
# Draw directly on the axes returned by plt.subplots instead of requesting
# each slot a second time with plt.subplot.
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
panels = [('age', 'Age'), ('avg_glucose_level', 'Glucose level'), ('bmi', 'Bmi')]
for ax, (column, title) in zip(axes, panels):
    sns.boxplot(y=column, data=data, ax=ax).set(title=title, ylabel='')
plt.show()

In [None]:
# Bar chart of stroke counts (0 / 1) for each work type.
stroke_by_work_type = pd.crosstab(data.work_type, data.stroke)
stroke_by_work_type.plot(kind='bar')
plt.show()

In [None]:
object_type_columns = data.select_dtypes(include='object')

Select the object-dtype columns of `data` for visual analysis

In [None]:
object_type_columns.head()

Visualize object-type columns

In [None]:
# One count plot per categorical column, split by stroke outcome.
# The 2x3 grid has room for at most six categorical columns.
fig = plt.figure(figsize=(20, 12))
for n, column in enumerate(object_type_columns.columns, start=1):
    # Keep the Axes itself (not the return value of .set) and pass it to
    # seaborn explicitly, so the plot does not rely on the implicit current axes.
    ax = plt.subplot(2, 3, n)
    ax.set(title=column)
    sns.countplot(x='stroke', hue=column, data=data, ax=ax)
plt.show()

Very unbalanced data in column 'stroke', do this again but only for 'stroke = 1'

In [None]:
# Same grid as before, restricted to the patients who had a stroke.
# The 2x3 grid has room for at most six categorical columns.
fig = plt.figure(figsize=(20, 12))
stroke_1_data = data[data.stroke == 1]
for n, column in enumerate(object_type_columns.columns, start=1):
    # Keep the Axes itself (not the return value of .set) and pass it to
    # seaborn explicitly, so the plot does not rely on the implicit current axes.
    ax = plt.subplot(2, 3, n)
    ax.set(title=column)
    sns.countplot(x='stroke', hue=column, data=stroke_1_data, ax=ax)
plt.show()

It makes some difference

**Missing values**

I would like to fill the NaN values in the 'bmi' column separately for each gender, and first look at some features of the rows where 'bmi' is NaN.

In [None]:
zero_data = data.fillna(0)

Changed "NaN" values to "0"

In [None]:
# Keep only the rows whose bmi equals 0 (the value used to fill missing entries in zero_data).
is_zero_bmi = zero_data.bmi == 0
zero_bmi = zero_data[is_zero_bmi]
zero_bmi

Only "bmi = 0" rows

In [None]:
# Nice way to have a dictionary with all names of columns separate by types.

# grouping_types = data.columns.to_series().groupby(data.dtypes).groups
# grouping_types
# d = {key.name: value for key, value in grouping_types.items()}
# d['int64'], d['float64'], d['object']
# for col_name in d['object']:
#     ... # etc.

In [None]:
obj_column_list = zero_bmi.select_dtypes(include=['object']).columns
obj_column_list

Extract only the object-type columns

In [None]:
# One count plot per object-typed column of zero_bmi, laid out on a 2x3 grid.
fig = plt.figure(figsize=(20, 12))
for position, name_of_col in enumerate(obj_column_list, start=1):
    ax = plt.subplot(2, 3, position)
    sns.countplot(x=name_of_col, data=zero_bmi, ax=ax)
plt.show()

We can see that the largest groups among the rows with missing bmi are "ever_married" = "Yes" and "work_type" = "Private". Let's check the mean bmi for each gender under these extra conditions, and use it to fill the missing values in the bmi column.

In [None]:
# Mean bmi of married, privately employed men versus the mean bmi of all men.
married_private_men = (
    (data.gender == 'Male')
    & (data.ever_married == 'Yes')
    & (data.work_type == 'Private')
)
male_bmi_with_cond = data.loc[married_private_men, 'bmi'].mean().round(2)
print(male_bmi_with_cond, '- this is our "bmi" with conditions.')
male_bmi_overall = data.loc[data.gender == 'Male', 'bmi'].mean().round(2)
print(male_bmi_overall, '- this is "bmi" mean for male')

In [None]:
# Mean bmi of married, privately employed women versus the mean bmi of all women.
married_private_women = (
    (data.gender == 'Female')
    & (data.ever_married == 'Yes')
    & (data.work_type == 'Private')
)
female_bmi_with_cond = data.loc[married_private_women, 'bmi'].mean().round(2)
print(female_bmi_with_cond, '- this is our "bmi" with conditions.')
female_bmi_overall = data.loc[data.gender == 'Female', 'bmi'].mean().round(2)
print(female_bmi_overall, '- this is "bmi" mean for female')

As we can see, these conditional means are a bit different from the plain per-gender mean. I will use them to fill the missing values in the main data.

In [None]:
data = data.fillna(0)

In [None]:
# Fill the male rows whose bmi is the 0 placeholder (written by the preceding
# fillna(0)) with the conditional male mean. A targeted .loc assignment touches
# only the bmi cells instead of re-assigning every male row.
data.loc[(data.gender == 'Male') & (data.bmi == 0), 'bmi'] = male_bmi_with_cond

In [None]:
# Fill the female rows whose bmi is the 0 placeholder (written by the preceding
# fillna(0)) with the conditional female mean. A targeted .loc assignment touches
# only the bmi cells instead of re-assigning every female row.
data.loc[(data.gender == 'Female') & (data.bmi == 0), 'bmi'] = female_bmi_with_cond

In [None]:
data[data.bmi == 0]

That's it.

**Prediction (original data, undersampling, oversampling)**

In [None]:
# Pairwise correlation of the three continuous features, drawn as an annotated heatmap.
continuous_columns = ['age', 'avg_glucose_level', 'bmi']
correlations = data[continuous_columns].corr()
plt.figure(figsize=(10, 9))
sns.heatmap(correlations, annot=True)
plt.show()

No notable correlation is observed between the continuous features

For prediction we have to convert object-type columns to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree

import warnings
# NOTE(review): with no category given, this hides every warning raised from
# here on, including deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Work on an independent copy: the label encoding applied to new_data below
# must not overwrite the string categories in `data`, which earlier cells
# (plots, gender-based bmi imputation) rely on when they are re-run.
new_data = data.copy()

In [None]:
new_data.head()

We have to convert all categorical columns to numeric data. I will use "Label Encoder" for it.

In [None]:
# Encode every object-typed column as integers.
# A separate encoder is fitted and kept per column, so each mapping can still
# be inspected or inverted later (a single shared encoder only remembers the
# last column it was fitted on).
label_encoders = {}
for title in object_type_columns.columns:
    le = LabelEncoder()
    new_data[title] = le.fit_transform(new_data[title])
    label_encoders[title] = le

"object_type_columns" is list with title of object columns. I used it earlier

In [None]:
new_data.head()

In [None]:
# Class balance of the target over the whole dataset: raw counts, then percentages.
stroke_counts = Counter(new_data.stroke)
stroke_percent = new_data.stroke.value_counts(normalize=True).round(2) * 100
print('absolut values for whole data: ', stroke_counts, sep='\n')
print()
print('in percents for whole data: ', stroke_percent, sep='\n')

In [None]:
# I will try to do prediction with unbalanced data and then do UnderSampling and OverSampling (SMOTE) to equal "stroke = 1" values in train data.
# And compare the scores

In [None]:
X = new_data.drop(['id', 'stroke'], axis=1)

Drop useless 'id' column

In [None]:
X.head()

In [None]:
y = new_data['stroke']

In [None]:
# Hold out 30% of the rows for testing, then report the class balance of the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
train_counts = Counter(y_train)
train_percent = y_train.value_counts(normalize=True).round(2) * 100
print('absolut values for train set: ', train_counts, sep='\n')
print()
print('in percents for train set: ', train_percent, sep='\n')

In [None]:
# Baseline decision tree on the unbalanced training split.
# Seeded like the train/test split so the fitted tree and its metrics are reproducible.
DT_clf = tree.DecisionTreeClassifier(random_state=42)
DT_clf.fit(X_train, y_train)

In [None]:
# Score the baseline tree on the held-out split.
DT_prediction = DT_clf.predict(X_test)
# Built-in round() works whether f1_score returns a NumPy scalar or a plain
# Python float (which has no .round method).
DT_metrics = round(metrics.f1_score(y_test, DT_prediction), 2)
DT_report = metrics.classification_report(y_test, DT_prediction)

In [None]:
# Confusion matrix and per-class report for the baseline tree.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(DT_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(DT_report)

The accuracy score doesn't matter in this case. This model predicts the "true positive" cases terribly: the F1-score and recall for class "1" are bad. 
Maybe Random Forest does better?

In [None]:
# Baseline random forest on the unbalanced training split.
# Seeded like the train/test split so the forest and its metrics are reproducible.
RF_clf = RandomForestClassifier(random_state=42)
RF_clf.fit(X_train, y_train)

In [None]:
# Score the baseline forest on the held-out split.
RF_prediction = RF_clf.predict(X_test)
# Built-in round() works whether f1_score returns a NumPy scalar or a plain
# Python float (which has no .round method).
RF_metrics = round(metrics.f1_score(y_test, RF_prediction), 2)
RF_report = metrics.classification_report(y_test, RF_prediction)

In [None]:
# Confusion matrix and per-class report for the baseline forest.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(RF_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(RF_report)

1 - 'true positive'. Disaster xD 

I think, it's enough. Now I will balance 0 and 1 levels in train data.

**UNDERSAMPLING**

Here I'm trying to make prediction with undersampling data. I will reduce train data, and have a look at our results

In [None]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()

In [None]:
# Class counts of the training split before NearMiss undersampling.
positives_before = (y_train == 1).sum()
negatives_before = (y_train == 0).sum()
print("Before Undersampling, counts of label '1': {}".format(positives_before))
print("Before Undersampling, counts of label '0': {} \n".format(negatives_before))

# Resample the training split only; the test split is left untouched.
X_train_under, y_train_under = nm.fit_resample(X_train, y_train)

print('After Undersampling, the shape of train_X: {}'.format(X_train_under.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_under.shape))

positives_after = (y_train_under == 1).sum()
negatives_after = (y_train_under == 0).sum()
print("After Undersampling, counts of label '1': {}".format(positives_after))
print("After Undersampling, counts of label '0': {}".format(negatives_after))

In [None]:
# Decision tree trained on the undersampled split, evaluated on the original test split.
# Seeded like the train/test split so the result is reproducible.
under_DT_clf = tree.DecisionTreeClassifier(random_state=42)
under_DT_clf.fit(X_train_under, y_train_under)
under_DT_prediction = under_DT_clf.predict(X_test)
under_DT_report = metrics.classification_report(y_test, under_DT_prediction)

In [None]:
# Confusion matrix and per-class report for the undersampled decision tree.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(under_DT_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(under_DT_report)

In [None]:
under_RF_clf = RandomForestClassifier()
under_RF_clf.fit(X_train_under, y_train_under)
under_RF_prediction = under_RF_clf.predict(X_test)
under_DT_report = metrics.classification_report(y_test, under_RF_prediction)

In [None]:
metrics.plot_confusion_matrix(under_RF_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(under_DT_report)

Interesting: the undersampled model labels most test rows as "1" (1282 of them), but 1217 of those are false. In contrast, the models trained on the raw data predicted mostly "0". Let's check oversampling.

In [None]:
smote = SMOTE(random_state=42)

In [None]:
# Oversample the training split with SMOTE and compare class counts before and after.
X_res_train, y_res_train = smote.fit_resample(X_train, y_train)
for label, target in (('Original', y_train), ('Resampled', y_res_train)):
    print('{} train dataset has {}'.format(label, Counter(target)))

In [None]:
# Shapes of the training features before/after SMOTE, then the class share (in %) of each target.
print('Original train dataset shape is', X_train.shape)
print('Resampled train dataset shape is', X_res_train.shape)
for description, target in (('original', y_train), ('resampled', y_res_train)):
    print()
    share = target.value_counts(normalize=True).round(2) * 100
    print('in percents for train {} set: '.format(description), share, sep='\n')


Ok, now it's good. Our data became bigger

In [None]:
# Decision tree trained on the SMOTE-resampled split, evaluated on the original test split.
# Seeded like the train/test split and SMOTE so the result is reproducible.
smote_DT_clf = tree.DecisionTreeClassifier(random_state=42)
smote_DT_clf.fit(X_res_train, y_res_train)
smote_DT_prediction = smote_DT_clf.predict(X_test)
smote_DT_report = metrics.classification_report(y_test, smote_DT_prediction)

In [None]:
# Confusion matrix and per-class report for the SMOTE decision tree.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(smote_DT_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(smote_DT_report)

Much prettier

In [None]:
# Random forest trained on the SMOTE-resampled split, evaluated on the original test split.
# Seeded like the train/test split and SMOTE so the result is reproducible.
smote_RF_clf = RandomForestClassifier(random_state=42)
smote_RF_clf.fit(X_res_train, y_res_train)
smote_RF_prediction = smote_RF_clf.predict(X_test)
smote_RF_report = metrics.classification_report(y_test, smote_RF_prediction)

In [None]:
# Confusion matrix and per-class report for the SMOTE random forest.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(smote_RF_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(smote_RF_report)

In [None]:
# Logistic regression fitted on the SMOTE-resampled split and scored on the original test split.
smote_LR_clf = LogisticRegression().fit(X_res_train, y_res_train)
smote_LR_prediction = smote_LR_clf.predict(X_test)
smote_LR_report = metrics.classification_report(y_true=y_test, y_pred=smote_LR_prediction)

In [None]:
# Confusion matrix and per-class report for the SMOTE logistic regression.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(smote_LR_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(smote_LR_report)

In [None]:
# k-nearest-neighbours fitted on the SMOTE-resampled split and scored on the original test split.
smote_KN_clf = KNeighborsClassifier().fit(X_res_train, y_res_train)
smote_KN_prediction = smote_KN_clf.predict(X_test)
smote_KN_report = metrics.classification_report(y_true=y_test, y_pred=smote_KN_prediction)

In [None]:
# Confusion matrix and per-class report for the SMOTE k-nearest-neighbours model.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
metrics.ConfusionMatrixDisplay.from_estimator(smote_KN_clf, X_test, y_test, cmap='Blues')
plt.grid(False)
print(smote_KN_report)

**Summarize main metrics**

In [None]:
# F1 per model, ordered: SMOTE models (DT, RF, LR, KN), raw-data models (DT, RF),
# undersampled models (DT, RF).
f1_score = [
    metrics.f1_score(y_test, prediction)
    for prediction in (
        smote_DT_prediction, smote_RF_prediction,
        smote_LR_prediction, smote_KN_prediction,
        DT_prediction, RF_prediction,
        under_DT_prediction, under_RF_prediction,
    )
]

In [None]:
# ROC AUC per model, ordered: SMOTE models (DT, RF, LR, KN), raw-data models
# (DT, RF), undersampled models (DT, RF).
# ROC AUC ranks continuous scores, so each model is scored on its predicted
# probability of class 1 (second predict_proba column) instead of its hard
# 0/1 predictions, which collapse the curve to a single operating point.
roc_score = [
    metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    for clf in (
        smote_DT_clf, smote_RF_clf, smote_LR_clf, smote_KN_clf,
        DT_clf, RF_clf,
        under_DT_clf, under_RF_clf,
    )
]

In [None]:
# Precision per model, ordered: SMOTE models (DT, RF, LR, KN), raw-data models (DT, RF),
# undersampled models (DT, RF).
precision = [
    metrics.precision_score(y_test, prediction)
    for prediction in (
        smote_DT_prediction, smote_RF_prediction,
        smote_LR_prediction, smote_KN_prediction,
        DT_prediction, RF_prediction,
        under_DT_prediction, under_RF_prediction,
    )
]

In [None]:
# Recall per model, ordered: SMOTE models (DT, RF, LR, KN), raw-data models (DT, RF),
# undersampled models (DT, RF).
recall = [
    metrics.recall_score(y_test, prediction)
    for prediction in (
        smote_DT_prediction, smote_RF_prediction,
        smote_LR_prediction, smote_KN_prediction,
        DT_prediction, RF_prediction,
        under_DT_prediction, under_RF_prediction,
    )
]

In [None]:
# Collect the four metric lists into one table, one row per model.
# The label order matches the order used when the metric lists were built.
model_labels = [
    'Decision Tree', 'Random Forest', 'Logistic Regression', 'KNeighbors',
    'Decision Tree (raw)', 'Random Forest(raw)',
    'Decision Tree (undersample)', 'Random Forest (undersample)',
]
df = pd.DataFrame({
    'label': model_labels,
    'f1_score': f1_score,
    'roc_score': roc_score,
    'precision_score': precision,
    'recall_score': recall,
})

In [None]:
df

**Observation:**
* Undersampled sets has best recall score, but terrible precision (even worse than raw data)
* ROC AUC score is higher in Logistic Regression and KNeighbors
* In my opinion, in tasks like this we need to maximize the F1 score, because it is a serious medical task: predicting stroke for a patient. We have to focus on recall (don't miss "false negatives", i.e. patients who are actually ill) and minimize costs where possible (don't send every patient for examination, i.e. gain precision). 
* That's why "KNeighbors" fits better than the others

In [None]:
from sklearn import preprocessing

# Min-max scale every metric column across the models and index the result by model label.
metric_columns = df.columns[1:]
array = df[metric_columns].values  # plain numpy array of the metric values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(array)
normalize_results = pd.DataFrame(x_scaled, index=df.label, columns=metric_columns)

Normalize prediction results (the best result is "1", the worst - "0")

In [None]:
normalize_results.sort_values('f1_score', ascending=False)

**Summary:**
* Visualized some aspects of the data
* Filled in the missing values
* Prepared the data for machine learning
* Looked at prediction on the unbalanced data and concluded that this modelling is not valid
* Did undersampling and oversampling (SMOTE)
* Used different prediction models
* Gave some final observations for choosing a model for further tuning