In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='darkgrid')
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read data

In [None]:
# Load the stroke dataset from the Kaggle input directory and preview the first rows.
data_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke = pd.read_csv(data_path)
stroke.head()

## Data info

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
stroke.info()

In [None]:
# Show summary statistics (count, mean, std, quartiles, min/max) for the frame.
stroke.describe()

## Handling Missing Values

In [None]:
# Count missing values per column.
stroke.isna().sum()

Column 'bmi' has 201 missing values.

In [None]:
# Box plot of BMI, drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize=(9, 5))
sns.boxplot(data=stroke['bmi'], ax=ax)

Imputation

In [None]:
# Cap BMI at 50, then fill the values that are still missing with 25.
# Missing entries pass through the cap untouched and are filled afterwards.
stroke['bmi'] = stroke['bmi'].clip(upper=50).fillna(25)
stroke.isnull().sum()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns (gender, work_type, ...). Calling
# DataFrame.corr() on them raises on pandas >= 2.0, so restrict to numeric
# dtypes first. Older pandas dropped those columns silently, so the plotted
# matrix is unchanged there.
plt.figure(figsize=(15,5))
numeric_corr = stroke.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True);

# Exploratory Data Analysis

## Person with Stroke Disease

In [None]:
# Share of patients with and without a recorded stroke.
stroke_count = stroke['stroke'].value_counts()
print(stroke_count)

fig, ax = plt.subplots(figsize=(15, 7))
colors = ["darkgray", "lightgrey"]
ax.pie(stroke_count, colors=colors, autopct='%1.1f%%')
ax.set_title('Person With Stroke Disease', fontsize=15)

# Labels are listed in the order value_counts() returns the classes.
legend_labels = ['No', 'Yes']
ax.legend(legend_labels, title='Survivor', bbox_to_anchor=(1, 1))

## Gender Survivor and Stroke Status

In [None]:
# Gender split for all patients (left) and for stroke patients only (right).
stroke_gndr = stroke['gender'].value_counts()
stroke_gndr_1 = stroke.loc[stroke['stroke'] == 1, 'gender'].value_counts()
print(stroke_gndr)
print(stroke_gndr_1)

fig, ax = plt.subplots(1, 2, figsize=(12,7))
fig.tight_layout(pad=9)

# Legend labels come from each series' own index, so they always match the
# wedges drawn. The stroke-only series may hold fewer categories than the
# full one, and value_counts() orders categories by frequency.
colors = ["palegoldenrod","khaki","darkkhaki"]
ax[0].pie(stroke_gndr, autopct='%1.1f%%', colors=colors)
ax[0].set_title('Gender Total')
ax[0].legend(stroke_gndr.index.tolist(),
          title='Gender',
          bbox_to_anchor=(1,1))

colors = ["skyblue","deepskyblue"]
ax[1].pie(stroke_gndr_1, autopct='%1.1f%%', colors=colors)
ax[1].set_title('Stroke Survivor Based on Gender')
ax[1].legend(stroke_gndr_1.index.tolist(),
          title='Gender',
          bbox_to_anchor=(1,1));

## Stroke Survivor Based on Age

In [None]:
def group(age):
    """Map a numeric age to a ten-year bucket label.

    Parameters
    ----------
    age : int or float
        Age in years. Fractional ages are allowed.

    Returns
    -------
    str or the input value
        "0-9", "10-19", ..., "80-89" for ages in [0, 90), "90+" for ages of
        90 and above. Negative or NaN input is returned unchanged.

    Notes
    -----
    Buckets are half-open decades, so fractional ages such as 9.5 land in
    "0-9" instead of falling between two buckets and coming back as a number.
    That would leave a column of mixed str/float values that cannot be sorted.
    """
    # `not age >= 0` is true for negatives and for NaN (NaN comparisons are False).
    if not age >= 0:
        return age
    if age >= 90:
        return "90+"

    lower = int(age // 10) * 10
    return f"{lower}-{lower + 9}"
    
# Stroke patients only, with age replaced by its ten-year bucket label.
# Built as one chain that returns a new frame: assigning a column on a
# filtered slice can trigger pandas' SettingWithCopyWarning.
stroke_age_gndr = (
    stroke.loc[stroke['stroke'] == 1, ['gender', 'age', 'stroke']]
    .assign(age=lambda survivors: survivors['age'].apply(group))
)

In [None]:
# Number of stroke patients per age bucket, split by gender.
fig, ax = plt.subplots(figsize=(10, 7))
age_sorted = stroke_age_gndr.sort_values(by='age')
sns.countplot(x='age', hue='gender', data=age_sorted, ax=ax)
ax.set_title('Stroke Survivor Based on Age', fontsize=15)
ax.set_ylabel('Survivor')
ax.set_xlabel('Age')

## Hypertension and Heart Disease of Stroke Survivor

In [None]:
# Subset of patients with a recorded stroke; reused by the cells below.
stroke_new = stroke.loc[stroke['stroke'].eq(1)]
stroke_hpy = stroke_new['hypertension'].value_counts()
stroke_hea = stroke_new['heart_disease'].value_counts()
print(stroke_hea)
print(stroke_hpy)

fig, ax = plt.subplots(1, 2, figsize=(12, 7))
fig.tight_layout(pad=9)
hypertension_ax, heart_ax = ax

# Left panel: hypertension among stroke patients.
colors = ["wheat", "orange"]
hypertension_ax.pie(stroke_hpy, colors=colors, autopct='%1.1f%%')
hypertension_ax.set_title('Stroke Survivor Hypertension')
hypertension_ax.legend(['No', 'Yes'], title='Hypertension', bbox_to_anchor=(1, 1))

# Right panel: heart disease among stroke patients.
colors = ["aquamarine", "turquoise"]
heart_ax.pie(stroke_hea, colors=colors, autopct='%1.1f%%')
heart_ax.set_title('Stroke Survivor Heart Disease')
heart_ax.legend(['No', 'Yes'], title='Heart Disease', bbox_to_anchor=(1, 1))

## Married Status

In [None]:
# Marital status among stroke patients.
stroke_em = stroke_new['ever_married'].value_counts()
print(stroke_em)

fig, ax = plt.subplots(figsize=(15,7))
colors = ["darkseagreen","palegreen"]
# Plot the marital-status counts computed above. The pie previously drew the
# hypertension counts (stroke_hpy) under a "Married Status" title.
ax.pie(stroke_em, autopct='%1.1f%%', colors=colors)
ax.set_title('Stroke Survivor Married Status', fontsize=15)
# Take the labels from the series index so they always match the wedge order.
ax.legend(stroke_em.index.tolist(),
          title='Married Status',
          bbox_to_anchor=(1,1));

## Work and Residence Type

In [None]:
# Work type (bar chart) and residence type (pie) among stroke patients.
stroke_res = stroke_new['Residence_type'].value_counts()
print(stroke_res)

fig, ax = plt.subplots(1, 2, figsize=(12, 7))
fig.tight_layout(pad=9)
work_ax, residence_ax = ax

# NOTE(review): this palette is assigned but never passed to countplot, so the
# bars use seaborn's default colours. Confirm whether it was meant to be applied.
colors = ["darksalmon", "coral", "tomato", "mistyrose"]
sns.countplot(x='work_type', data=stroke_new, ax=work_ax)
work_ax.set_title('Stroke Survivor Work Type')
work_ax.set_xlabel('Work Type')
work_ax.set_ylabel('Count')

colors = ["plum", "violet"]
residence_ax.pie(stroke_res, colors=colors, autopct='%1.1f%%')
residence_ax.set_title('Stroke Survivor Residence Type')
residence_ax.legend(['Urban', 'Rural'], title='Residence Type', bbox_to_anchor=(1, 1))

## Distribution of Average Glucose Level and BMI

In [None]:
# Distribution of average glucose level and BMI among stroke patients.
fig, ax = plt.subplots(1, 2, figsize=(12,7))
sns.histplot(ax=ax[0], data=stroke_new['avg_glucose_level'])
ax[0].set_title('Stroke Survivor Average Glucose Level')
ax[0].set_xlabel('Average Glucose Level')

# Use the stroke-only subset here too. The panel is titled "Stroke Survivor BMI"
# but previously plotted BMI for every patient in the dataset.
sns.histplot(ax=ax[1], data=stroke_new['bmi'])
ax[1].set_title('Stroke Survivor BMI')
ax[1].set_xlabel('BMI');

## Smoking Status

In [None]:
# Smoking status counts among stroke patients, labelled through the returned axes.
smoking_ax = sns.countplot(x='smoking_status', data=stroke_new)
smoking_ax.set_title('Stroke Survivor Smoking Status')
smoking_ax.set_ylabel('Count')
smoking_ax.set_xlabel('Smoking Status')

# Stroke Disease Prediction

In [None]:
# Work on an independent copy for modelling and discard the 'id' column.
stroke_predict = stroke.copy().drop(columns=['id'])
stroke_predict.info()

## Encoding

In [None]:
# Binary columns: 1 when the value equals the listed category, 0 for anything else.
binary_flags = {
    'gender': 'Female',
    'ever_married': 'Yes',
    'Residence_type': 'Urban',
}
for column, positive_value in binary_flags.items():
    stroke_predict[column] = (stroke_predict[column] == positive_value).astype('int64')

# One-hot encode the multi-category columns: smoking_status dummies first, then work_type.
stroke_predict = pd.get_dummies(data=stroke_predict, columns=['smoking_status', 'work_type'])

## Scaling

In [None]:
# Standardise the continuous features one column at a time, refitting the same scaler each time.
# NOTE(review): the scaler is fitted on every row before the later train/test
# split, so test rows influence the scaling statistics. Consider fitting on
# the training split only.
scaler = StandardScaler()
for column in ('age', 'avg_glucose_level', 'bmi'):
    stroke_predict[column] = scaler.fit_transform(stroke_predict[[column]])

stroke_predict.head()

## Build Model

In [None]:
# Features are every column except the target; the target is the 'stroke' flag.
X = stroke_predict.drop(['stroke'], axis=1)
y = stroke_predict['stroke']

# Hold out 30% of the rows for testing.
# random_state makes the split, and every score below, reproducible across runs.
# stratify=y keeps the stroke / no-stroke ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Fit Model

#### Decision Tree

In [None]:
# Decision tree baseline.
# random_state fixes the tree's internal randomness so scores are repeatable.
model = DecisionTreeClassifier(random_state=42)
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Score each split once and reuse the values for both the printout and the
# final comparison cell.
train_score1 = model.score(X_train, y_train)
test_score1 = model.score(X_test, y_test)
print("Training Score: ", train_score1)
print("Test Score: ", test_score1)

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)

#### Logistic Regression

In [None]:
# Logistic regression with default settings.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on each split, kept for the final comparison cell.
train_score2 = model.score(X_train, y_train)
test_score2 = model.score(X_test, y_test)
print("Training Score: ", train_score2)
print("Test Score: ", test_score2)

# Confusion matrix and per-class precision/recall on the held-out rows.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)

#### Random Forest

In [None]:
# Random forest.
# random_state fixes the bootstrap and feature sampling so scores are repeatable.
model = RandomForestClassifier(random_state=42)
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Score each split once and reuse the values for both the printout and the
# final comparison cell.
train_score3 = model.score(X_train, y_train)
test_score3 = model.score(X_test, y_test)
print("Training Score: ", train_score3)
print("Test Score: ", test_score3)

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)

#### XGB Classifier

In [None]:
# XGBoost classifier with default settings.
model = XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on each split, kept for the final comparison cell.
train_score4 = model.score(X_train, y_train)
test_score4 = model.score(X_test, y_test)
print("Training Score: ", train_score4)
print("Test Score: ", test_score4)

# Confusion matrix and per-class precision/recall on the held-out rows.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)

#### Gradient Boosting Classifier

In [None]:
# Gradient boosting.
# random_state fixes the estimator's internal randomness so scores are repeatable.
model = GradientBoostingClassifier(random_state=42)
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Score each split once and reuse the values for both the printout and the
# final comparison cell.
train_score5 = model.score(X_train, y_train)
test_score5 = model.score(X_test, y_test)
print("Training Score: ", train_score5)
print("Test Score: ", test_score5)

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)

#### Bagging Classifier

In [None]:
# Bagging ensemble.
# random_state fixes the bootstrap sampling so scores are repeatable.
model = BaggingClassifier(random_state=42)
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Score each split once and reuse the values for both the printout and the
# final comparison cell.
train_score6 = model.score(X_train, y_train)
test_score6 = model.score(X_test, y_test)
print("Training Score: ", train_score6)
print("Test Score: ", test_score6)

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)

# Result (Accuracy and Training Score)

In [None]:
# Print train and test accuracy for each model, in the order they were fitted.
# Every heading after the first starts with a newline to leave a blank line between models.
model_scores = [
    ("Decision Tree", train_score1, test_score1),
    ("\nLogistic Reggresion", train_score2, test_score2),
    ("\nRandom Forest", train_score3, test_score3),
    ("\nXGB Classifier", train_score4, test_score4),
    ("\nGradient Boosting Classifier", train_score5, test_score5),
    ("\nBagging Classifier", train_score6, test_score6),
]
for heading, train_acc, test_acc in model_scores:
    print(heading + "\nTraining Score:", train_acc, "\nAccuracy:", test_acc)