In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

sns.set_theme(style="darkgrid")

In [None]:
# Location of the stroke-prediction CSV, relative to the notebook's working directory.
DATA_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
# Load the raw dataset into the frame used by the following cells.
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Print the frame's column summary (names, non-null counts, dtypes).
df.info()

# Data Preprocessing

In [None]:
# Cast the stroke / hypertension / heart_disease columns to 64-bit integers.
for flag_column in ('stroke', 'hypertension', 'heart_disease'):
    df[flag_column] = df[flag_column].astype("int64")

In [None]:
# Names of the numeric feature columns; reused later for the pair plot.
numeric = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [None]:
# Summary statistics of the frame's columns, shown as the cell output.
df.describe()

In [None]:
# Number of non-null `id` values per gender, returned as a two-column frame.
df.groupby('gender')['id'].count().reset_index()

In [None]:
# Drop the rows whose gender is 'Other'. The explicit copy makes `df` an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning against a filtered slice.
df = df[df['gender'] != 'Other'].copy()

In [None]:
# Impute missing BMI values with the mean of the observed BMI values.
bmi_mean = df['bmi'].mean()
# fillna is the vectorised equivalent of a per-row NaN check; assign() returns
# a new frame, so nothing is written into a possibly-sliced `df`.
df = df.assign(bmi=df['bmi'].fillna(bmi_mean))
df['bmi'].head()

# Data Composition

In [None]:
# Side-by-side overview: record count per gender (left) and an age histogram
# stacked by gender (right).
fig, (gender_ax, age_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

sns.countplot(data=df, x='gender', ax=gender_ax)
gender_ax.set(xlabel=None, title='Gender')

sns.histplot(data=df, x="age", hue="gender", multiple='stack', ax=age_ax)
age_ax.set(xlabel=None, title='Age')

The dataset contains more female than male records, with ages ranging from 0 to 80 years.

In [None]:
# 2x2 grid of horizontal count plots, one per categorical column.
fig, ax = plt.subplots(2, 2, figsize=(12, 7), sharex=True)
fig.suptitle("People Characteristic")

# (axes, column, panel title, tick labels) for each panel.
# NOTE(review): the tick labels are applied positionally; confirm they match
# the order in which seaborn draws the categories of each column.
count_panels = [
    (ax[0, 0], 'work_type', 'Work Type',
     ['Private', 'Self-employed', 'Govt Job', 'Children', 'Never Worked']),
    (ax[0, 1], 'smoking_status', 'Smoking Status',
     ['Formerly Smoked', 'Never Smoked', 'Smokes', 'Unknown']),
    (ax[1, 0], 'ever_married', 'Marriage Status',
     ['Married', 'Single']),
    (ax[1, 1], 'Residence_type', 'Residence Type',
     ['Urban', 'Rural']),
]

for panel_ax, column, panel_title, tick_labels in count_panels:
    sns.countplot(y=column, data=df, ax=panel_ax)
    panel_ax.set(xlabel=None, ylabel=None, title=panel_title, yticklabels=tick_labels)

fig.tight_layout()

The data has four categorical characteristics: <br>
Work Type       : Private has the most records. <br>
Smoking Status  : Never Smoked has the most records, followed by Unknown. <br>
Marriage Status : There are more married than single people. <br>
Residence Type  : Urban and Rural are almost balanced.

In [None]:
# Kernel-density estimates of BMI (left) and average glucose level (right),
# drawn on a shared y-axis.
fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(x=df['bmi'], fill=True, ax=ax[0])
ax[0].set(xlabel=None, title='BMI')
sns.kdeplot(x=df['avg_glucose_level'], fill=True, ax=ax[1])
ax[1].set(xlabel=None, title='Glucose Level')
fig.tight_layout()

BMI density is highest at around 30, and glucose level density is highest at around 75.

In [None]:
# Age distribution per gender, restricted to people with hypertension (left)
# and to people with heart disease (right).
fig, ax = plt.subplots(1, 2, figsize=(10, 6), sharey=True)

hypertension_rows = df[df['hypertension'] == 1]
heart_disease_rows = df[df['heart_disease'] == 1]

sns.boxplot(data=hypertension_rows, x='gender', y='age', ax=ax[0])
ax[0].set(xlabel=None, title='Hypertension')

sns.boxplot(data=heart_disease_rows, x='gender', y='age', ax=ax[1])
ax[1].set(xlabel=None, ylabel=None, title='Heart Disease')

fig.tight_layout()

Most females with hypertension are between 51 and 72 years old, and most males between 52 and 77 years old. <br>
For heart disease, most females are between 63 and 79 years old, and most males between 60 and 78 years old.

# Visualization of Stroke's Data

In [None]:
# Keep only the records whose stroke flag equals 1, then display them.
stroke_df = df.loc[df['stroke'].eq(1)]
stroke_df

In [None]:
# Aggregates feeding the two panels: stroke cases per gender, and the number
# of records per stroke label.
stroke_df_by_gender = stroke_df.groupby('gender')['stroke'].count().reset_index()
df_by_stroke = df.groupby('stroke')['id'].count().reset_index()

# Palette entries for the pie slices and for the per-gender bars.
stroke_colors = [px.colors.qualitative.Set2[7], px.colors.qualitative.Dark2[7]]
gender_colors = [px.colors.qualitative.Plotly[1], px.colors.qualitative.Plotly[0]]

# Left: pie of healthy vs. stroke records. Right: horizontal bars per gender.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('', 'Stroke data Gender-Wised'),
    specs=[[{'type': 'domain'}, {'type': 'bar'}]],
)

stroke_share_pie = go.Pie(
    labels=['Healthy', 'Stroke'],
    values=df_by_stroke['id'],
    hoverinfo='percent+label+value',
    marker=dict(colors=stroke_colors),
    rotation=100,
)
fig.add_trace(stroke_share_pie, row=1, col=1)

cases_by_gender_bar = go.Bar(
    y=stroke_df_by_gender['gender'],
    x=stroke_df_by_gender['stroke'],
    texttemplate='%{x} Cases',
    textposition='inside',
    orientation='h',
    hoverinfo='x+y',
    marker=dict(color=gender_colors),
)
fig.add_trace(cases_by_gender_bar, row=1, col=2)

fig.update_layout(
    height=600,
    width=800,
    showlegend=False,
    title_text="Stroke Count",
)
fig.show()

In [None]:
# Ten-year age bins (right-closed, as produced by pd.cut) and their labels.
labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90']
age_bin_edges = np.arange(0, 100, 10)


def _stroke_counts_by_age(cases):
    """Count the rows of `cases` in each ten-year age bin.

    Parameters
    ----------
    cases : pandas.DataFrame
        Frame with an ``age`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``age`` (the bin label) and ``stroke`` (rows in that bin).
    """
    age_bucket = pd.cut(cases["age"], age_bin_edges, labels=labels)
    # observed=False keeps empty bins as zero-count rows, so both genders
    # always expose the same x categories; it is passed explicitly because
    # pandas is changing the default for categorical groupers.
    return cases.groupby(age_bucket, observed=False).size().reset_index(name='stroke')


male_stroke_df = _stroke_counts_by_age(stroke_df[stroke_df['gender'] == 'Male'])
female_stroke_df = _stroke_counts_by_age(stroke_df[stroke_df['gender'] == 'Female'])

# Grouped bars: one trace per gender over the shared age bins.
fig = go.Figure()
for trace_name, age_counts, bar_color in (
    ('Male', male_stroke_df, px.colors.qualitative.Plotly[0]),
    ('Female', female_stroke_df, px.colors.qualitative.Plotly[1]),
):
    fig.add_trace(go.Bar(
        x=age_counts['age'],
        y=age_counts['stroke'],
        name=trace_name,
        marker_color=bar_color,
    ))

fig.update_layout(barmode='group', title='Stroke data Age-Wised')
fig.show()

For both genders, most cases occur in people aged around 70-80.

In [None]:
# One-hot encode the categorical columns and attach the stroke label, so that
# summing per label gives the number of records in each category.
categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married']
parameter = pd.get_dummies(df[categorical_columns])
parameter['stroke'] = df['stroke']

# Rows become the dummy columns, columns become the two stroke labels.
count_by_parameter = parameter.groupby('stroke').sum().transpose()
count_by_parameter.columns = ['non_stroke', 'stroke']
count_by_parameter = count_by_parameter.reset_index()

# Percentage of each category's records that are stroke cases.
count_by_parameter['p_stroke'] = count_by_parameter['stroke']/(count_by_parameter['non_stroke']+count_by_parameter['stroke']) * 100

# Human-readable labels, assigned positionally over the dummy-column order.
# NOTE(review): confirm this order against the columns get_dummies produces.
# 'Formerly Smoked' fixes the earlier 'Formely Smoked' typo.
count_by_parameter['index'] = [
    'Female', 'Male',
    'Govt Job', 'Never Worked', 'Private', 'Self-employed', 'Children',
    'Rural', 'Urban',
    'Unknown', 'Formerly Smoked', 'Never Smoked', 'Smokes',
    'Single', 'Married',
]

In [None]:
# Bare reference to the per-category count table (rendered only when it is
# the last expression of its cell).
count_by_parameter

# 2x2 grid of stacked horizontal bars: non-stroke vs. stroke counts for each
# category of the four categorical characteristics.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=('Work Type', 'Residence Type', 'Smoking Status', 'Marriage Status'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}], [{'type': 'bar'}, {'type': 'bar'}]],
    shared_xaxes=True,
)

# (subplot row, subplot col, first row, end row) - positional row ranges of
# count_by_parameter that belong to each panel.
panel_ranges = [
    (1, 1, 2, 7),
    (1, 2, 7, 9),
    (2, 1, 9, 13),
    (2, 2, 13, 15),
]

# Bar colour per count column; iteration order decides the stacking order.
series_colors = {
    'non_stroke': px.colors.qualitative.Plotly[2],
    'stroke': px.colors.qualitative.Plotly[1],
}

for count_column, bar_color in series_colors.items():
    for panel_row, panel_col, first, end in panel_ranges:
        fig.add_trace(
            go.Bar(
                y=count_by_parameter['index'][first:end],
                x=count_by_parameter[count_column][first:end],
                marker_color=bar_color,
                name=count_column,
                orientation='h',
            ),
            row=panel_row, col=panel_col
        )

fig.update_layout(barmode='stack', title='Stroke Cases Category-Wised', showlegend=False)

People with a private job have the most cases, 149. <br>
People living in urban residences are more at risk than those living in rural residences. <br>
People who never smoked have the most cases, 90. <br>
People who are married are more at risk than those who are single.

In [None]:
# Split the records by stroke label.
stroke_df = df[df['stroke'] == 1]
non_stroke_df = df[df['stroke'] == 0]

# One panel per measurement, each holding a Healthy and a Stroke box trace.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'xy'}, {'type': 'xy'}]],
    subplot_titles=['Avg Glucose Level', 'BMI'],
)

# (trace name, rows, marker colour) for the two groups, in drawing order.
box_groups = (
    ('Healthy', non_stroke_df, px.colors.qualitative.Plotly[2]),
    ('Stroke', stroke_df, px.colors.qualitative.Plotly[1]),
)

for panel_col, measure in enumerate(('avg_glucose_level', 'bmi'), start=1):
    for group_name, group_rows, group_color in box_groups:
        fig.add_trace(
            go.Box(
                x=group_rows[measure],
                boxpoints='all',
                name=group_name,
                marker=dict(color=group_color),
            ),
            row=1, col=panel_col
        )

fig.update_layout(showlegend=False)
fig.show()

The distribution of average glucose level is wider for people affected by stroke than for healthy people. <br>
In terms of BMI, however, their distribution is quite narrow.

In [None]:
# Parallel-categories diagram of the three condition flags, coloured by the
# stroke flag, with display-friendly column names.
data = df[['hypertension', 'heart_disease', 'stroke']].rename(
    columns={
        'hypertension': 'Hypertension',
        'heart_disease': 'Heart Disease',
        'stroke': 'Stroke',
    }
)
fig = px.parallel_categories(
    data,
    color='Stroke',
    color_continuous_scale=px.colors.sequential.Brwnyl,
    title='Stroke data Disease-Wised',
)
fig.update_layout(coloraxis_showscale=False)
fig.show()

The largest group among people who have had a stroke is people who have neither hypertension nor heart disease (249 people).

In [None]:
# Pairwise scatter / distribution plots of the numeric features, coloured by
# the stroke flag.
sns.pairplot(data=df[[*numeric, 'stroke']], hue='stroke')

# Feature Engineering

In [None]:
# Encode marital status as an integer flag: 1 for 'Yes', 0 for anything else.
df['ever_married'] = (df['ever_married'] == 'Yes').astype('int64')

In [None]:
# Replace each categorical column with its one-hot indicator columns, which
# are named after the raw category values and appended in this order.
categorical_cols = ['work_type', 'Residence_type', 'smoking_status', 'gender']
work_type, residence_type, smoking_status, gender = (
    pd.get_dummies(df[column]) for column in categorical_cols
)
df = pd.concat(
    [df.drop(columns=categorical_cols), work_type, residence_type, smoking_status, gender],
    axis=1,
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(15, 15))
correlation_matrix = df.corr()
sns.heatmap(data=correlation_matrix, annot=True, fmt='.2f')

# Oversampling

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class with SMOTE so both stroke labels are equally
# represented in (X, y).
balance = SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5)
# The feature matrix must exclude the target: leaving 'stroke' in X lets every
# model read the label directly and makes the reported scores meaningless.
# 'id' is dropped as well, as before.
features = df.drop(columns=['id', 'stroke'])
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows also end up in the test set; consider resampling the training part only.
X, y = balance.fit_resample(features, df['stroke'])

In [None]:
# Count of each target value in the resampled labels.
y.value_counts()

# Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score

In [None]:
# Hold out 33% of the resampled rows for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Decision Tree

In [None]:
from sklearn import tree

In [None]:
# Fit a decision tree on the training split. A fixed random_state makes the
# fitted tree (and the scores below) reproducible across runs, in line with
# the seeded resampling and train/test split.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

In [None]:
# Predict both splits first; the test predictions stay in `y_predict` because
# the confusion-matrix cell reads that name.
train_predictions = clf.predict(X_train)
y_predict = clf.predict(X_test)

# Accuracy and precision per split, rounded to four decimals.
acc_score_train = np.round(accuracy_score(y_train, train_predictions), 4)
precision_score_train = np.round(precision_score(y_train, train_predictions), 4)
acc_score_test = np.round(accuracy_score(y_test, y_predict), 4)
precision_score_test = np.round(precision_score(y_test, y_predict), 4)

# Per-class reports: training split first, then the held-out split.
print('Train set:\n')
print(classification_report(y_train, train_predictions))

print('\nTest set:\n')
print(classification_report(y_test, y_predict))

In [None]:
# Confusion matrix of the test labels against the predictions currently held
# in y_predict.
confusion_matrix(y_test, y_predict)

# Catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Fit a CatBoost classifier on the training split. verbose=False suppresses
# the per-iteration training log, which would otherwise flood the cell output.
catboost = CatBoostClassifier(verbose=False)
catboost.fit(X_train, y_train)

In [None]:
# Predict both splits first; the test predictions stay in `y_predict` because
# the confusion-matrix cell reads that name.
train_predictions = catboost.predict(X_train)
y_predict = catboost.predict(X_test)

# Accuracy and precision per split, rounded to four decimals.
acc_score_train = np.round(accuracy_score(y_train, train_predictions), 4)
precision_score_train = np.round(precision_score(y_train, train_predictions), 4)
acc_score_test = np.round(accuracy_score(y_test, y_predict), 4)
precision_score_test = np.round(precision_score(y_test, y_predict), 4)

# Per-class reports: training split first, then the held-out split.
print('Train set:\n')
print(classification_report(y_train, train_predictions))

print('\nTest set:\n')
print(classification_report(y_test, y_predict))

In [None]:
# Confusion matrix of the test labels against the predictions currently held
# in y_predict.
confusion_matrix(y_test, y_predict)