In [None]:
import pandas as pd
import numpy as np


from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split 

from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm.notebook import tqdm
from catboost import Pool, CatBoostClassifier
from sklearn import model_selection


import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
# Suppress every Python warning for the rest of the session to keep cell output
# short. NOTE(review): this also hides deprecation and convergence warnings
# from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Location of the stroke dataset CSV (relative path into the notebook's input directory).
DATA_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Load the raw data into the working frame used by every cell below.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

First, let's check the data for missing (NaN) values.

In [None]:
# Two-colour palette for the boolean null mask drawn below.
colours = ['#000099', '#ffff00']

# Heatmap of df.isnull(): one cell per (row, column) entry of the frame.
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(df.isnull(), cmap=sns.color_palette(colours), ax=ax)
plt.show()

We have some missing values in the bmi column. Rather than dropping those rows, we fill them with the column mean.

In [None]:
# Impute missing bmi values with the bmi mean.
# The fill is restricted to the 'bmi' column: a bare df.fillna(<scalar>) would
# write the bmi mean into NaNs of every other column as well.
df = df.fillna({'bmi': df['bmi'].mean()})

In [None]:
# (rows, columns) of the frame after imputation.
df.shape

Let's take a quick look at the numeric variables. 

In [None]:
# Summary statistics for the three continuous columns.
numeric_columns = ['age', 'avg_glucose_level', 'bmi']
df[numeric_columns].describe()

In [None]:
# Max / min / mean / median age for each gender value.
age_statistics = ['max', 'min', 'mean', 'median']
df_age = df.pivot_table(values='age', index='gender', aggfunc=age_statistics)

In [None]:
# Display the per-gender age statistics computed in the previous cell.
df_age

In [None]:
# Number of rows for each distinct age value.
# NOTE(review): no cell visible in this notebook reads df_age_count; the bar
# chart in the next cell recomputes the same counts.
df_age_count = df['age'].value_counts()

In [None]:
# Bar chart: number of respondents at each age value.
age_counts = df['age'].value_counts()

fig = go.Figure(data=[
    go.Bar(x=age_counts.index, y=age_counts.values, width=0.8),
])

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Count of respondents by age', 'x': 0.5})

fig.show()

In [None]:
# Max / min / mean / median age for each value of the stroke target.
age_statistics = ['max', 'min', 'mean', 'median']
df_stroke = df.pivot_table(values='age', index='stroke', aggfunc=age_statistics)
df_stroke

In [None]:
# Row subsets by target value. Despite the "_count" suffix these are full
# DataFrames (all columns), not counts; the plots below tally them.
had_stroke = df['stroke'] == 1
had_no_stroke = df['stroke'] == 0

df_stroke_count = df.loc[had_stroke]
df_nostroke_count = df.loc[had_no_stroke]

In [None]:
# Bar chart: number of stroke cases at each age value.
stroke_age_counts = df_stroke_count['age'].value_counts()

fig = go.Figure(data=[
    go.Bar(x=stroke_age_counts.index, y=stroke_age_counts.values, width=0.8),
])

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Count of strokes by age', 'x': 0.5})

fig.show()

In [None]:
# Bar chart: number of rows for each value of the stroke target.
target_counts = df['stroke'].value_counts()

target_bar = go.Bar(x=target_counts.index, y=target_counts.values)
fig = go.Figure([target_bar])
fig.show()

In [None]:
# Bar chart: stroke cases per gender value.
gender_counts = df_stroke_count['gender'].value_counts()

gender_bar = go.Bar(x=gender_counts.index, y=gender_counts.values)
fig = go.Figure([gender_bar])
fig.show()

In [None]:
# Grouped bars: hypertension flag counts, split by stroke / no stroke.
stroke_counts = df_stroke_count['hypertension'].value_counts()
nostroke_counts = df_nostroke_count['hypertension'].value_counts()

fig = go.Figure([
    go.Bar(name='Stroke', x=stroke_counts.index, y=stroke_counts.values),
    go.Bar(name='No stroke', x=nostroke_counts.index, y=nostroke_counts.values),
])

# Centred title, set in one layout update.
fig.update_layout(
    title={'text': 'Count of people with (1)/ without (0) hypertension', 'x': 0.5}
)

fig.show()

In [None]:
# Grouped bars: heart_disease flag counts, split by stroke / no stroke.
stroke_counts = df_stroke_count['heart_disease'].value_counts()
nostroke_counts = df_nostroke_count['heart_disease'].value_counts()

fig = go.Figure([
    go.Bar(name='Stroke', x=stroke_counts.index, y=stroke_counts.values),
    go.Bar(name='No stroke', x=nostroke_counts.index, y=nostroke_counts.values),
])

# Centred title, set in one layout update.
fig.update_layout(
    title={'text': 'Count of people with (1)/ without (0) heart diseases', 'x': 0.5}
)

fig.show()

In [None]:
# Grouped bars: work_type counts, one trace per outcome group.
groups = (
    ('Stroke', df_stroke_count),
    ('No stroke', df_nostroke_count),
)
traces = []
for label, subset in groups:
    counts = subset['work_type'].value_counts()
    traces.append(go.Bar(name=label, x=counts.index, y=counts.values))

fig = go.Figure(traces)

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'People by work types', 'x': 0.5})

fig.show()

In [None]:
# Grouped bars: ever_married counts, one trace per outcome group.
groups = (
    ('Stroke', df_stroke_count),
    ('No stroke', df_nostroke_count),
)
traces = []
for label, subset in groups:
    counts = subset['ever_married'].value_counts()
    traces.append(go.Bar(name=label, x=counts.index, y=counts.values))

fig = go.Figure(traces)

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Is ever married', 'x': 0.5})

fig.show()

In [None]:
# Grouped bars: Residence_type counts, one trace per outcome group.
groups = (
    ('Stroke', df_stroke_count),
    ('No stroke', df_nostroke_count),
)
traces = []
for label, subset in groups:
    counts = subset['Residence_type'].value_counts()
    traces.append(go.Bar(name=label, x=counts.index, y=counts.values))

fig = go.Figure(traces)

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Is urban/rural', 'x': 0.5})

fig.show()

In [None]:
# Histogram of avg_glucose_level for the stroke subset (100 bins requested).
fig = px.histogram(
    df_stroke_count,
    x=df_stroke_count['avg_glucose_level'],
    nbins=100,
)
# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Glucose level', 'x': 0.5})

fig.show()


In [None]:
# Histogram of avg_glucose_level for the no-stroke subset (1000 bins requested).
# NOTE(review): the title matches the stroke histogram above and the bin count
# differs (1000 vs 100), so the two figures are not directly comparable.
fig = px.histogram(
    df_nostroke_count,
    x=df_nostroke_count['avg_glucose_level'],
    nbins=1000,
)
# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Glucose level', 'x': 0.5})

fig.show()



In [None]:
# Histogram of bmi for the stroke subset (70 bins requested).
fig = px.histogram(
    df_stroke_count,
    x=df_stroke_count['bmi'],
    nbins=70,
)
# Centred title, set in one layout update.
fig.update_layout(title={'text': 'bmi', 'x': 0.5})

fig.show()

In [None]:
# Histogram of bmi for the no-stroke subset (70 bins requested).
fig = px.histogram(
    df_nostroke_count,
    x=df_nostroke_count['bmi'],
    nbins=70,
)
# Centred title, set in one layout update.
fig.update_layout(title={'text': 'bmi', 'x': 0.5})

fig.show()

In [None]:
# Scatter of bmi vs. average glucose for the stroke subset; marker colour
# encodes gender and marker size encodes age.
scatter_options = dict(
    x="bmi",
    y="avg_glucose_level",
    color='gender',
    size='age',
    height=1000,
)
fig = px.scatter(df_stroke_count, **scatter_options)
fig.show()

In [None]:
# Grouped bars: smoking_status counts, split by stroke / no stroke.
stroke_counts = df_stroke_count['smoking_status'].value_counts()
nostroke_counts = df_nostroke_count['smoking_status'].value_counts()

fig = go.Figure([
    go.Bar(name='Stroke', x=stroke_counts.index, y=stroke_counts.values),
    go.Bar(name='No stroke', x=nostroke_counts.index, y=nostroke_counts.values),
])

# Centred title, set in one layout update.
fig.update_layout(title={'text': 'Smoking status', 'x': 0.5})

fig.show()

In [None]:
# Pie chart: share of each smoking_status value among the stroke cases.
smoking_counts = df_stroke_count['smoking_status'].value_counts()

fig = px.pie(values=smoking_counts.values, names=smoking_counts.index)
fig.show()

Let's look at the correlation of features to the target variable

Converting text columns to numeric for further work

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each text column with integer codes, overwriting the column in df.
# The single encoder is re-fitted per column, so after the loop it only holds
# the classes of the last column ('smoking_status').
labelencoder = LabelEncoder()
text_columns = ('gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status')
for column in text_columns:
    df[column] = labelencoder.fit_transform(df[column])

In [None]:
from termcolor import colored
from scipy.stats import kendalltau

# Thresholds do not change between iterations, so they are defined once.
strong_coef = 0.6  # |tau| above this is reported as a strong association
alpha = 0.05       # significance level applied to the p-value

# Kendall's tau of every column of df (the target itself included) vs. 'stroke'.
for i in df.columns:
    coef, p = kendalltau(df[i], df['stroke'])
    print(colored(f"{i}", 'blue'))
    # Compare the magnitude of tau: a strongly negative coefficient is as
    # strong an association as a strongly positive one, and a signed
    # comparison would report it as weak.
    if abs(coef) > strong_coef:
        print(colored('Kendall correlation |coef| more than 0.6: %.3f' % coef, 'yellow'))
    else:
        print('Kendall correlation |coef| not more than 0.6: %.3f' % coef)
    # p above alpha: the association is not statistically significant.
    if p > alpha:
        print(colored('Variables are not correlated p=%.3f' % p, 'red'))
    else:
        print(colored('Variables are correlated p=%.3f' % p, 'green'))

In [None]:
# Pairwise Kendall correlation matrix over the columns of df (all numeric
# after the label-encoding cell); drawn as a heatmap in the next cell.
correlation = df.corr(method="kendall")

In [None]:
# Annotated heatmap of the Kendall matrix; colour scale pinned to [-1, 1].
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(correlation, annot=True, cmap="RdYlGn", vmin=-1, vmax=+1, ax=ax)
ax.set_title('Kendall Correlation')
plt.show()

Conclusions from the exploratory analysis:

    Although we have a sample of more than 5,000 respondents, the target variable is heavily imbalanced: stroke cases are a small fraction of all observations. This poses the first problem - even a reasonably good predictive model will almost certainly be good at predicting when a person will not have a stroke, and poor at predicting when a person will.
    The second problem is the lack of a clear correlation between the features and the target. There is no clear picture of which features influence the target variable the most. Intuitively, these should be age and the medical features.

To build the model, I will use CatBoostClassifier. Our goal is to maximize recall in order to minimize false negatives. The model is evaluated with the F1 score (given the class imbalance of the target).

In [None]:
# List the column names available for modelling.
df.columns

In [None]:
# 80/20 shuffled split of the full frame, stratified on the target so both
# parts keep the same stroke / no-stroke proportions; the seed is fixed.
split_options = dict(
    test_size=0.20,
    shuffle=True,
    stratify=df['stroke'],
    random_state=10,
)
train, valid = model_selection.train_test_split(df, **split_options)

In [None]:
# Columns passed to the model as inputs.
feature_names = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]

# Column the model predicts.
target = 'stroke'

In [None]:
# CatBoost configuration: log-loss objective, TotalF1 as the reported
# evaluation metric, progress printed every 50 iterations.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.2,
    'loss_function': 'Logloss',
    'eval_metric': 'TotalF1',
    'task_type': "CPU",
    'verbose': 50,
}
model = CatBoostClassifier(**catboost_params)

In [None]:
# Fit on the training part; the validation part is passed as eval_set so the
# metric is tracked on it, and plot=True draws the learning curves.
train_features, train_labels = train[feature_names], train[target]
valid_features, valid_labels = valid[feature_names], valid[target]

model.fit(
    train_features,
    train_labels,
    eval_set=(valid_features, valid_labels),
    plot=True,
)

In [None]:
# Per-class precision / recall / F1 of the fitted model on the validation part.
valid_true = valid.stroke.values
valid_pred = model.predict(valid[feature_names])
print(classification_report(valid_true, valid_pred))

As expected, the model is fairly accurate overall, but it performs very poorly at identifying people who have had a stroke. Thus, it does not serve the main purpose of our study. Since we have no other data, the only option is to artificially enlarge the minority class using the SMOTE method.

In [None]:
from imblearn.over_sampling import SMOTE

# Inputs are restricted to the model's feature columns (feature_names, defined
# in an earlier cell). Any other non-target column of df (e.g. a row
# identifier) would otherwise be interpolated by SMOTE and would take part in
# its nearest-neighbour distances.
X = df[feature_names]
y = df[['stroke']]

# Hold out 20% of the real data before oversampling; stratify so the rare
# positive class is represented proportionally in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17, stratify=y
)
columns = X_train.columns

# Oversample the training part only. The sampler is named `smote` rather than
# `os` so it does not shadow the standard-library module name.
# NOTE(review): the label-encoded categorical columns are interpolated as if
# they were continuous; imblearn's SMOTENC targets mixed data - consider it.
smote = SMOTE(random_state=17)
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['stroke'])

# Sanity check: total size and per-class counts after resampling.
print("Data shape",len(os_data_X))
print("Stroke 0 shape",len(os_data_y[os_data_y['stroke']==0]))
print("Stroke 1 shape",len(os_data_y[os_data_y['stroke']==1]))

In [None]:
# Rebuild one frame from the oversampled features and labels:
# y is the resampled target, X becomes features + 'stroke' column.
y = os_data_y
X = os_data_X.join(y)

In [None]:
# Train on the whole SMOTE-balanced frame X (features + 'stroke'), but validate
# on the untouched hold-out split (X_test / y_test) created before oversampling.
# Splitting X itself would put synthetic samples, interpolated from training
# rows, into the validation set and make the reported metrics unrealistically
# optimistic.
train = X
valid = X_test.join(y_test)

In [None]:
# Columns passed to the model as inputs (same list as for the first model).
feature_names = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]

# Column the model predicts.
target = 'stroke'

In [None]:
# Re-fit the same CatBoost model object on the current train / valid frames;
# the validation part is passed as eval_set and plot=True draws the curves.
train_features, train_labels = train[feature_names], train[target]
valid_features, valid_labels = valid[feature_names], valid[target]

model.fit(
    train_features,
    train_labels,
    eval_set=(valid_features, valid_labels),
    plot=True,
)

In [None]:
# Per-class precision / recall / F1 of the re-fitted model on the validation part.
valid_true = valid.stroke.values
valid_pred = model.predict(valid[feature_names])
print(classification_report(valid_true, valid_pred))

Let's look at the feature importances of the trained model.

In [None]:
# Feature importances of the fitted model as a table with 'Feature Id' and
# 'Importances' columns, drawn as a bar chart.
importance = model.get_feature_importance(prettified=True)

importance_bar = go.Bar(
    x=importance['Feature Id'],
    y=importance['Importances'],
)
fig = go.Figure([importance_bar])
fig.show()