# Our goal is to predict whether a given person had a stroke or not
## We can see from the description of the data that in most cases the given person didn't have a stroke
### So our main goal will be to build a machine learning algorithm that actually predicts a stroke in some cases, even if that costs us a slightly lower prediction success rate (we could get a very high success rate simply by predicting that nobody had a stroke)

## Reading data

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum().sum()

In [None]:
# There are some missing values that we need to take care of
# Let's start by dropping the id column because it doesn't give any info related to strokes
df.drop('id', axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df['stroke'].sum()

In [None]:
# So we mostly have cases that didn't result in a stroke

# Fill the missing bmi values

In [None]:
df['bmi'].hist(bins=40)

In [None]:
# Most people have a bmi ranging from 15 to 40
# I think that the best way to fill bmi is by using the average value of bmi for every age and gender
# Select the 'bmi' column *before* aggregating: calling mean() on the whole grouped frame
# also tries to average the text columns, which raises a TypeError on pandas >= 2.0
# and is wasted work on older versions.
mean_bmi = df.groupby(['age', 'gender'])['bmi'].mean()
mean_bmi.head()

In [None]:
import math
def fill_bmi(df, mean_bmi):
    """Return the bmi of one row, falling back to a group mean when it is missing.

    Meant to be used with ``DataFrame.apply(..., axis=1)``: despite its name,
    ``df`` is a single row, i.e. anything that can be indexed with 'bmi',
    'age' and 'gender'.

    Parameters:
        df: the row to inspect.
        mean_bmi: mean bmi values that can be looked up as
            ``mean_bmi[age][gender]``.

    Returns:
        The row's own bmi when it is present, otherwise the value found in
        ``mean_bmi`` for the row's age and gender (which may itself be NaN).

    Raises:
        KeyError: if the bmi is missing and ``mean_bmi`` has no entry for the
            row's age / gender.
    """
    bmi = df['bmi']
    # pd.isna also recognises None and pd.NA; math.isnan raises a TypeError on those.
    if pd.isna(bmi):
        return mean_bmi[df['age']][df['gender']]
    return bmi
    
df['bmi'] = df.apply(fill_bmi, axis=1, args=(mean_bmi, ))
df.isnull().sum().sum()

In [None]:
# There is one missing value. We need to check why we still have one missing value
df.info()

In [None]:
df[df['bmi'].isnull()]

In [None]:
print(str(mean_bmi[0.48]['Male']) + '\n')
print(mean_bmi[0.48])

In [None]:
# We can see that this male is the only male at his age, so the bmi mean of his group is NaN
# We will assign his value by hand, using the female bmi mean of the same age group
# The row is selected by its content (still missing bmi, age 0.48) instead of the hard-coded
# index label 2030: a fixed label silently hits the wrong row, or appends a new one,
# as soon as the row order or the index of the frame changes.
df.loc[df['bmi'].isnull() & (df['age'] == 0.48), 'bmi'] = mean_bmi[0.48]['Female']

In [None]:
df.isnull().sum().sum()

In [None]:
df.info()

In [None]:
df['bmi'].hist(bins=40)

# Exploratory data analysis

In [None]:
df.head()

In [None]:
sns.pairplot(df)

In [None]:
# Correlation is only defined for numeric columns. Restrict the frame explicitly, because
# DataFrame.corr() raises on the text columns that are still in df here on pandas >= 2.0
# (older versions silently dropped them, so the plotted matrix is the same).
sns.heatmap(df.select_dtypes(include='number').corr())

In [None]:
df['gender'].value_counts()

In [None]:
# Time to verify what this other gender means
df[df['gender'] == 'Other']

In [None]:
# The best thing we can do here is to drop this row
# Look the row up by its value rather than by the hard-coded index label 3116, so the cell
# keeps working when the index changes and does not raise if the row is already gone.
df.drop(df[df['gender'] == 'Other'].index, inplace=True)
df['gender'].value_counts()

In [None]:
sns.set(font_scale=1)
plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df, x='stroke', hue='gender')
plt.title('Distribution of strokes based on gender')
plt.xlabel('Stroke')
plt.ylabel('How many people had a stroke')
for column in ax.patches:
    ax.annotate(column.get_height(), (column.get_x() + 0.15, column.get_height() + 50))
plt.ylim(0, 3100)
plt.show()

In [None]:
data = df.groupby(['gender', 'stroke']).count()['age']
print('Percentage of female that had a stroke: ', (data['Female'][1]/(data['Female'][0] + data['Female'][1])) * 100)
print('Percentage of male that had a stroke: ', (data['Male'][1]/(data['Male'][0] + data['Male'][1])) * 100)

We can see that men have a higher probability of having a stroke than women, but not by much

In [None]:
df['age'].hist(bins=40)

In [None]:
# Percentage of people that had a stroke, for every age present in the data.
# The mean of the 'stroke' column is its sum divided by its count, i.e. exactly the ratio
# we want. Aggregating only that column avoids summing and counting every other column
# of the frame (including the text ones) just to read a single column afterwards.
percentage_had_a_stroke = df.groupby('age')['stroke'].mean() * 100

plt.figure(figsize=(12, 8))
plt.plot(percentage_had_a_stroke.index, percentage_had_a_stroke.values)
plt.title('Distribution of strokes based on age', fontsize=25)
plt.xlabel('Age', fontsize=15)
plt.ylabel('Percentage of people that had a stroke', fontsize=15)

plt.show()

We can see that the older we get the more likely we are to have a stroke

In [None]:
def get_ilnesses_of_given_patient(df):
    """Describe which of the two recorded illnesses one patient has.

    Despite its name, ``df`` is a single row: anything that can be indexed
    with 'heart_disease' and 'hypertension'. A condition counts as present
    only when its value equals 1.

    Returns:
        One of four labels covering every combination of the two conditions.
    """
    has_heart_disease = bool(df['heart_disease'] == 1)
    has_hypertension = bool(df['hypertension'] == 1)

    # Keyed by (heart disease present, hypertension present).
    labels = {
        (True, True): 'Heart disease and hypertension',
        (True, False): 'Heart disease',
        (False, True): 'Hypertension',
        (False, False): 'No ilness',
    }
    return labels[(has_heart_disease, has_hypertension)]

sns.set(font_scale=2)
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(18, 24))
ax1.set_ylim(0, 5000)
ax2.set_ylim(0, 5000)
ax3.set_ylim(0, 5000)
fig.suptitle('Distribution of strokes based on other ilnesses', fontsize=36)

sns.countplot(data=df, x='hypertension', hue='stroke', ax=ax1)
sns.countplot(data=df, x='heart_disease', hue='stroke', ax=ax2)

data = df[['heart_disease', 'hypertension', 'stroke']] 
data = data.assign(ilness = data.apply(get_ilnesses_of_given_patient, axis=1))
sns.countplot(data=data, x='ilness', hue='stroke', ax=ax3)

for column in ax1.patches:
    ax1.annotate(column.get_height(), (column.get_x() + 0.15, column.get_height() + 50))
    
for column in ax2.patches:
    ax2.annotate(column.get_height(), (column.get_x() + 0.15, column.get_height() + 50))
    
for column in ax3.patches:
    ax3.annotate(column.get_height(), (column.get_x() + 0.15, column.get_height() + 50))

fig.tight_layout()
fig.subplots_adjust(top=0.94)
plt.show()

We can see that people that had illnesses are much more likely to have a stroke. 1/4 of the people that had both hypertension and heart disease had a stroke, 1/6 of the people that had heart disease had a stroke and 1/8 of the people that had hypertension had a stroke. This is a lot compared to around 3.5% of people that had a stroke but were not ill beforehand

In [None]:
# Resetting the font scale for future plots
sns.set(font_scale=1)

In [None]:
df['ever_married'].value_counts()

In [None]:
df.groupby('ever_married').describe()['age']

In [None]:
# We can see that this data also includes kids that cannot get married
# To see whether being married is correlated with strokes we will analyse this data in relation to age
plt.figure(figsize=(12, 8))
sns.lineplot(data=df, x='age', y='stroke', hue='ever_married')
plt.ylabel('Chance of having a stroke')

We see that people that got married are less likely to have a stroke than people that didn't get married

In [None]:
df['work_type'].value_counts()

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='work_type', hue='stroke')

In [None]:
df.groupby('work_type').sum()['stroke']

In [None]:
df[df['stroke'] == 0].groupby('work_type').count()['gender']

In [None]:
# Percentage of people in every work type that had a stroke.
# Assuming 'stroke' only holds 0 and 1 (as the rest of the notebook does), its group mean
# is strokes / all people in the group. Dividing the strokes by the number of people
# *without* a stroke instead gives the odds, which overstate the chance of a stroke.
df.groupby('work_type')['stroke'].mean() * 100

We can see that children almost never have a stroke (the same goes for people that never worked, but this may be because we have very few such people in this data). We also see that self-employed people are the most likely to have a stroke compared to the other work types

In [None]:
df['Residence_type'].value_counts()

In [None]:
# It's good that the amount of people in residence_type is well balanced.
sns.countplot(data=df, x='Residence_type', hue='stroke')

We can clearly see that the residence type itself doesn't correlate with strokes

In [None]:
df['avg_glucose_level'].hist(bins=40)

In [None]:
df['bmi'].hist(bins=40)

In [None]:
# Average of every numeric feature, split by whether the person had a stroke.
# numeric_only=True skips the text columns that are still in df at this point; without it
# mean() raises a TypeError on them in pandas >= 2.0.
df.groupby('stroke').mean(numeric_only=True)

We see that people with higher average glucose level or bmi are more likely to have a stroke

In [None]:
plt.figure(figsize=(12, 6))
ax = sns.scatterplot(data=df[df['stroke'] == 0], x='avg_glucose_level', y='bmi', alpha=0.3, label='No Stroke')
sns.scatterplot(data=df[df['stroke'] == 1], x='avg_glucose_level', y='bmi', alpha=1, ax=ax, label='Stroke')
plt.show()

We see that if we have a high glucose level then we are more likely to have a stroke.

In [None]:
df['smoking_status'].value_counts()

In [None]:
# Quick check why there are many Unknown features in smoking_status
df[df['smoking_status'] == 'Unknown'].describe()

In [None]:
# So Unknown is just a mix of people for whom we don't have any knowledge about their smoking status
sns.countplot(data=df, x='smoking_status', hue='stroke')

In [None]:
# Percentage of people in every smoking status group that had a stroke.
# Assuming 'stroke' only holds 0 and 1 (as the rest of the notebook does), its group mean
# is strokes / all people in the group. Dividing the strokes by the number of people
# *without* a stroke instead gives the odds, which overstate the chance of a stroke.
df.groupby('smoking_status')['stroke'].mean() * 100

We see that the people that used to smoke have the highest chance of having a stroke.
Surprisingly, people with an unknown smoking status have the lowest chance of having a stroke

# Preparing data for training

In [None]:
df.head()

In [None]:
# Replace every categorical column by its dummy (indicator) columns.
# drop_first=True leaves out the first category of each column, so no dummy column can be
# derived from the others; the dummy columns are appended in the order listed below.
for column in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    data = pd.get_dummies(df[column], drop_first=True)
    df = pd.concat([df, data], axis=1)

# The original text columns are no longer needed once they are encoded.
df.drop(['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
# Time to do some feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Features are every column except the target; the target is the 'stroke' column.
X = df.drop('stroke', axis=1)
y = df['stroke']

# Hold out 33% of the rows for testing; random_state pins the shuffle so reruns give the same split.
# NOTE(review): the split is not stratified although strokes are rare in this data -
# consider stratify=y so both sets keep the same share of strokes; confirm before changing,
# because every score reported below depends on this exact split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the scaler on the training rows only and apply that same transformation to both sets,
# so that no statistics of the test rows are used for scaling.
norm = MinMaxScaler()
norm.fit(X_train)
X_train = norm.transform(X_train)
X_test = norm.transform(X_test)

In [None]:
X_train

We can see that our data got scaled properly

# Creating machine learning algorithm

## Importing needed libraries for all machine learning algorithms

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
predictions = lg.predict(X_test)

In [None]:
predictions

In [None]:
np.unique(predictions)

In [None]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))

We can see that logistic regression always predicted that a given person didn't have a stroke.
So it's a terrible machine learning algorithm for this data

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

In [None]:
predictions

In [None]:
np.unique(predictions)

In [None]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))

Decision tree didn't do a very good job here but at least it predicted that some people had a stroke

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rt = RandomForestClassifier(random_state=101)
rt.fit(X_train, y_train)
predictions = rt.predict(X_test)

In [None]:
predictions

In [None]:
np.unique(predictions)

In [None]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))

Random Forest did very similarly to the logistic regression

# Neural networks

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
model = Sequential()
model.add(Dense(12, input_dim=len(df.columns)-1, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=150, batch_size=10)

In [None]:
predictions = model.predict(X_test).round()

In [None]:
predictions

In [None]:
np.unique(predictions)

In [None]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))

Neural networks did a little bit better job than random forest but still not good enough 

# Improving machine learning algorithm

Decision tree worked the best so we will try to improve it

In [None]:
predictions = dt.predict(X_train)
predictions

In [None]:
np.unique(predictions)

In [None]:
print(confusion_matrix(y_train, predictions))

In [None]:
print(confusion_matrix(y_train, predictions))
print()
print(classification_report(y_train, predictions))
print()
print('F1 score: ', f1_score(y_train, predictions))

Decision tree perfectly fitted our training data but did not do such a good job at the test data

In [None]:
dt = DecisionTreeClassifier(splitter='random', random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

In [None]:
predictions = dt.predict(X_train)
print(confusion_matrix(y_train, predictions))
print()
print(classification_report(y_train, predictions))
print()
print('F1 score: ', f1_score(y_train, predictions))

Better fit on the test data but we can still improve it

In [None]:
# Time to set max_depth to the optimal value
def max_depths_accuracy():
    """Score decision trees of increasing depth on the train and the test data.

    For every max_depth from 1 to 99 a DecisionTreeClassifier (random splitter,
    random_state=101) is fitted on X_train / y_train. Despite the function
    name, the recorded score is the F1 score, not the accuracy.

    Returns:
        A pair of dicts (train scores, test scores), both keyed by max_depth.
    """
    train_scores, test_scores = {}, {}
    for depth in range(1, 100):
        tree = DecisionTreeClassifier(splitter='random', max_depth=depth, random_state=101)
        tree.fit(X_train, y_train)

        train_scores[depth] = f1_score(y_train, tree.predict(X_train))
        test_scores[depth] = f1_score(y_test, tree.predict(X_test))

    return train_scores, test_scores
    
max_depth_train, max_depth_test = max_depths_accuracy()

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=max_depth_train.keys(), y=max_depth_train.values())

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=max_depth_test.keys(), y=max_depth_test.values())

In [None]:
max_depth_train

In [None]:
max_depth_test

This won't really solve our problem but we will keep max_depth at 22 for future improvements

In [None]:
dt = DecisionTreeClassifier(splitter='random', max_depth=22, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

In [None]:
# Time to set min_samples_split to the optimal value
def min_samples_split_accuracy():
    """Score decision trees with increasing min_samples_split on train and test data.

    For every min_samples_split from 2 to 49 a DecisionTreeClassifier (random
    splitter, max_depth=22, random_state=101) is fitted on X_train / y_train.
    Despite the function name, the recorded score is the F1 score, not the
    accuracy.

    Returns:
        A pair of dicts (train scores, test scores), both keyed by
        min_samples_split.
    """
    train_scores, test_scores = {}, {}
    for split_size in range(2, 50):
        tree = DecisionTreeClassifier(
            splitter='random',
            max_depth=22,
            min_samples_split=split_size,
            random_state=101,
        )
        tree.fit(X_train, y_train)

        train_scores[split_size] = f1_score(y_train, tree.predict(X_train))
        test_scores[split_size] = f1_score(y_test, tree.predict(X_test))

    return train_scores, test_scores

min_samples_split_train, min_samples_split_test = min_samples_split_accuracy()

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=min_samples_split_train.keys(), y=min_samples_split_train.values())

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=min_samples_split_test.keys(), y=min_samples_split_test.values())

In [None]:
min_samples_split_train

In [None]:
min_samples_split_test

Again no improvement

In [None]:
# Maybe min_samples_leaf will improve the algorithm
def min_samples_leaf_accuracy():
    """Score decision trees with increasing min_samples_leaf on train and test data.

    For every min_samples_leaf from 1 to 29 a DecisionTreeClassifier (random
    splitter, max_depth=22, min_samples_split=2, random_state=101) is fitted on
    X_train / y_train. Despite the function name, the recorded score is the F1
    score, not the accuracy.

    Returns:
        A pair of dicts (train scores, test scores), both keyed by
        min_samples_leaf.
    """
    train_scores, test_scores = {}, {}
    for leaf_size in range(1, 30):
        tree = DecisionTreeClassifier(
            splitter='random',
            max_depth=22,
            min_samples_split=2,
            min_samples_leaf=leaf_size,
            random_state=101,
        )
        tree.fit(X_train, y_train)

        train_scores[leaf_size] = f1_score(y_train, tree.predict(X_train))
        test_scores[leaf_size] = f1_score(y_test, tree.predict(X_test))

    return train_scores, test_scores

min_samples_leaf_train, min_samples_leaf_test = min_samples_leaf_accuracy()

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=min_samples_leaf_train.keys(), y=min_samples_leaf_train.values())

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=min_samples_leaf_test.keys(), y=min_samples_leaf_test.values())

In [None]:
min_samples_leaf_train

In [None]:
min_samples_leaf_test

In [None]:
# Maybe criterion will change something
dt = DecisionTreeClassifier(criterion='entropy', splitter='random',  max_depth=22, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

In [None]:
# We stay at gini criterion
dt = DecisionTreeClassifier(criterion='gini', splitter='random',  max_depth=22, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

# Training our improved machine learning algorithm

In [None]:
dt = DecisionTreeClassifier(criterion='gini', splitter='random',  max_depth=22, min_samples_split=2, 
                                min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

In [None]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 Score: ', f1_score(y_test, predictions))

We've got an F1 score of around 0.22. That's a pretty good score for a dataset where we mostly had people that didn't have a stroke