In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import plotly.express as px

*Importing the dataset onto the platform*

In [None]:
# Read the stroke-prediction CSV into a DataFrame and preview the first rows.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

df.head()

*Changing the column names*

In [None]:
# Replacement column labels; they are assigned positionally, so the list must
# contain exactly one entry per existing column.
headers = [
    "ID",
    "Gender",
    "Age",
    "Hypertension",
    "Heart Disease",
    "Ever Married",
    "Work Type",
    "Residence Type",
    "Avg. Glucose Level",
    "BMI",
    "Smoking Status",
    "Stroke",
]

df.columns = headers

df.head()

*Checking data types of the variables in the dataset to see if they are correct*

In [None]:
# Display the dtype of every column.
df.dtypes

*Checking for Null Values*

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Count the non-null entries in each column.
df.count()

Since the dataset has 5,110 rows and the "BMI" variable is missing in only 201 of them (about 4%), these rows can be removed without materially affecting the analysis to be conducted.

*Removing Null values from the dataset and resetting the index*

In [None]:
# Drop every row containing a missing value, then renumber the rows from 0.
df = df.dropna(axis=0).reset_index(drop=True)

# Confirm that no missing values remain.
df.isnull().sum()

*Replacing 0 and 1 with "No" and "Yes" in Hypertension and Heart Disease columns*

In [None]:
# Recode the 0/1 flags as "No"/"Yes" labels.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df["col"]: that chained in-place form may
# operate on a temporary object and leave df itself unchanged.
# replace() is also safe to re-run: values that are already "No"/"Yes" are
# left untouched.
for column in ("Hypertension", "Heart Disease"):
    df[column] = df[column].replace({0: "No", 1: "Yes"})

df.head()

*Studying the variables and data that is in the dataset*

In [None]:
# Summary statistics for all columns, not only the numeric ones.
df.describe(include='all')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One figure with a 4x3 grid of panels: box plots for the numeric columns and
# histograms for the categorical ones.
fig = make_subplots(rows=4, cols=3)

# (column, grid row, grid column) for each panel, in the order traces are added.
box_panels = [
    ("Avg. Glucose Level", 1, 1),
    ("Age", 1, 2),
    ("BMI", 1, 3),
]
histogram_panels = [
    ("Gender", 2, 1),
    ("Ever Married", 2, 2),
    ("Work Type", 2, 3),
    ("Residence Type", 3, 1),
    ("Smoking Status", 3, 2),
    ("Hypertension", 3, 3),
    ("Heart Disease", 4, 2),
]

for column, grid_row, grid_col in box_panels:
    fig.add_trace(go.Box(y=df[column], name=column), row=grid_row, col=grid_col)

for column, grid_row, grid_col in histogram_panels:
    fig.add_trace(go.Histogram(x=df[column], name=column), row=grid_row, col=grid_col)

fig.update_layout(
    height=1500,
    width=1000,
    title_text="Variables in the Dataset",
    title_font_size=22,
    title_y=0.97,
    title_x=0.45,
    legend_title="Variable",
)

fig.show()


In [None]:
# Preview the first rows of the current DataFrame.
df.head()

Categorising BMI into "Underweight", "Normal Weight", "Overweight" and "Obese"

In [None]:
# Bin BMI into four weight categories using contiguous, non-overlapping
# intervals so that every row receives a label:
#   [-inf, 18.5) Underweight     [18.5, 25) Normal Weight
#   [25, 30)     Overweight      [30, inf)  Obese
# An if/elif chain with gaps between the ranges (for example 24.9-25, or a BMI
# of exactly 30) skips those rows, which makes the label list shorter than df
# and shifts every later label onto the wrong row.
bmi_bin_edges = [float("-inf"), 18.5, 25, 30, float("inf")]
bmi_labels = ["Underweight", "Normal Weight", "Overweight", "Obese"]

# right=False closes each interval on the left and opens it on the right.
# Casting to object stores plain string labels (a missing BMI stays NaN), so
# later groupby calls only report the categories that actually occur.
df["BMI Category"] = pd.cut(
    df["BMI"], bins=bmi_bin_edges, labels=bmi_labels, right=False
).astype(object)

df.head(10)

Since the dataset contains unequal numbers of males and females, the analysis below uses a subset with an equal number of each, so that comparisons that use gender as a variable are not distorted by the difference in group size.

In [None]:
# Take the first 2000 rows (at most) of each gender, in their current order,
# and stack them into a single frame with a fresh 0..N-1 index.
is_male = df["Gender"] == "Male"
is_female = df["Gender"] == "Female"

male = df.loc[is_male].iloc[:2000]
female = df.loc[is_female].iloc[:2000]

df = pd.concat([male, female], axis=0, join='outer', ignore_index=True)

df

In [None]:
# Layout settings shared by the three figures in this cell.
age_gender_layout = dict(
    height=550,
    width=750,
    title_font_size=22,
    title_y=0.97,
    yaxis_title="Stroke Occurence",
)

# Total strokes per gender.
df_gender = df.groupby("Gender", as_index=False)["Stroke"].sum()
fig2 = px.bar(
    df_gender, x="Gender", y="Stroke", color="Gender", barmode='group', opacity=1,
    color_discrete_sequence=px.colors.qualitative.Dark2,
)
fig2.update_layout(title_text="Occurence of Strokes per Gender", title_x=0.50, **age_gender_layout)

# Total strokes per age, binned into (at most) 9 age brackets by the histogram.
df_age = df.groupby("Age", as_index=False)["Stroke"].sum()
fig = px.histogram(
    df_age, x="Age", y="Stroke", barmode="group", nbins=9, opacity=0.75, range_x=[0,85],
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.update_layout(title_text="Occurence of Strokes per Age Bracket", title_x=0.50, **age_gender_layout)

# Total strokes per age bracket, split by gender.
df_age_gender = df.groupby(["Gender","Age"], as_index=False)["Stroke"].sum()
fig3 = px.histogram(
    df_age_gender, x="Age", y="Stroke", color="Gender", barmode="group", nbins=9, opacity=1,
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig3.update_layout(title_text="Occurence of Strokes per Age Bracket & Gender", title_x=0.48, **age_gender_layout)

# Display order: gender first, then age, then age by gender.
fig2.show()
fig.show()
fig3.show()

In [None]:
# Layout settings shared by both figures in this cell.
bmi_layout = dict(
    height=700,
    width=750,
    title_font_size=22,
    title_y=0.97,
    title_x=0.48,
    yaxis_title="Stroke Occurence",
)

# Total strokes per BMI category.
df_BMI = df.groupby(["BMI Category"], as_index=False)["Stroke"].sum()
fig = px.bar(
    df_BMI, x="BMI Category", y="Stroke", color="BMI Category", opacity=1,
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_layout(title_text="Occurence of Strokes per BMI Category", **bmi_layout)

# Total strokes per BMI category, split by gender.
df_BMI_gender = df.groupby(["BMI Category","Gender"], as_index=False)["Stroke"].sum()
fig2 = px.bar(
    df_BMI_gender, x="BMI Category", y="Stroke", color="Gender", barmode='group', opacity=1,
    color_discrete_sequence=px.colors.qualitative.Dark2,
)
fig2.update_layout(title_text="Occurence of Strokes per Gender & BMI Category", **bmi_layout)

fig.show()
fig2.show()

In [None]:
# Styling shared by the three bar charts in this cell.
lifestyle_bar_style = dict(opacity=1, color_discrete_sequence=px.colors.qualitative.Prism)
lifestyle_layout = dict(
    height=500,
    width=650,
    title_font_size=22,
    title_y=0.97,
    title_x=0.48,
    yaxis_title="Stroke Occurence",
)

# Total strokes per smoking status.
df_smoking_status = df.groupby(["Smoking Status"], as_index=False)["Stroke"].sum()
fig = px.bar(df_smoking_status, x="Smoking Status", y="Stroke", color="Smoking Status",
             **lifestyle_bar_style)
fig.update_layout(title_text="Occurence of Strokes per Smoking Status", **lifestyle_layout)

# Total strokes per work type.
df_work_type = df.groupby(["Work Type"], as_index=False)["Stroke"].sum()
fig2 = px.bar(df_work_type, x="Work Type", y="Stroke", color="Work Type",
              **lifestyle_bar_style)
fig2.update_layout(title_text="Occurence of Strokes per Work Type", **lifestyle_layout)

# Total strokes per residence type.
df_residence_type = df.groupby(["Residence Type"], as_index=False)["Stroke"].sum()
fig3 = px.bar(df_residence_type, x="Residence Type", y="Stroke", color="Residence Type",
              **lifestyle_bar_style)
fig3.update_layout(title_text="Occurence of Strokes per Residence Type", **lifestyle_layout)

fig.show()
fig2.show()
fig3.show()

In [None]:
# Styling shared by the three bar charts in this cell.
health_bar_style = dict(opacity=1, color_discrete_sequence=px.colors.qualitative.Set1)
health_layout = dict(
    height=500,
    width=550,
    title_font_size=22,
    title_y=0.97,
    title_x=0.48,
    yaxis_title="Stroke Occurence",
)

# Total strokes for people with and without heart disease.
df_heartdisease = df.groupby(["Heart Disease"], as_index=False)["Stroke"].sum()
fig = px.bar(df_heartdisease, x="Heart Disease", y="Stroke", color="Heart Disease",
             **health_bar_style)
fig.update_layout(title_text="Occurence of Strokes w/wo Heart Disease", **health_layout)

# Total strokes for people with and without hypertension.
df_hypertension = df.groupby(["Hypertension"], as_index=False)["Stroke"].sum()
fig2 = px.bar(df_hypertension, x="Hypertension", y="Stroke", color="Hypertension",
              **health_bar_style)
fig2.update_layout(title_text="Occurence of Strokes w/wo Hypertension", **health_layout)

# Total strokes by marital history.
df_married = df.groupby(["Ever Married"], as_index=False)["Stroke"].sum()
fig3 = px.bar(df_married, x="Ever Married", y="Stroke", color="Ever Married",
              **health_bar_style)
fig3.update_layout(title_text="Occurence of Strokes (Married or Not)", **health_layout)

fig.show()
fig2.show()
fig3.show()

In [None]:
# Number of rows where Stroke == 1.
len(df[df["Stroke"]==1])

In [None]:
# Number of rows where Stroke == 0.
len(df[df["Stroke"]==0])

In [None]:
# Columns carried into the modelling dataset: the candidate predictors plus
# the "Stroke" target.
model_columns = [
    "Gender",
    "Age",
    "Hypertension",
    "Heart Disease",
    "Ever Married",
    "Work Type",
    "Avg. Glucose Level",
    "BMI",
    "Stroke",
]
df2 = df[model_columns]

df2.head()

Creating dummy (indicator) variables for the categorical columns so that they can be used as inputs to the supervised classification models

In [None]:
# Build indicator columns for each categorical predictor. drop_first=True
# omits the first level of each variable, leaving it as the implicit baseline.
gender = pd.get_dummies(df2["Gender"], drop_first=True)
hypertension = pd.get_dummies(df2["Hypertension"], prefix="HT", drop_first=True)
heartdisease = pd.get_dummies(df2["Heart Disease"], prefix="HD", drop_first=True)
evermarried = pd.get_dummies(df2["Ever Married"], prefix="EM", drop_first=True)
worktype = pd.get_dummies(df2["Work Type"], drop_first=True)

# Append the indicator columns, then remove the original categorical columns.
categorical_columns = ["Gender","Hypertension","Heart Disease","Ever Married","Work Type"]
indicator_frames = [gender, hypertension, heartdisease, evermarried, worktype]

df3 = pd.concat([df2, *indicator_frames], axis=1, join='outer', ignore_index=False)
df3 = df3.drop(columns=categorical_columns)

# Fix the column order, with the "Stroke" target last.
ordered_columns = [
    "Age",
    "Male",
    "HT_Yes",
    "HD_Yes",
    "EM_Yes",
    "Never_worked",
    "Private",
    "Self-employed",
    "children",
    "BMI",
    "Avg. Glucose Level",
    "Stroke",
]
df4 = df3.reindex(columns=ordered_columns)

df4.head(10)

In [None]:
# Report how many rows fall in each class of the "Stroke" target.
stroke_flags = df4["Stroke"]

print("Rows containing 0 = ", int((stroke_flags == 0).sum()))

print("Rows containing 1 = ", int((stroke_flags == 1).sum()))

Since far fewer people in the dataset had a stroke than did not, supervised learning models trained on it would be heavily skewed towards predicting "No Stroke". To prevent this and obtain a more useful prediction model, we downsample the data so that it contains an equal number of "Stroke" and "No Stroke" rows.

In [None]:
# Downsample to a balanced dataset by drawing the same number of rows from
# each class. The sample size is the size of the smaller class rather than a
# hard-coded count, so the cell keeps working if upstream filtering changes
# the class sizes.
stroke_rows = df4[df4["Stroke"]==1]
no_stroke_rows = df4[df4["Stroke"]==0]
n_per_class = min(len(stroke_rows), len(no_stroke_rows))

# A fixed random_state makes the drawn sample identical on every run.
balance1 = stroke_rows.sample(n=n_per_class, replace=False, random_state=42).reset_index(drop=True)
balance0 = no_stroke_rows.sample(n=n_per_class, replace=False, random_state=42).reset_index(drop=True)

# ignore_index=True gives the combined frame a unique 0..N-1 index instead of
# two overlapping 0..n-1 ranges.
df5 = pd.concat([balance1, balance0], axis=0, join='outer', ignore_index=True)

print("Rows containing 0 = ", len(df5[df5["Stroke"]==0]))

print("Rows containing 1 = ", len(df5[df5["Stroke"]==1]))

In [None]:
# Predictor columns for the models; the target is the "Stroke" column.
feature_columns = [
    "Age",
    "Male",
    "HT_Yes",
    "HD_Yes",
    "EM_Yes",
    "Never_worked",
    "Private",
    "Self-employed",
    "children",
    "BMI",
    "Avg. Glucose Level",
]

X = df5[feature_columns]

y = df5["Stroke"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


def generate_model_report(y_test, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_test
        True labels; passed unchanged to the scikit-learn metric functions.
    y_pred
        Predicted labels for the same samples, in the same order.

    Returns
    -------
    None
        The four scores are printed, not returned.
    """
    print("Accuracy = ", accuracy_score(y_test, y_pred))
    print("Precision = ", precision_score(y_test, y_pred))
    print("Recall = ", recall_score(y_test, y_pred))
    print("F1 = ", f1_score(y_test, y_pred))


# The features are not scaled, so the solver can need more than the default
# number of iterations to converge; a higher max_iter gives it room to finish.
clf = LogisticRegression(max_iter=1000)
clf = clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

generate_model_report(y_test, y_pred)

In [None]:
# Confusion matrix for the predictions currently held in y_pred, with the
# classes ordered as [0, 1].
confusion_matrix(y_test, y_pred, labels=[0,1])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and therefore its scores,
# identical on every run.
clf2 = RandomForestClassifier(random_state=42)
clf2 = clf2.fit(X_train,y_train)

y_pred = clf2.predict(X_test)

# generate_model_report is defined once, in the logistic-regression cell, and
# reused here rather than being redefined.
generate_model_report(y_test, y_pred)

In [None]:
# Confusion matrix for the predictions currently held in y_pred.
# labels=[0,1] pins the matrix to 2x2 with a fixed class order, even if one
# class happens to be absent from y_test or y_pred.
confusion_matrix(y_test, y_pred, labels=[0,1])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore its scores,
# identical on every run.
clf3 = DecisionTreeClassifier(random_state=42)
clf3 = clf3.fit(X_train,y_train)

y_pred = clf3.predict(X_test)

# generate_model_report is defined once, in the logistic-regression cell, and
# reused here rather than being redefined.
generate_model_report(y_test, y_pred)

In [None]:
# Confusion matrix for the predictions currently held in y_pred.
# labels=[0,1] pins the matrix to 2x2 with a fixed class order, even if one
# class happens to be absent from y_test or y_pred.
confusion_matrix(y_test, y_pred, labels=[0,1])

According to the results above, the best of the three models for predicting strokes is the Random Forest model, since it achieved the highest F1 score (0.72).