In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV into the DataFrame used by the cells below.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Figure: pie chart of smoking status (left) with the "smokes" wedge broken out
# into a stacked bar showing the stroke / no-stroke split among smokers (right).

# Category labels in order of first appearance, with "smokes" moved to the
# front so that it becomes the first (exploded) wedge of the pie.
labels = list(df.smoking_status.unique())
idx = labels.index("smokes")
labels = [labels[idx]] + labels[:idx] + labels[idx+1:]

# Fraction of rows (counted via non-null "id") per category, in `labels` order.
sizes = df.groupby(["smoking_status"]).count()["id"]
sizes = np.array([sizes[l] for l in labels])
sizes = sizes / sizes.sum()
print(labels, sizes)

# Pull out only the first wedge. Sized from `labels` so the pie still works
# if the number of smoking-status categories changes.
explode = [0.1] + [0.0] * (len(labels) - 1)

fig = plt.figure(figsize=(9, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
fig.subplots_adjust(wspace=0)

# pie chart parameters
# rotate so that first wedge is split by the x-axis
angle = -180 * sizes[0]
ax1.pie(sizes, autopct='%1.1f%%', startangle=angle,
        labels=labels, explode=explode)

# bar chart parameters
xpos = 0
bottom = 0
width = .2

# Stroke outcome counts among smokers; the index holds the stroke values so
# each bar segment can be labelled from the data rather than by position.
stroke_counts = df[df["smoking_status"] == "smokes"].groupby(["stroke"]).count()["id"]
ratios = np.array(stroke_counts)
ratios = ratios / ratios.sum()
stroke_names = {0: "No", 1: "Yes"}

print(ratios)
for stroke_value, height in zip(stroke_counts.index, ratios):
    ax2.bar(xpos, height, width, bottom=bottom,
            label=stroke_names.get(stroke_value, str(stroke_value)))
    ypos = bottom + height / 2
    bottom += height
    # One decimal place (same format as the pie) instead of a truncating %d,
    # so the two segments add up to 100%.
    ax2.text(xpos, ypos, "%.1f%%" % (height * 100), ha='center')

ax2.set_title('Stroke')
ax2.legend()
ax2.axis('off')
ax2.set_xlim(- 2.5 * width, 2.5 * width)

# use ConnectionPatch to draw lines between the two plots
# get the wedge data
wedge = ax1.patches[0]
theta1, theta2 = wedge.theta1, wedge.theta2
center, r = wedge.center, wedge.r
bar_height = sum([item.get_height() for item in ax2.patches])

# Top line joins the wedge's theta2 edge to the top of the bar; bottom line
# joins the theta1 edge to the bar's base.
for theta, bar_y in ((theta2, bar_height), (theta1, 0)):
    x = r * np.cos(np.pi / 180 * theta) + center[0]
    y = r * np.sin(np.pi / 180 * theta) + center[1]
    con = ConnectionPatch(xyA=(-width / 2, bar_y), coordsA=ax2.transData,
                          xyB=(x, y), coordsB=ax1.transData)
    con.set_color([0, 0, 0])
    con.set_linewidth(4)
    ax2.add_artist(con)

plt.show()

Roughly 5% of the people who smoke in this dataset have had a stroke. This does not mean that smoking poses little risk: it is also likely that many smokers die of other diseases (e.g. lung cancer).

In [None]:
# Per smoking status: number of stroke cases ("sum"), number of rows ("count"),
# and the stroke share expressed as a percentage ("proportion").
df1 = (
    df.groupby("smoking_status")["stroke"]
    .agg(["sum", "count"])
    .reset_index()
    .assign(proportion=lambda t: t["sum"] / t["count"] * 100)
)

In [None]:
# Bar chart of the stroke percentage for each smoking status.
ax = df1.set_index("smoking_status")[["proportion"]].plot.bar()
ax.set_ylabel("Percentage")

The graph above suggests that people who have smoked are more likely to have a stroke, which can lead to death.

In [None]:
# Ages split by stroke outcome: df1 = stroke cases, df2 = no stroke.
df1 = df.loc[df["stroke"] == 1, "age"]
df2 = df.loc[df["stroke"] == 0, "age"]

In [None]:
# Overlay the age distributions of stroke (df1) and non-stroke (df2) cases.
# density=True puts both groups on the same scale regardless of group size,
# and shared bin edges make the two histograms directly comparable.
age_bins = np.linspace(min(df1.min(), df2.min()), max(df1.max(), df2.max()), 51)
plt.hist(df2, bins=age_bins, density=True, alpha=0.6, label='no stroke')
plt.hist(df1, bins=age_bins, density=True, alpha=0.6, label='stroke')
plt.xlabel("Age")
plt.ylabel("Density")
plt.legend()
plt.show()

Older people are more likely to have a stroke than younger people.

In [None]:
# Number of stroke cases per marital status. Select the "stroke" column before
# aggregating so that only that column is summed, not every column in the
# frame (including the string-valued ones).
ax = df.groupby("ever_married")["stroke"].sum().plot(kind="bar")
ax.set_ylabel("Number of stroke cases")
plt.show()

It appears that people who have been married are more likely to get a stroke. But this may be because of the age distribution within each group. We check this hypothesis graphically below.

In [None]:
# Age distribution within each marital-status group.
sns.violinplot(data=df, x="ever_married", y="age")

As we can see, people who have been married have an average age of 55+, while people who have never been married have an average age of ~20.

In [None]:
# Stroke / no-stroke proportions for every (heart_disease, hypertension)
# combination, drawn as stacked bars with one panel per heart-disease status.
df1 = pd.crosstab([df.heart_disease, df.hypertension], df.stroke).reset_index()
df1.columns = ['heart_disease', 'hypertension', "no", "yes"]
df1["sum"] = df1[["no", "yes"]].sum(axis=1)
# Convert the raw counts to row-wise proportions so each stacked bar sums to 1.
df1["no"] = df1["no"] / df1["sum"]
df1["yes"] = df1["yes"] / df1["sum"]

fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# One panel per heart-disease flag; both panels are drawn by the same code.
panels = [(0, "No Heart disease"), (1, "Heart Disease")]
for ax, (flag, title) in zip(axs, panels):
    subset = df1[df1.heart_disease == flag]
    x = [str(i) for i in subset["hypertension"]]
    ax.bar(x, subset["no"], label="No Stroke")
    ax.bar(x, subset["yes"], bottom=subset["no"], label="Stroke")
    ax.set_xlabel("Hypertension")
    ax.set_title(title)

axs[0].set_ylabel("Proportion")
# A single legend on the right-hand panel serves both plots.
axs[1].legend()
plt.show()

As we can see, people with hypertension and heart disease are more likely to get a stroke than people who don't have hypertension or heart disease.