In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Read the stroke dataset from disk; `df` holds the frame exactly as loaded.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# `data` is an independent copy of `df` used by the analysis cells.
data = df.copy()

# Show the column labels as the cell output.
data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [3]:
# Emit the shape line and the section header in a single call; the "\n"
# separator puts each of them on its own line.
print(f"Dataset shape: {data.shape}", "\n Dataset Information: ", sep="\n")

# Per-column dtype and non-null count summary, written to stdout by pandas.
data.info()

Dataset shape: (5110, 12)

 Dataset Information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
# Summary statistics of the numeric columns, transposed so that each column
# of `data` becomes one row of the printed table.
print("\n Statistics of Data")
print(data.describe().transpose())


 Statistics of Data
                    count          mean           std    min        25%  \
id                 5110.0  36517.829354  21161.721625  67.00  17741.250   
age                5110.0     43.226614     22.612647   0.08     25.000   
hypertension       5110.0      0.097456      0.296607   0.00      0.000   
heart_disease      5110.0      0.054012      0.226063   0.00      0.000   
avg_glucose_level  5110.0    106.147677     45.283560  55.12     77.245   
bmi                4909.0     28.893237      7.854067  10.30     23.500   
stroke             5110.0      0.048728      0.215320   0.00      0.000   

                         50%       75%       max  
id                 36932.000  54682.00  72940.00  
age                   45.000     61.00     82.00  
hypertension           0.000      0.00      1.00  
heart_disease          0.000      0.00      1.00  
avg_glucose_level     91.885    114.09    271.74  
bmi                   28.100     33.10     97.60  
stroke                 0.000      0.00      1.00  

In [None]:
# Distribution of age by stroke status.
# Plotly Express is imported here and reused as `px` by the cells below.
import plotly.express as px

# barmode='overlay' draws the histograms of the two stroke groups on top of
# each other instead of stacking them.
fig = px.histogram(
    data,
    x='age',
    color='stroke',
    nbins=30,
    barmode='overlay',
    title='Age Distribution by Stroke Status',
    labels={'stroke': 'Stroke'},
)
fig.show()

In [9]:
# Proportion of strokes for each gender: the mean of the stroke column within
# each gender group, kept as a two-column frame (gender, stroke).
stroke_by_gender = data.groupby('gender', as_index=False)['stroke'].mean()

fig = px.bar(
    stroke_by_gender,
    x='gender',
    y='stroke',
    title='Stroke Rate by Gender',
    labels={'stroke': 'Stroke Rate'},
)
fig.show()


In [12]:
# Scatter plot of average glucose level against age, highlighting stroke cases.
#
# `stroke` is stored as an integer column, and Plotly Express maps a numeric
# `color` column onto a continuous colour scale. Casting it to strings on a
# plotting copy gives two discrete legend groups instead of a colour bar.
# `data` itself is not modified.
scatter_data = data.assign(stroke=data['stroke'].astype(str))

fig = px.scatter(
    scatter_data,
    x='age',
    y='avg_glucose_level',
    color='stroke',
    category_orders={'stroke': ['0', '1']},  # fixed legend order: no-stroke first
    labels={'stroke': 'Stroke'},
    title='Average Glucose Level vs. Age by Stroke Status',
)
fig.show()


In [13]:
# Which work types have higher proportions of stroke.
# Computed from the working copy `data`, like the other per-category rates in
# this notebook, rather than from the raw frame `df`.
stroke_by_work = data.groupby('work_type')['stroke'].mean().reset_index()

fig = px.bar(
    stroke_by_work,
    x='work_type',
    y='stroke',
    labels={'stroke': 'Stroke Rate'},
    title='Stroke Rate by Work Type',
)
fig.show()

In [15]:
# BMI distribution by stroke status.
# points='all' draws every individual observation alongside its box.
fig = px.box(
    data,
    x='stroke',
    y='bmi',
    points='all',
    title='BMI Distribution by Stroke Status',
    labels={'stroke': 'Stroke', 'bmi': 'BMI'},
)
fig.show()


In [17]:
# Stroke rate by smoking status: mean of the stroke column per smoking group,
# kept as a two-column frame (smoking_status, stroke).
stroke_by_smoking = data.groupby('smoking_status', as_index=False)['stroke'].mean()

fig = px.bar(
    stroke_by_smoking,
    x='smoking_status',
    y='stroke',
    title='Stroke Rate by Smoking Status',
    labels={'stroke': 'Stroke Rate'},
)
fig.show()

In [18]:
# Heatmap: hypertension & heart disease vs. stroke.
import plotly.graph_objects as go

# Mean stroke value for every (hypertension, heart_disease) combination,
# as a long-format frame with one row per combination.
heatmap_data = data.groupby(
    ['hypertension', 'heart_disease'], as_index=False
)['stroke'].mean()

# Reshape to a grid: hypertension on the rows, heart_disease on the columns.
heatmap_matrix = heatmap_data.pivot(
    index='hypertension',
    columns='heart_disease',
    values='stroke',
)

# The axis labels are hard-coded and assume rows/columns are ordered 0 then 1.
stroke_rate_trace = go.Heatmap(
    z=heatmap_matrix.values,
    x=['No Heart Disease', 'Heart Disease'],
    y=['No Hypertension', 'Hypertension'],
    colorscale='Reds',
    colorbar=dict(title='Stroke Rate'),
)

fig = go.Figure(data=stroke_rate_trace)
fig.update_layout(title='Stroke Rate by Hypertension and Heart Disease')
fig.show()


In [22]:

# Histogram: stroke distribution by residence type and gender.

# Fixed colour per gender value so the two facets share the same legend colours.
gender_palette = {
    'Male': '#1f77b4',    # blue
    'Female': '#e377c2',  # pink
    'Other': '#2ca02c',   # green
}

# One facet per stroke value (0 first, then 1); bars are grouped by gender
# within each residence type.
fig = px.histogram(
    data,
    x='Residence_type',
    color='gender',
    facet_col='stroke',
    barmode='group',
    category_orders={'stroke': [0, 1]},
    color_discrete_map=gender_palette,
    labels={'stroke': 'Stroke'},
    title='Stroke Distribution by Residence Type and Gender',
)
fig.show()


In [24]:
# Violin plot: age distribution by stroke and hypertension.
# box=True draws a box plot inside each violin; points='all' also shows every
# individual observation.
fig = px.violin(
    data,
    x='stroke',
    y='age',
    color='hypertension',
    points='all',
    box=True,
    title='Age Distribution by Stroke and Hypertension',
)
fig.show()


In [25]:
# Pie chart: marital status among stroke cases.
# Filter and select on the same frame (`data`) instead of indexing `df` with
# a mask built from `data`.
stroke_cases_married = data.loc[data['stroke'] == 1, 'ever_married']

# Name the category axis and the count column explicitly, so the result has
# the columns ['ever_married', 'count'] regardless of how the installed pandas
# version labels the output of value_counts().
married_counts = (
    stroke_cases_married
    .value_counts()
    .rename_axis('ever_married')
    .reset_index(name='count')
)

fig = px.pie(
    married_counts,
    names='ever_married',
    values='count',
    title='Proportion of Stroke Cases by Marital Status',
)
fig.show()


In [27]:
# Box plot: glucose level by stroke and smoking status.
# Plotted from the working copy `data`, like the other figures in this
# notebook, rather than from the raw frame `df`.
fig = px.box(
    data,
    x='smoking_status',
    y='avg_glucose_level',
    color='stroke',
    title='Glucose Level by Stroke and Smoking Status',
)
fig.show()


In [28]:
# Bar: stroke rate by age group.

# pd.cut uses right-inclusive intervals by default, so the bands are
# (0, 30], (30, 45], (45, 60], (60, 75] and (75, 100].
age_bins = [0, 30, 45, 60, 75, 100]
age_labels = ['0-30', '31-45', '46-60', '61-75', '76+']

# NOTE(review): this adds the column to the raw frame `df`, not to the working
# copy `data`; kept as-is in case other cells read `df['age_group']`.
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# `age_group` is categorical. Passing `observed` explicitly keeps one row per
# band (including any empty band) and avoids the pandas FutureWarning about
# the changing default for categorical groupers.
age_group_stroke = (
    df.groupby('age_group', observed=False)['stroke']
    .mean()
    .reset_index()
)

fig = px.bar(
    age_group_stroke,
    x='age_group',
    y='stroke',
    title='Stroke Rate by Age Group',
    labels={'stroke': 'Stroke Rate'},
)
fig.show()




