In [3]:
# Importing dependencies
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt

In [4]:
# Load the stroke dataset (healthcare-dataset-stroke-data.csv) into a DataFrame.
file_path = "healthcare-dataset-stroke-data.csv"
stroke_df = pd.read_csv(file_path)
# Report (rows, columns), then preview the first ten records.
print(stroke_df.shape)
stroke_df.head(n=10)

(5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [5]:
# The "id" column is only a record identifier, so build a copy without it.
stroke_all_df = stroke_df.drop(columns=["id"])
stroke_all_df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# Rename columns through an explicit old-name -> new-name mapping.
# Assigning a bare list to `.columns` relabels purely by position, so a change
# in the CSV's column order would silently attach the wrong label to the data.
column_renames = {
    "gender": "Gender",
    "age": "Age",
    "hypertension": "Hypertension",
    "heart_disease": "Heart_Disease",
    "ever_married": "Marriage_Status",
    "work_type": "Work_Type",
    "Residence_type": "Residence_Type",
    "avg_glucose_level": "Average_Glucose",
    "bmi": "BMI",
    "smoking_status": "Smoking_Status",
    "stroke": "Stroke",
}
# `column` holds the new names (same order as before); later cells index with it.
column = list(column_renames.values())
# rename() ignores keys that are absent, so re-running this cell is harmless.
stroke_all_df = stroke_all_df.rename(columns=column_renames)
# Fail loudly if any expected column is not present after the rename.
missing_columns = [name for name in column if name not in stroke_all_df.columns]
assert not missing_columns, f"columns missing after rename: {missing_columns}"
stroke_all_df.head()

Unnamed: 0,Gender,Age,Hypertension,Heart_Disease,Marriage_Status,Work_Type,Residence_Type,Average_Glucose,BMI,Smoking_Status,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
# Total number of missing (NaN) cells across the whole frame
stroke_all_df.isna().to_numpy().sum()

201

In [8]:
# Missing-value count broken down by column
stroke_all_df.loc[:, column].isna().sum()

Gender               0
Age                  0
Hypertension         0
Heart_Disease        0
Marriage_Status      0
Work_Type            0
Residence_Type       0
Average_Glucose      0
BMI                201
Smoking_Status       0
Stroke               0
dtype: int64

In [9]:
# Drop every row that contains a NaN (only BMI has missing values, per the count above).
# Note: this rebinds `stroke_df`, which previously held the raw loaded frame.
stroke_df = stroke_all_df.dropna(axis=0, how="any")


In [10]:
# Verify that no column of the cleaned stroke_df still contains NaN
stroke_df.loc[:, column].isna().sum()

Gender             0
Age                0
Hypertension       0
Heart_Disease      0
Marriage_Status    0
Work_Type          0
Residence_Type     0
Average_Glucose    0
BMI                0
Smoking_Status     0
Stroke             0
dtype: int64

In [11]:
# Confirm the (rows, columns) shape of stroke_df after the NaN rows were dropped
stroke_df.shape

(4909, 11)

In [12]:
# Inspect each column's dtype to see which are numeric and which are object-typed
stroke_df.dtypes

Gender              object
Age                float64
Hypertension         int64
Heart_Disease        int64
Marriage_Status     object
Work_Type           object
Residence_Type      object
Average_Glucose    float64
BMI                float64
Smoking_Status      object
Stroke               int64
dtype: object

In [13]:
# Persist the NaN-free frame as its own CSV; the row index is written as the first column.
# Note: this snapshot is taken before the "Other" gender row is filtered out.
stroke_df.to_csv("stroke_no_NaN.csv", index=True)

In [14]:
# Frequency of each category in the Gender column
stroke_df.loc[:, "Gender"].value_counts()

Female    2897
Male      2011
Other        1
Name: Gender, dtype: int64

In [15]:
# Keep only the rows whose Gender is not "Other"
stroke_df = stroke_df.loc[stroke_df["Gender"] != "Other"]

In [16]:
# Re-check the Gender value counts after removing the "Other" row
stroke_df.loc[:, "Gender"].value_counts()

Female    2897
Male      2011
Name: Gender, dtype: int64

In [17]:
# Frequency of each category in the Smoking_Status column
stroke_df.loc[:, "Smoking_Status"].value_counts()

never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: Smoking_Status, dtype: int64

In [35]:
# Preview the first five rows of the cleaned frame
stroke_df.head(n=5)

Unnamed: 0,Gender,Age,Hypertension,Heart_Disease,Marriage_Status,Work_Type,Residence_Type,Average_Glucose,BMI,Smoking_Status,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [36]:
# Pie Plot for Gender
# Pie Plot for Work Type
# Histogram for Age
# Pie Plot for Residence_Type
# Bar Plot for Smoking_Status
# Pie Plot for Marriage_Status
# Pie Plot for Hypertension
# Pie Plot for Heart_Disease
# Pie Plot for Stroke
# Categorize BMI to weight range and create a pie plot
# Categorize Average_Glucose into no diabetes, pre-diabetes and diabetes, and create a pie plot



In [None]:
# Line plot with Age on the X axis and Average_Glucose and BMI on the Y axis
# Bar Plot for Average Glucose and BMI (categorize Avg Glucose and BMI in ranges)