# Data Collection

In [49]:
import os
import pandas as pd
df= pd.read_csv('Downloads/healthcare-dataset-stroke-data.csv')

# Data Definition

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [7]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Check value counts for the categorical features:

In [10]:
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [11]:
df['hypertension'].value_counts()

0    4612
1     498
Name: hypertension, dtype: int64

In [12]:
df['heart_disease'].value_counts()

0    4834
1     276
Name: heart_disease, dtype: int64

In [13]:
df['ever_married'].value_counts()

Yes    3353
No     1757
Name: ever_married, dtype: int64

In [14]:
df['work_type'].value_counts()

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [15]:
df['Residence_type'].value_counts()

Urban    2596
Rural    2514
Name: Residence_type, dtype: int64

In [16]:
df['smoking_status'].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [17]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

This dataset has 3 features with a float data type. Let's take a look at their distributions...

In [18]:
df['age'].describe()

count    5110.000000
mean       43.226614
std        22.612647
min         0.080000
25%        25.000000
50%        45.000000
75%        61.000000
max        82.000000
Name: age, dtype: float64

In [19]:
df['avg_glucose_level'].describe()

count    5110.000000
mean      106.147677
std        45.283560
min        55.120000
25%        77.245000
50%        91.885000
75%       114.090000
max       271.740000
Name: avg_glucose_level, dtype: float64

In [20]:
df['bmi'].describe()

count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64

# Data Cleaning

In the Data Definition step, I determined via the .info() method that there are null values in, and only in, the bmi column. This may give rise to complications in the pre-processing and modeling steps, but a missing bmi value in and of itself does not lower the quality of a row to an extent that would require me to drop the entry.

In [37]:
df['id'].duplicated().any()

False

id is the only column where we don't want any duplicates, so this is good.

The maximum and the minimum values for the bmi column appear to be out of the range of plausibility. It may be useful to filter out some values at the top and bottom ends of the distribution. The NHSN has defined BMIs below 12 or above 60 as outliers.

In [25]:
bmisorted = df.sort_values(by='bmi')

In [33]:
bmisorted[:20]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1609,38043,Female,1.24,0,0,No,children,Rural,122.04,10.3,Unknown,0
3307,3205,Female,79.0,0,0,Yes,Self-employed,Urban,79.03,11.3,Unknown,0
2187,59993,Male,40.0,0,0,Yes,Private,Rural,60.96,11.5,never smoked,0
657,20364,Female,4.0,0,0,No,children,Urban,107.25,12.0,Unknown,0
922,45893,Female,8.0,0,0,No,children,Urban,106.51,12.3,Unknown,0
3319,53924,Female,1.08,0,0,No,children,Urban,159.39,12.8,Unknown,0
3968,41500,Male,0.16,0,0,No,children,Rural,69.79,13.0,Unknown,0
3619,52859,Female,4.0,0,0,No,children,Urban,61.54,13.2,Unknown,0
4694,31113,Female,1.16,0,0,No,children,Urban,86.0,13.3,Unknown,0
1701,4789,Male,8.0,0,0,No,children,Rural,91.54,13.4,Unknown,0


Interestingly enough, none of the individuals with a bottom 20 bmi suffered a stroke!

In [31]:
bmisorted_desc = df.sort_values(by='bmi', ascending=False)

In [34]:
bmisorted_desc[:20]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2128,56420,Male,17.0,1,0,No,Private,Rural,61.67,97.6,Unknown,0
4209,51856,Male,38.0,1,0,Yes,Private,Rural,56.9,92.0,never smoked,0
928,41097,Female,23.0,1,0,No,Private,Urban,70.03,78.0,smokes,0
544,545,Male,42.0,0,0,Yes,Private,Rural,210.48,71.9,never smoked,0
1559,37759,Female,53.0,0,0,Yes,Private,Rural,72.63,66.8,Unknown,0
358,66333,Male,52.0,0,0,Yes,Self-employed,Urban,78.4,64.8,never smoked,0
4188,70670,Female,27.0,0,0,Yes,Private,Rural,57.96,64.4,never smoked,0
2764,20292,Female,24.0,0,0,Yes,Private,Urban,85.55,63.3,never smoked,0
3825,72784,Female,52.0,0,0,Yes,Private,Rural,118.46,61.6,smokes,0
2840,65895,Female,52.0,0,0,Yes,Private,Urban,98.27,61.2,Unknown,0


The same can be said for the individuals with a top 20 bmi! None of them suffered a stroke! We should be very skeptical of these entries. I am going to follow the NIH's guidance by removing the entries that have a bmi below 12 and above 60, but it's fair to wonder whether it would be wise to go beyond that.

In [44]:
df.drop(df[df['bmi'] < 12].index)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [46]:
df.drop(df[df['bmi'] > 60].index)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [47]:
datapath = '../data'
save_file(df, 'df_cleaned.csv', datapath)

NameError: name 'save_file' is not defined