### Data Preprocessing


In [411]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [412]:
# read csv file
df = pd.read_csv("./Datasets/train_strokes.csv")


In [413]:
# display dataframe
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [414]:
# drop "id" column
df.drop(['id'], axis=1, inplace=True)

In [415]:
df
## key:
# gender: will use one hot encoder
## hypertension: 0 no hypertension, 1 yes hypertension
## heart_disease: 0 no heart diseases, 1 yes heart disease
## work_type: will use one hot encoder
## residence_type: will use one hot encoder
## avg_glucose_level: 
    ## use normal distribution
## smoking_status: will use one hot encoder
## Stroke:  0 = no stroke, 1 = stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [416]:
# see how many pt had a stroke
df['stroke'].value_counts()

0    42617
1      783
Name: stroke, dtype: int64

In [418]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21644
Residence_type,Urban,21756
ever_married,No,15462
ever_married,Yes,27938
gender,Other,11
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6562
smoking_status,formerly smoked,7493
smoking_status,never smoked,16053


### Drop 'Other' value in 'gender' column

In [419]:
# see how many "other" values are and how many had a stroke
df[(df['gender'] == 'Other') & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [420]:
# Drop 'other' gender - we do not know the premise of that selection 
# and no one with the gender 'other' had a stroke
df = df[df.gender != 'Other']
print(f"{df['gender'].value_counts()}")

Female    25665
Male      17724
Name: gender, dtype: int64


In [421]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'Never_worked', 'children': 'Never_worked'}})

In [422]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21638
Residence_type,Urban,21751
ever_married,No,15456
ever_married,Yes,27933
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6561
smoking_status,formerly smoked,7487
smoking_status,never smoked,16051
work_type,Govt_job,5438


### Replace null values in 'bmi' and 'smoking status' columns


In [423]:
for column in df:
    print(df[column].isnull().value_counts())

False    43389
Name: gender, dtype: int64
False    43389
Name: age, dtype: int64
False    43389
Name: hypertension, dtype: int64
False    43389
Name: heart_disease, dtype: int64
False    43389
Name: ever_married, dtype: int64
False    43389
Name: work_type, dtype: int64
False    43389
Name: Residence_type, dtype: int64
False    43389
Name: avg_glucose_level, dtype: int64
False    41931
True      1458
Name: bmi, dtype: int64
False    30099
True     13290
Name: smoking_status, dtype: int64
False    43389
Name: stroke, dtype: int64


In [424]:
# see how many nan in 'bmi' are attributing to strokes
#140 of 783
df[(df['bmi'].isna()) & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
81,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
407,Female,59.0,0,0,Yes,Private,Rural,76.15,,,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,,,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,,,1
1613,Male,58.0,0,0,Yes,Private,Rural,189.84,,,1
...,...,...,...,...,...,...,...,...,...,...,...
42530,Male,66.0,0,0,Yes,Self-employed,Urban,182.89,,never smoked,1
42839,Female,67.0,1,0,Yes,Govt_job,Urban,234.43,,never smoked,1
43007,Female,69.0,0,1,Yes,Self-employed,Rural,89.19,,smokes,1
43100,Male,67.0,0,0,Yes,Self-employed,Urban,136.79,,smokes,1


In [425]:
# check data type
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [426]:
df = df.fillna(0)

df[(df['bmi'] == 0) & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
81,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,0.0,never smoked,1
407,Female,59.0,0,0,Yes,Private,Rural,76.15,0.0,0,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,0.0,0,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,0.0,0,1
1613,Male,58.0,0,0,Yes,Private,Rural,189.84,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
42530,Male,66.0,0,0,Yes,Self-employed,Urban,182.89,0.0,never smoked,1
42839,Female,67.0,1,0,Yes,Govt_job,Urban,234.43,0.0,never smoked,1
43007,Female,69.0,0,1,Yes,Self-employed,Rural,89.19,0.0,smokes,1
43100,Male,67.0,0,0,Yes,Self-employed,Urban,136.79,0.0,smokes,1


In [427]:
#https://stackoverflow.com/questions/39690742/convert-float-to-int-and-leave-nulls
#df['b'] = df['b'].astype('Int64')

df['bmi'] = df['bmi'].values.astype(int)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,0,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17,0,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,0,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33,never smoked,0


In [428]:
# check to see how many 0's are in the 'bmi' column
df[(df['bmi'] == 0) & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
81,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,0,never smoked,1
407,Female,59.0,0,0,Yes,Private,Rural,76.15,0,0,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,0,0,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,0,0,1
1613,Male,58.0,0,0,Yes,Private,Rural,189.84,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
42530,Male,66.0,0,0,Yes,Self-employed,Urban,182.89,0,never smoked,1
42839,Female,67.0,1,0,Yes,Govt_job,Urban,234.43,0,never smoked,1
43007,Female,69.0,0,1,Yes,Self-employed,Rural,89.19,0,smokes,1
43100,Male,67.0,0,0,Yes,Self-employed,Urban,136.79,0,smokes,1


In [429]:
# check null values
# for i in df.columns:
#     x = df[i].isna().value_counts()
#     print("Column name is:",i,"and the amount of null values is:",x)

In [430]:
# Replace 'bmi' 0 values with mean of bmi
mean_bmi = np.round(df['bmi'].mean())
print(mean_bmi)

27.0


In [431]:
# Replace 'bmi' 0 values with mean of bmi
df['bmi'] = df['bmi'].replace(0, mean_bmi)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,0,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17,0,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,0,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33,never smoked,0


In [432]:
# All NaN werer replaced with 0, so we need to replace 0 in 'smoking_status' column to 'unknown'
df['smoking_status'] = df['smoking_status'].replace(to_replace=0, value="unknown") 

In [433]:
df[(df['smoking_status'] == 'unknown') & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
407,Female,59.0,0,0,Yes,Private,Rural,76.15,27,unknown,1
426,Female,78.0,0,0,Yes,Private,Urban,58.57,24,unknown,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,27,unknown,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,27,unknown,1
1315,Male,82.0,0,1,Yes,Private,Rural,208.30,32,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...
42110,Female,80.0,0,0,No,Private,Urban,222.87,27,unknown,1
42569,Male,60.0,0,0,Yes,Private,Urban,88.57,44,unknown,1
43051,Female,80.0,0,0,Yes,Self-employed,Rural,114.61,21,unknown,1
43130,Female,82.0,0,1,Yes,Self-employed,Urban,118.61,29,unknown,1


In [442]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21638
Residence_type,Urban,21751
ever_married,No,15456
ever_married,Yes,27933
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6561
smoking_status,formerly smoked,7487
smoking_status,unknown,13290
smoking_status,never smoked,16051


In [434]:
# check if all null values are gone 
for column in df:
    print(df[column].isnull().value_counts())

False    43389
Name: gender, dtype: int64
False    43389
Name: age, dtype: int64
False    43389
Name: hypertension, dtype: int64
False    43389
Name: heart_disease, dtype: int64
False    43389
Name: ever_married, dtype: int64
False    43389
Name: work_type, dtype: int64
False    43389
Name: Residence_type, dtype: int64
False    43389
Name: avg_glucose_level, dtype: int64
False    43389
Name: bmi, dtype: int64
False    43389
Name: smoking_status, dtype: int64
False    43389
Name: stroke, dtype: int64


### Normalize integer attributes: 'age',  'avg_glucose_level', 'bmi' 

In [435]:
#https://www.geeksforgeeks.org/data-normalization-with-pandas/

In [436]:
# add normalized 'age' column
df['age_normalized'] = (df['age'].min())/(df['age'].max()-df['age'].min())
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_normalized
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,unknown,0,0.000977
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0,0.000977
2,Female,8.0,0,0,No,Private,Urban,110.89,17,unknown,0,0.000977
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0,0.000977
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,unknown,0,0.000977
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20,never smoked,0,0.000977
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55,formerly smoked,0,0.000977
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28,formerly smoked,0,0.000977
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33,never smoked,0,0.000977


In [437]:
# add normalized 'avg_glucose_level' column
df['avg_glucose_level_normalized']=(df['avg_glucose_level']-df['avg_glucose_level'].min())/(df['avg_glucose_level'].max()-df['avg_glucose_level'].min())
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_normalized,avg_glucose_level_normalized
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,unknown,0,0.000977,0.169964
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0,0.000977,0.139631
2,Female,8.0,0,0,No,Private,Urban,110.89,17,unknown,0,0.000977,0.236772
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0,0.000977,0.059479
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,unknown,0,0.000977,0.450244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20,never smoked,0,0.000977,0.015420
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55,formerly smoked,0,0.000977,0.671934
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28,formerly smoked,0,0.000977,0.156492
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33,never smoked,0,0.000977,0.187079


In [438]:
# add normalized bmi
df['bmi_normalized']=(df['bmi']-df['bmi'].min())/(df['bmi'].max()-df['bmi'].min())
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_normalized,avg_glucose_level_normalized,bmi_normalized
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,unknown,0,0.000977,0.169964,0.091954
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0,0.000977,0.139631,0.333333
2,Female,8.0,0,0,No,Private,Urban,110.89,17,unknown,0,0.000977,0.236772,0.080460
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0,0.000977,0.059479,0.287356
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,unknown,0,0.000977,0.450244,0.103448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20,never smoked,0,0.000977,0.015420,0.114943
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55,formerly smoked,0,0.000977,0.671934,0.517241
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28,formerly smoked,0,0.000977,0.156492,0.206897
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33,never smoked,0,0.000977,0.187079,0.264368


In [440]:
# how many people did not have a stroke (0) vs how many people had a stroke (1)
df['stroke'].value_counts()

0    42606
1      783
Name: stroke, dtype: int64

In [443]:
# save csv

#df.to_csv('Datasets/train_stroke_data_cleaned.csv', index=False)

### Data Exploration