### Data Exploration

In [61]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [62]:
# read csv file
df = pd.read_csv("./Datasets/train_strokes.csv")


In [63]:
# display dataframe
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [64]:
# drop "id" column
df.drop(['id'], axis=1, inplace=True)

In [65]:
df
## key: 
## gender: 'Male':0, 'Female':1, 'Other':2
## hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
## heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
## ever_married: 0 no, 1 yes
## work_type: 'Private':4, 'Self-employed':3, 'Govt_job':2, 'children':1, 'Never_worked':0
    # one hot encoder
## residence_type: 'Urban': 0, 'Rural':1
## avg_glucose_level: '<70':0, '70-100':1, '101-125':2, '>126':3
    ## from diabetes.org
    ## don't bin, use normal distribution
## smoking_status: 'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4
    #one hot encoder
## Stroke:  0 = no stroke, 1 = stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [66]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21644
Residence_type,Urban,21756
ever_married,No,15462
ever_married,Yes,27938
gender,Other,11
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6562
smoking_status,formerly smoked,7493
smoking_status,never smoked,16053


In [67]:
# drop 'other' gender - we do not know the premise of that selection
df = df[df.gender != 'Other']
print(f"{df['gender'].value_counts()}")

Female    25665
Male      17724
Name: gender, dtype: int64


In [80]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'Never_worked', 'children': 'Never_worked'}})

In [81]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21638
Residence_type,Urban,21751
ever_married,No,15456
ever_married,Yes,27933
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6561
smoking_status,formerly smoked,7487
smoking_status,never smoked,16051
work_type,Govt_job,5438


In [82]:
# check null values
for i in df.columns:
    x = df[i].isna().value_counts()
    print("Column name is:",i,"and the amount of null values is:",x)

Column name is: gender and the amount of null values is: False    43389
Name: gender, dtype: int64
Column name is: age and the amount of null values is: False    43389
Name: age, dtype: int64
Column name is: hypertension and the amount of null values is: False    43389
Name: hypertension, dtype: int64
Column name is: heart_disease and the amount of null values is: False    43389
Name: heart_disease, dtype: int64
Column name is: ever_married and the amount of null values is: False    43389
Name: ever_married, dtype: int64
Column name is: work_type and the amount of null values is: False    43389
Name: work_type, dtype: int64
Column name is: Residence_type and the amount of null values is: False    43389
Name: Residence_type, dtype: int64
Column name is: avg_glucose_level and the amount of null values is: False    43389
Name: avg_glucose_level, dtype: int64
Column name is: bmi and the amount of null values is: False    41931
True      1458
Name: bmi, dtype: int64
Column name is: smoking_

In [69]:
# check data type
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [77]:
count_nan = df[column].isnotnull().value_counts()

print ('Count of NaN: ' + str(count_nan))

AttributeError: 'Series' object has no attribute 'isnotnull'

In [74]:
for column in df:
    print (f"{column}:" +'Count of NaN: ' + str(count_nan))

gender:Count of NaN: 0
age:Count of NaN: 0
hypertension:Count of NaN: 0
heart_disease:Count of NaN: 0
ever_married:Count of NaN: 0
work_type:Count of NaN: 0
Residence_type:Count of NaN: 0
avg_glucose_level:Count of NaN: 0
bmi:Count of NaN: 0
smoking_status:Count of NaN: 0
stroke:Count of NaN: 0


In [39]:
for column in df:
    print(df[column].isnull().value_counts())

False    43400
Name: gender, dtype: int64
False    43400
Name: age, dtype: int64
False    43400
Name: hypertension, dtype: int64
False    43400
Name: heart_disease, dtype: int64
False    43400
Name: ever_married, dtype: int64
False    43400
Name: work_type, dtype: int64
False    43400
Name: Residence_type, dtype: int64
False    43400
Name: avg_glucose_level, dtype: int64
False    41938
True      1462
Name: bmi, dtype: int64
False    30108
True     13292
Name: smoking_status, dtype: int64
False    43400
Name: stroke, dtype: int64


In [22]:
# check if there are null values
df['bmi'].isnull().values.any()

True

In [72]:
df['smoking_status'].isnull().values.any()

True

In [26]:
# drop nan values
df = df.dropna(axis=0)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
6,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
7,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
8,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [43]:
for column in df:
    print(df[column].isnull().value_counts())

False    43400
Name: gender, dtype: int64
False    43400
Name: age, dtype: int64
False    43400
Name: hypertension, dtype: int64
False    43400
Name: heart_disease, dtype: int64
False    43400
Name: ever_married, dtype: int64
False    43400
Name: work_type, dtype: int64
False    43400
Name: Residence_type, dtype: int64
False    43400
Name: avg_glucose_level, dtype: int64
False    41938
True      1462
Name: bmi, dtype: int64
False    30108
True     13292
Name: smoking_status, dtype: int64
False    43400
Name: stroke, dtype: int64


In [78]:
# check unique values
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [79]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'no_work', 'children': 'no_work'}})


In [80]:
# check unique values

df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'no_work'], dtype=object)

In [81]:
# check unique values
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [82]:
# check unique values
df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [83]:
# check unique values
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [84]:
# check unique values

df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [85]:
# make dicts
work_type_dict = {'Private':3, 'Self-employed':2, 'Govt_job':1, 'no_work':0}
smoke_dict = {'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4}
ever_married_dict = {'Yes':1, 'No':0}
resi_type_dict = {'Urban': 0, 'Rural':1}
gender_dict = {'Male':0, 'Female':1, 'Other':2}

In [86]:
# map series

df['work_type'] = df['work_type'].map(work_type_dict)
df['smoking_status'] = df['smoking_status'].map(smoke_dict)
df['ever_married'] = df['ever_married'].map(ever_married_dict)
df['Residence_type'] = df['Residence_type'].map(resi_type_dict)
df['gender'] = df['gender'].map(gender_dict)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,228.69,36.6,2,1
2,0,80.0,0,1,1,3,1,105.92,32.5,0,1
3,1,49.0,0,0,1,3,0,171.23,34.4,1,1
4,1,79.0,1,0,1,2,1,174.12,24.0,0,1
5,0,81.0,0,0,1,3,0,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,103.08,18.6,4,0
5106,1,81.0,0,0,1,2,0,125.20,40.0,0,0
5107,1,35.0,0,0,1,2,1,82.99,30.6,0,0
5108,0,51.0,0,0,1,3,1,166.29,25.6,2,0


In [87]:
# bin avg_glucose_level
avg_glucose_lvl = df['avg_glucose_level']


In [88]:
glucose_lvl_bins = [0, 69, 100, 126, 272]
glucose_lvl_labels = ['<70','70-100','101-125','>126']
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,228.69,36.6,2,1
2,0,80.0,0,1,1,3,1,105.92,32.5,0,1
3,1,49.0,0,0,1,3,0,171.23,34.4,1,1
4,1,79.0,1,0,1,2,1,174.12,24.0,0,1
5,0,81.0,0,0,1,3,0,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,103.08,18.6,4,0
5106,1,81.0,0,0,1,2,0,125.20,40.0,0,0
5107,1,35.0,0,0,1,2,1,82.99,30.6,0,0
5108,0,51.0,0,0,1,3,1,166.29,25.6,2,0


In [89]:
df['avg_glucose_level'] = pd.cut(avg_glucose_lvl, bins=glucose_lvl_bins, labels=glucose_lvl_labels)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,>126,36.6,2,1
2,0,80.0,0,1,1,3,1,101-125,32.5,0,1
3,1,49.0,0,0,1,3,0,>126,34.4,1,1
4,1,79.0,1,0,1,2,1,>126,24.0,0,1
5,0,81.0,0,0,1,3,0,>126,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,101-125,18.6,4,0
5106,1,81.0,0,0,1,2,0,101-125,40.0,0,0
5107,1,35.0,0,0,1,2,1,70-100,30.6,0,0
5108,0,51.0,0,0,1,3,1,>126,25.6,2,0


In [90]:
glucose_lvl_dict = {'<70':0,'70-100':1,'101-125':2,'>126':3}
df['avg_glucose_level'] = df['avg_glucose_level'].map(glucose_lvl_dict)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,3,36.6,2,1
2,0,80.0,0,1,1,3,1,2,32.5,0,1
3,1,49.0,0,0,1,3,0,3,34.4,1,1
4,1,79.0,1,0,1,2,1,3,24.0,0,1
5,0,81.0,0,0,1,3,0,3,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,2,18.6,4,0
5106,1,81.0,0,0,1,2,0,2,40.0,0,0
5107,1,35.0,0,0,1,2,1,1,30.6,0,0
5108,0,51.0,0,0,1,3,1,3,25.6,2,0


In [91]:
df.dtypes


gender                  int64
age                   float64
hypertension            int64
heart_disease           int64
ever_married            int64
work_type               int64
Residence_type          int64
avg_glucose_level    category
bmi                   float64
smoking_status          int64
stroke                  int64
dtype: object

In [92]:
# how many people did not have a stroke (0) vs how many people had a stroke (1)
df['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [33]:
# save csv

#df.to_csv('stroke_data_cleaned.csv', index=False)