## Preview Data

In [30]:
# import libraries
import pandas as pd
import numpy as np
from path import Path

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/deprecation messages
# raised by later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [11]:
# Read the training CSV (path is relative to the current working directory)
# into a DataFrame, then preview its first five rows.
data = Path('Resources/train_strokes.csv')
df = pd.read_csv(data)
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [12]:
# Summarize the frame: row count, per-column non-null counts and dtypes,
# which makes columns with missing data easy to spot.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


#### There are 43,400 observations. The bmi and smoking_status columns have missing values. The attributes include demographic data, health records, and the stroke outcome.

In [13]:
# Count the missing values in each column.
df.isna().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [14]:
# Class balance of the target column: number of rows with stroke == 0
# versus stroke == 1.
df["stroke"].value_counts()

0    42617
1      783
Name: stroke, dtype: int64

#### The dataset is extremely unbalanced: only 783 patients (1.8%) suffered a stroke (stroke = 1), compared with 42,617 who did not (stroke = 0).

## Data Cleaning

In [15]:
# The id column is not needed for the analysis, so remove it and preview
# the remaining columns.
df = df.drop(columns=["id"])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


## BMI attribute

In [16]:
# Show the stroke patients (stroke == 1) whose bmi is missing, to see how many
# positive cases would be lost if rows with NaN bmi were dropped.
# The comparison is parenthesized because `&` binds tighter than `==`: without
# the parentheses the mask would be evaluated as (bmi_is_null & stroke) == 1.
df[df["bmi"].isnull() & (df["stroke"] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
81,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
407,Female,59.0,0,0,Yes,Private,Rural,76.15,,,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,,,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,,,1
1613,Male,58.0,0,0,Yes,Private,Rural,189.84,,,1
...,...,...,...,...,...,...,...,...,...,...,...
42530,Male,66.0,0,0,Yes,Self-employed,Urban,182.89,,never smoked,1
42839,Female,67.0,1,0,Yes,Govt_job,Urban,234.43,,never smoked,1
43007,Female,69.0,0,1,Yes,Self-employed,Rural,89.19,,smokes,1
43100,Male,67.0,0,0,Yes,Self-employed,Urban,136.79,,smokes,1


#### There are 140 stroke patients with missing (NaN) bmi values, which is about 17.9% of the 783 stroke patients. Because the dataset is already highly unbalanced, these 140 records are valuable and should not be dropped. Instead, the missing bmi values are replaced with the mean bmi.

In [17]:
# Impute missing bmi values with the column mean, rounded to one decimal.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['bmi']: that chained in-place form operates on an
# intermediate Series, raises a FutureWarning in pandas 2.2+, and leaves df
# unchanged once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(np.round(df['bmi'].mean(), 1))

In [18]:
# Preview the first rows after the bmi fill.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [19]:
# Sanity check: evaluates to True if any bmi value is still missing.
df["bmi"].isna().values.any()

False

## Smoking status attribute 

In [20]:
# Show the stroke patients (stroke == 1) whose smoking_status is missing.
# The comparison is parenthesized because `&` binds tighter than `==`: without
# the parentheses the mask would be evaluated as (status_is_null & stroke) == 1.
df[df["smoking_status"].isnull() & (df["stroke"] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
407,Female,59.0,0,0,Yes,Private,Rural,76.15,28.6,,1
426,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,,1
747,Male,78.0,0,1,Yes,Private,Urban,219.84,28.6,,1
1139,Male,57.0,0,1,No,Govt_job,Urban,217.08,28.6,,1
1315,Male,82.0,0,1,Yes,Private,Rural,208.30,32.5,,1
...,...,...,...,...,...,...,...,...,...,...,...
42110,Female,80.0,0,0,No,Private,Urban,222.87,28.6,,1
42569,Male,60.0,0,0,Yes,Private,Urban,88.57,44.6,,1
43051,Female,80.0,0,0,Yes,Self-employed,Rural,114.61,21.4,,1
43130,Female,82.0,0,1,Yes,Self-employed,Urban,118.61,29.4,,1


In [21]:
# Re-check non-null counts and dtypes now that id is dropped and bmi is filled;
# smoking_status is the only column expected to still have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             43400 non-null  object 
 1   age                43400 non-null  float64
 2   hypertension       43400 non-null  int64  
 3   heart_disease      43400 non-null  int64  
 4   ever_married       43400 non-null  object 
 5   work_type          43400 non-null  object 
 6   Residence_type     43400 non-null  object 
 7   avg_glucose_level  43400 non-null  float64
 8   bmi                43400 non-null  float64
 9   smoking_status     30108 non-null  object 
 10  stroke             43400 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 3.6+ MB


#### 13,292 records (30.6% of the dataset) have missing values in the smoking_status column, which is a considerable share of the data. Instead of dropping these records, a new category named "unknown" will be created for them.

In [22]:
# Label every missing smoking_status as the new category 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df["smoking_status"]: that chained in-place form
# operates on an intermediate Series, raises a FutureWarning in pandas 2.2+,
# and leaves df unchanged once Copy-on-Write is enabled.
df["smoking_status"] = df["smoking_status"].fillna("unknown")

In [23]:
# Print how many rows fall into each smoking_status category, including the
# newly added 'unknown' label.
print(df['smoking_status'].value_counts())

never smoked       16053
unknown            13292
formerly smoked     7493
smokes              6562
Name: smoking_status, dtype: int64


In [24]:
# Inspect the column dtypes before converting them.
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [25]:
# Convert the text (object) columns to pandas' category dtype in one call by
# passing astype a column -> dtype mapping.
df = df.astype(dict.fromkeys(
    ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"],
    "category",
))

In [26]:
# Cast age from float64 to an integer dtype.
# NOTE(review): astype("int") truncates toward zero, so any fractional ages
# would lose their decimal part — confirm that is intended.
df["age"] = df["age"].astype("int")

In [27]:
# Confirm the dtype conversions took effect.
df.dtypes

gender               category
age                     int64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object

In [28]:
# Write the cleaned frame to a CSV in the current working directory, without
# the row index.
# NOTE(review): CSV stores no dtype information, so the category columns will
# presumably load back as plain strings unless dtypes are given on read.
df.to_csv('cleaned_stroke_dataset.csv', index=False)

In [29]:
# Display the final cleaned DataFrame (the notebook repr shows only the first
# and last rows).
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3,0,0,No,children,Rural,95.12,18.0,unknown,0
1,Male,58,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8,0,0,No,Private,Urban,110.89,17.6,unknown,0
3,Female,70,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14,0,0,No,Never_worked,Rural,161.28,19.1,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0
