In [354]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [355]:
#Read CSV file
df_brain= pd.read_csv("brain_stroke.csv")

In [356]:
df_brain.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [357]:
df_brain['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [358]:
df_brain['hypertension'].unique()

array([0, 1])

In [359]:
df_brain['heart_disease'].unique()

array([1, 0])

In [360]:
df_brain['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [361]:
df_brain['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [362]:
df_brain['stroke'].unique()

array([1, 0])

# Data Transformation

# 1. Dropping Columns/Features

Dropping column Id as this a unique number given to each record (can be patient id or random generated number or serial number). It has no significance in decision making.

In [363]:
df_brain = df_brain.drop('id',axis=1)

# 2. Replacing values in a cloumn

Changing work type 'Children' as 'Student' as Children is not work type and Changing formatting for other work types.
Changing binary values to string values of Yes and No instead of 1 and 0 respectively for  hypertension and heart_disease.

In [364]:
df_brain['work_type'] = df_brain['work_type'].replace(['children'], 'Student')
df_brain['work_type'] = df_brain['work_type'].replace(['Self-employed'], 'Self Employed')
df_brain['work_type'] = df_brain['work_type'].replace(['Govt_job'], 'Government')

In [365]:
df_brain['work_type'].unique()

array(['Private', 'Self Employed', 'Government', 'Student',
       'Never_worked'], dtype=object)

In [366]:
df_brain['hypertension'] = df_brain['hypertension'].replace([1], 'Yes')
df_brain['hypertension'] = df_brain['hypertension'].replace([0], 'No')

df_brain['heart_disease'] = df_brain['heart_disease'].replace([1], 'Yes')
df_brain['heart_disease'] = df_brain['heart_disease'].replace([0], 'No')


df_brain['stroke'] = df_brain['heart_disease'].replace([1], 'Yes')
df_brain['stroke'] = df_brain['heart_disease'].replace([0], 'No')

In [367]:
df_brain['hypertension'].unique()

array(['No', 'Yes'], dtype=object)

In [368]:
df_brain['heart_disease'].unique()

array(['Yes', 'No'], dtype=object)

In [369]:
df_brain['stroke'].unique()

array(['Yes', 'No'], dtype=object)

In [370]:
df_brain['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

# 3. Renaming Column Name

Changing column name stroke to brain_stroke as this is more meaningful

In [371]:
df_brain = df_brain.rename(columns={'stroke': 'brain_stroke'})

In [372]:
df_brain.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,brain_stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self Employed,Rural,202.21,,never smoked,No
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,No
4,Female,79.0,Yes,No,Yes,Self Employed,Rural,174.12,24.0,never smoked,No


In [373]:
df_brain.dtypes

gender                object
age                  float64
hypertension          object
heart_disease         object
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
brain_stroke          object
dtype: object

# 4. Transform features

As per BMI chart on National Heart, Lung, and Blood Institute website (https://www.nhlbi.nih.gov/health/educational/lose_wt/BMI/bmi_tbl2.htm), the maximum BMI is 54. So, any value above 54 is unrelaistic. So, replace values over 54 with 54 considering the person is obese.

In [374]:
np.max(df_brain['bmi'])

97.6

In [375]:
max_bmi = 54.0
df_brain['bmi'][df_brain['bmi']>max_bmi] = max_bmi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brain['bmi'][df_brain['bmi']>max_bmi] = max_bmi


In [376]:
np.max(df_brain['bmi'])

54.0

# 5. Engineer new useful features.

Adding new column stage_of_life based on the age as we can decide which age group has an impact

In [377]:
df_brain['stage_of_life'] = np.where(df_brain.age <= 2, 'Infant', 
                                 np.where(df_brain.age < 5, 'Toddler', 
                                 np.where(df_brain.age < 13, 'Child',
                                 np.where(df_brain.age < 20, 'Teen',
                                 np.where(df_brain.age < 40, 'Adult',
                                 np.where(df_brain.age < 60, 'Middle Age Adult ','Senior Adult'))))))

In [378]:
df_brain.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,brain_stroke,stage_of_life
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes,Senior Adult
1,Female,61.0,No,No,Yes,Self Employed,Rural,202.21,,never smoked,No,Senior Adult
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes,Senior Adult
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,No,Middle Age Adult
4,Female,79.0,Yes,No,Yes,Self Employed,Rural,174.12,24.0,never smoked,No,Senior Adult
5,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,No,Senior Adult
6,Male,74.0,Yes,Yes,Yes,Private,Rural,70.09,27.4,never smoked,Yes,Senior Adult
7,Female,69.0,No,No,No,Private,Urban,94.39,22.8,never smoked,No,Senior Adult
8,Female,59.0,No,No,Yes,Private,Rural,76.15,,Unknown,No,Middle Age Adult
9,Female,78.0,No,No,Yes,Private,Urban,58.57,24.2,Unknown,No,Senior Adult


I am adding one more column 'weight_status' as bmi might have different values and difficult guage details on that. So, weight_status will categorize them into 3 different categories.Instead of bmi, I feel weight_status will be best suited for analysis.

In [379]:
df_brain['weight_status'] = np.where(df_brain.bmi < 18.5, 'Underweight', 
                                 np.where(df_brain.bmi < 25.0, 'Healthy weight', 
                                 np.where(df_brain.bmi < 30, 'Overweight','Obesity')))

In [380]:
df_brain.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,brain_stroke,stage_of_life,weight_status
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes,Senior Adult,Obesity
1,Female,61.0,No,No,Yes,Self Employed,Rural,202.21,,never smoked,No,Senior Adult,Obesity
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes,Senior Adult,Obesity
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,No,Middle Age Adult,Obesity
4,Female,79.0,Yes,No,Yes,Self Employed,Rural,174.12,24.0,never smoked,No,Senior Adult,Healthy weight
5,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,No,Senior Adult,Overweight
6,Male,74.0,Yes,Yes,Yes,Private,Rural,70.09,27.4,never smoked,Yes,Senior Adult,Overweight
7,Female,69.0,No,No,No,Private,Urban,94.39,22.8,never smoked,No,Senior Adult,Healthy weight
8,Female,59.0,No,No,Yes,Private,Rural,76.15,,Unknown,No,Middle Age Adult,Obesity
9,Female,78.0,No,No,Yes,Private,Urban,58.57,24.2,Unknown,No,Senior Adult,Healthy weight


# 6. Replace Null Values

In [381]:
df_brain.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
brain_stroke           0
stage_of_life          0
weight_status          0
dtype: int64

As there are 201 null values for bmi, if we remove them, it will reduce dataset drastically. So, I am replacing null values with median.

In [382]:
df_brain['bmi'] = df_brain['bmi'].fillna(df_brain['bmi'].median())

In [383]:
df_brain.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
brain_stroke         0
stage_of_life        0
weight_status        0
dtype: int64

# 7. String Values case change

Finally, changing string values to UPPER CASE to avoid same values due to different cases falling into 2 different categories

In [391]:
df_brain = df_brain.apply(lambda x: x.astype(str).str.upper())

In [392]:
np.sort(df_brain['bmi'].unique())

array(['10.3', '11.3', '11.5', '12.0', '12.3', '12.8', '13.0', '13.2',
       '13.3', '13.4', '13.5', '13.7', '13.8', '13.9', '14.0', '14.1',
       '14.2', '14.3', '14.4', '14.5', '14.6', '14.8', '14.9', '15.0',
       '15.1', '15.2', '15.3', '15.4', '15.5', '15.6', '15.7', '15.8',
       '15.9', '16.0', '16.1', '16.2', '16.3', '16.4', '16.5', '16.6',
       '16.7', '16.8', '16.9', '17.0', '17.1', '17.2', '17.3', '17.4',
       '17.5', '17.6', '17.7', '17.8', '17.9', '18.0', '18.1', '18.2',
       '18.3', '18.4', '18.5', '18.6', '18.7', '18.8', '18.9', '19.0',
       '19.1', '19.2', '19.3', '19.4', '19.5', '19.6', '19.7', '19.8',
       '19.9', '20.0', '20.1', '20.2', '20.3', '20.4', '20.5', '20.6',
       '20.7', '20.8', '20.9', '21.0', '21.1', '21.2', '21.3', '21.4',
       '21.5', '21.6', '21.7', '21.8', '21.9', '22.0', '22.1', '22.2',
       '22.3', '22.4', '22.5', '22.6', '22.7', '22.8', '22.9', '23.0',
       '23.1', '23.2', '23.3', '23.4', '23.5', '23.6', '23.7', '23.8',
      

Final dataset after Data preparation:

In [393]:
df_brain

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,brain_stroke,stage_of_life,weight_status
0,MALE,67.0,NO,YES,YES,PRIVATE,URBAN,228.69,36.6,FORMERLY SMOKED,YES,SENIOR ADULT,OBESITY
1,FEMALE,61.0,NO,NO,YES,SELF EMPLOYED,RURAL,202.21,28.1,NEVER SMOKED,NO,SENIOR ADULT,OBESITY
2,MALE,80.0,NO,YES,YES,PRIVATE,RURAL,105.92,32.5,NEVER SMOKED,YES,SENIOR ADULT,OBESITY
3,FEMALE,49.0,NO,NO,YES,PRIVATE,URBAN,171.23,34.4,SMOKES,NO,MIDDLE AGE ADULT,OBESITY
4,FEMALE,79.0,YES,NO,YES,SELF EMPLOYED,RURAL,174.12,24.0,NEVER SMOKED,NO,SENIOR ADULT,HEALTHY WEIGHT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,FEMALE,80.0,YES,NO,YES,PRIVATE,URBAN,83.75,28.1,NEVER SMOKED,NO,SENIOR ADULT,OBESITY
5106,FEMALE,81.0,NO,NO,YES,SELF EMPLOYED,URBAN,125.2,40.0,NEVER SMOKED,NO,SENIOR ADULT,OBESITY
5107,FEMALE,35.0,NO,NO,YES,SELF EMPLOYED,RURAL,82.99,30.6,NEVER SMOKED,NO,ADULT,OBESITY
5108,MALE,51.0,NO,NO,YES,PRIVATE,RURAL,166.29,25.6,FORMERLY SMOKED,NO,MIDDLE AGE ADULT,OVERWEIGHT
