In [371]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [329]:
df = pd.read_csv("Heart_new2.csv")
df.head(5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [330]:
df.shape

(4500, 18)

In [331]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HeartDisease      4500 non-null   object 
 1   BMI               4500 non-null   float64
 2   Smoking           4500 non-null   object 
 3   AlcoholDrinking   4500 non-null   object 
 4   Stroke            4500 non-null   object 
 5   PhysicalHealth    4500 non-null   int64  
 6   MentalHealth      4500 non-null   int64  
 7   DiffWalking       4500 non-null   object 
 8   Sex               4500 non-null   object 
 9   AgeCategory       4500 non-null   object 
 10  Race              4500 non-null   object 
 11  Diabetic          4500 non-null   object 
 12  PhysicalActivity  4500 non-null   object 
 13  GenHealth         4500 non-null   object 
 14  SleepTime         4500 non-null   int64  
 15  Asthma            4500 non-null   object 
 16  KidneyDisease     4500 non-null   object 


In [332]:
df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [333]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,4500.0,4500.0,4500.0,4500.0
mean,29.34004,4.105111,4.439778,7.043333
std,6.676625,8.573743,8.593363,1.595765
min,12.48,0.0,0.0,1.0
25%,24.6875,0.0,0.0,6.0
50%,28.25,0.0,0.0,7.0
75%,33.0,3.0,5.0,8.0
max,75.82,30.0,30.0,20.0


In [334]:
df.columns.tolist()

['HeartDisease',
 'BMI',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'PhysicalHealth',
 'MentalHealth',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'SleepTime',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [335]:
# Binary Encoding

yes_no_columns = ['HeartDisease','Smoking','AlcoholDrinking','Stroke','DiffWalking',
                  'PhysicalActivity','Asthma','KidneyDisease','SkinCancer']

for col in yes_no_columns:
    df[col] = df[col].map({'Yes':1, 'No':0}) 

In [336]:
# Binary Encoding for Sex column

df['Sex'] = df['Sex'].map({'Male':1, 'Female':0})

In [337]:
# Diabetic column have 4 unique category so we treat yes and no only

df['Diabetic'].unique().tolist()

['Yes', 'No', 'No, borderline diabetes', 'Yes (during pregnancy)']

In [338]:
# Binary Encoding for Diabetic column

df['Diabetic'] = df['Diabetic'].map({'Yes':1, 'No':0, 'Yes (during pregnancy)': 1, 'No, borderline diabetes':0})

In [339]:
df['AgeCategory'].value_counts()

AgeCategory
65-69          537
60-64          469
70-74          439
55-59          394
50-54          388
75-79          320
45-49          314
80 or older    310
40-44          299
35-39          271
30-34          269
18-24          267
25-29          223
Name: count, dtype: int64

In [340]:
df['AgeCategory'].unique().tolist()

['55-59',
 '80 or older',
 '65-69',
 '75-79',
 '40-44',
 '70-74',
 '60-64',
 '50-54',
 '45-49',
 '18-24',
 '35-39',
 '30-34',
 '25-29']

In [341]:
# Manually Ordinal Encoding on AgeCategory column

# age_mapping = {
# '18-24':0, '25-29':1, '30-34':2, '35-39':3,
# '40-44':4, '45-49':5, '50-54':6, '55-59':7,
# '60-64':8, '65-69':9, '70-74':10, '75-79':11,
# '80 or older':12
# }

# df['AgeCategory'] = df['AgeCategory'].map(age_mapping)

In [342]:
# Ordinal Encoding on AgeCategory column through sklearn

age_order = ['18-24', '25-29', '30-34', '35-39','40-44', '45-49', '50-54', '55-59',
'60-64', '65-69', '70-74', '75-79','80 or older']

oe = OrdinalEncoder(categories=[age_order])
df['AgeCategory'] = oe.fit_transform(df[['AgeCategory']])

In [343]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3,30,0,0,7.0,White,1,1,Very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,12.0,White,0,1,Very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,9.0,White,1,1,Fair,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,11.0,White,0,0,Good,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,4.0,White,0,1,Very good,8,0,0,0


In [344]:
df['GenHealth'].value_counts()

GenHealth
Good         1427
Very good    1416
Excellent     728
Fair          670
Poor          259
Name: count, dtype: int64

In [345]:
df['GenHealth'].unique().tolist()

['Very good', 'Fair', 'Good', 'Poor', 'Excellent']

In [346]:
# Manually Ordinal Encoding on GenHealth column

genhealth_mapping = {
    'Poor':0,
    'Fair':1,
    'Good':2,
    'Very good':3,
    'Excellent':4
}

df['GenHealth'] = df['GenHealth'].map(genhealth_mapping)

In [347]:
df['Race'].value_counts()

Race
White                             3171
Black                             1088
Other                               86
Hispanic                            71
American Indian/Alaskan Native      51
Asian                               33
Name: count, dtype: int64

In [348]:
df['Race'].unique().tolist()

['White',
 'Black',
 'Asian',
 'American Indian/Alaskan Native',
 'Other',
 'Hispanic']

In [349]:
# We apply OneHotEncoding on Race Column because it is nominal (no order)

'''
In OneHotEncoding
column_categories = n
So,
   column_1 + column_2 + ... + column_n = 1
drop='first' it means remove one dummy column because we can find value of that column from rest of the columns
( that_column = 1 - rest_of_the_columns )
'''


ohe = OneHotEncoder(sparse_output=False, drop='first')

race_encoded = ohe.fit_transform(df[['Race']])

race_df = pd.DataFrame(
    race_encoded,
    columns=ohe.get_feature_names_out(['Race'])
)

df = pd.concat([df.drop(columns=['Race']), race_df], axis=1)

In [350]:
df.columns.tolist()

['HeartDisease',
 'BMI',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'PhysicalHealth',
 'MentalHealth',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'SleepTime',
 'Asthma',
 'KidneyDisease',
 'SkinCancer',
 'Race_Asian',
 'Race_Black',
 'Race_Hispanic',
 'Race_Other',
 'Race_White']

In [351]:
df.head(5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.6,1,0,0,3,30,0,0,7.0,...,3,5,1,0,1,0.0,0.0,0.0,0.0,1.0
1,0,20.34,0,0,1,0,0,0,0,12.0,...,3,7,0,0,0,0.0,0.0,0.0,0.0,1.0
2,0,26.58,1,0,0,20,30,0,1,9.0,...,1,8,1,0,0,0.0,0.0,0.0,0.0,1.0
3,0,24.21,0,0,0,0,0,0,0,11.0,...,2,6,0,0,1,0.0,0.0,0.0,0.0,1.0
4,0,23.71,0,0,0,28,0,1,0,4.0,...,3,8,0,0,0,0.0,0.0,0.0,0.0,1.0


In [352]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HeartDisease      4500 non-null   int64  
 1   BMI               4500 non-null   float64
 2   Smoking           4500 non-null   int64  
 3   AlcoholDrinking   4500 non-null   int64  
 4   Stroke            4500 non-null   int64  
 5   PhysicalHealth    4500 non-null   int64  
 6   MentalHealth      4500 non-null   int64  
 7   DiffWalking       4500 non-null   int64  
 8   Sex               4500 non-null   int64  
 9   AgeCategory       4500 non-null   float64
 10  Diabetic          4500 non-null   int64  
 11  PhysicalActivity  4500 non-null   int64  
 12  GenHealth         4500 non-null   int64  
 13  SleepTime         4500 non-null   int64  
 14  Asthma            4500 non-null   int64  
 15  KidneyDisease     4500 non-null   int64  
 16  SkinCancer        4500 non-null   int64  


In [None]:
'''
Handle Outlier in Continuous numeric Columns
1. BMI
2. SleepTime
3. PhysicalHealth
4. MentalHealth
'''

In [354]:
# Now Check outlier in BMI Column

df['BMI'].describe()

count    4500.000000
mean       29.340040
std         6.676625
min        12.480000
25%        24.687500
50%        28.250000
75%        33.000000
max        75.820000
Name: BMI, dtype: float64

In [355]:
# Check IQR (Inter Quartilie Range) of BMI Column

Q1 = df['BMI'].quantile(0.25)
Q3 = df['BMI'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound: ", lower_bound)
print("Upper Bound: ", upper_bound)

Lower Bound:  12.21875
Upper Bound:  45.46875


In [356]:
# Handle Outlier by Capping (If BMI value out of IQR range then it keep value inside range)

df['BMI'] = df['BMI'].clip(lower=12)
df['BMI'] = df['BMI'].clip(upper=45)

In [357]:
df['BMI'].describe()

count    4500.000000
mean       29.209409
std         6.255785
min        12.480000
25%        24.687500
50%        28.250000
75%        33.000000
max        45.000000
Name: BMI, dtype: float64

In [358]:
# Now Check outlier in SleepTime Column

df['SleepTime'].describe()

count    4500.000000
mean        7.043333
std         1.595765
min         1.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        20.000000
Name: SleepTime, dtype: float64

In [359]:
# Check IQR (Inter Quartilie Range) of SleepTime Column

Q1 = df['SleepTime'].quantile(0.25)
Q3 = df['SleepTime'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound: ", lower_bound)
print("Upper Bound: ", upper_bound)

Lower Bound:  3.0
Upper Bound:  11.0


In [360]:
# Handle Outlier by Capping (If SleepTime value out of IQR range then it keep value inside range)

df['SleepTime'] = df['SleepTime'].clip(lower=3)
df['SleepTime'] = df['SleepTime'].clip(upper=11)

In [361]:
df['SleepTime'].describe()

count    4500.000000
mean        7.009778
std         1.440806
min         3.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        11.000000
Name: SleepTime, dtype: float64

In [362]:
# Now Check outlier in PhysicalHealth Column

df['PhysicalHealth'].describe()

count    4500.000000
mean        4.105111
std         8.573743
min         0.000000
25%         0.000000
50%         0.000000
75%         3.000000
max        30.000000
Name: PhysicalHealth, dtype: float64

In [363]:
# Check IQR (Inter Quartilie Range) of PhysicalHealth Column

Q1 = df['PhysicalHealth'].quantile(0.25)
Q3 = df['PhysicalHealth'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound: ", lower_bound)
print("Upper Bound: ", upper_bound)

Lower Bound:  -4.5
Upper Bound:  7.5


In [364]:
# Handle Outlier by Capping (If PhysicalHealth value out of IQR range then it keep value inside range)

df['PhysicalHealth'] = df['PhysicalHealth'].clip(lower=-4.5)
df['PhysicalHealth'] = df['PhysicalHealth'].clip(upper=7.5)

In [365]:
df['PhysicalHealth'].describe()

count    4500.000000
mean        1.786333
std         2.890527
min         0.000000
25%         0.000000
50%         0.000000
75%         3.000000
max         7.500000
Name: PhysicalHealth, dtype: float64

In [366]:
# Now Check outlier in MentalHealth Column

df['MentalHealth'].describe()

count    4500.000000
mean        4.439778
std         8.593363
min         0.000000
25%         0.000000
50%         0.000000
75%         5.000000
max        30.000000
Name: MentalHealth, dtype: float64

In [367]:
# Check IQR (Inter Quartilie Range) of MentalHealth Column

Q1 = df['MentalHealth'].quantile(0.25)
Q3 = df['MentalHealth'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound: ", lower_bound)
print("Upper Bound: ", upper_bound)

Lower Bound:  -7.5
Upper Bound:  12.5


In [368]:
# Handle Outlier by Capping (If MentalHealth value out of IQR range then it keep value inside range)

df['MentalHealth'] = df['MentalHealth'].clip(lower=-7.5)
df['MentalHealth'] = df['MentalHealth'].clip(upper=12.5)

In [369]:
df['MentalHealth'].describe()

count    4500.000000
mean        2.838222
std         4.589742
min         0.000000
25%         0.000000
50%         0.000000
75%         5.000000
max        12.500000
Name: MentalHealth, dtype: float64

In [370]:
# Export the cleand preprocessed dataset

df.to_csv("heart_data_clean_preprocessed.csv", index=False)