In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [34]:
df = pd.read_csv("Heart_new2.csv")
df.head(5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [35]:
df.shape

(4500, 18)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HeartDisease      4500 non-null   object 
 1   BMI               4500 non-null   float64
 2   Smoking           4500 non-null   object 
 3   AlcoholDrinking   4500 non-null   object 
 4   Stroke            4500 non-null   object 
 5   PhysicalHealth    4500 non-null   int64  
 6   MentalHealth      4500 non-null   int64  
 7   DiffWalking       4500 non-null   object 
 8   Sex               4500 non-null   object 
 9   AgeCategory       4500 non-null   object 
 10  Race              4500 non-null   object 
 11  Diabetic          4500 non-null   object 
 12  PhysicalActivity  4500 non-null   object 
 13  GenHealth         4500 non-null   object 
 14  SleepTime         4500 non-null   int64  
 15  Asthma            4500 non-null   object 
 16  KidneyDisease     4500 non-null   object 


In [37]:
df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [38]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,4500.0,4500.0,4500.0,4500.0
mean,29.34004,4.105111,4.439778,7.043333
std,6.676625,8.573743,8.593363,1.595765
min,12.48,0.0,0.0,1.0
25%,24.6875,0.0,0.0,6.0
50%,28.25,0.0,0.0,7.0
75%,33.0,3.0,5.0,8.0
max,75.82,30.0,30.0,20.0


In [39]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [40]:
# Binary Encoding
df['HeartDisease'] = df['HeartDisease'].map({'Yes':1, 'No':0})
df['Smoking'] = df['Smoking'].map({'Yes':1, 'No':0})
df['AlcoholDrinking'] = df['AlcoholDrinking'].map({'Yes':1, 'No':0})
df['Stroke'] = df['Stroke'].map({'Yes':1, 'No':0})
df['DiffWalking'] = df['DiffWalking'].map({'Yes':1, 'No':0})
df['Diabetic'] = df['Diabetic'].map({'Yes':1, 'No':0})
df['PhysicalActivity'] = df['PhysicalActivity'].map({'Yes':1, 'No':0})
df['Asthma'] = df['Asthma'].map({'Yes':1, 'No':0})
df['KidneyDisease'] = df['KidneyDisease'].map({'Yes':1, 'No':0})
df['SkinCancer'] = df['SkinCancer'].map({'Yes':1, 'No':0})

In [41]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3,30,0,Female,55-59,White,1.0,1,Very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,Female,80 or older,White,0.0,1,Very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,Male,65-69,White,1.0,1,Fair,8,1,0,0
3,0,24.21,0,0,0,0,0,0,Female,75-79,White,0.0,0,Good,6,0,0,1
4,0,23.71,0,0,0,28,0,1,Female,40-44,White,0.0,1,Very good,8,0,0,0


In [43]:
# Binary Encoding for Sex column

df['Sex'] = df['Sex'].map({'Male':1, 'Female':0})

In [44]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3,30,0,0,55-59,White,1.0,1,Very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80 or older,White,0.0,1,Very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,65-69,White,1.0,1,Fair,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,75-79,White,0.0,0,Good,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,40-44,White,0.0,1,Very good,8,0,0,0


In [68]:
df['AgeCategory'].value_counts()

AgeCategory
65-69          537
60-64          469
70-74          439
55-59          394
50-54          388
75-79          320
45-49          314
80 or older    310
40-44          299
35-39          271
30-34          269
18-24          267
25-29          223
Name: count, dtype: int64

In [69]:
df['AgeCategory'].unique().tolist()

['55-59',
 '80 or older',
 '65-69',
 '75-79',
 '40-44',
 '70-74',
 '60-64',
 '50-54',
 '45-49',
 '18-24',
 '35-39',
 '30-34',
 '25-29']

In [70]:
# Ordinal Encoding on AgeCategory column 

age_mapping = {
'18-24':0, '25-29':1, '30-34':2, '35-39':3,
'40-44':4, '45-49':5, '50-54':6, '55-59':7,
'60-64':8, '65-69':9, '70-74':10, '75-79':11,
'80 or older':12
}

df['AgeCategory'].map(age_mapping)

0        8
1       13
2       10
3       12
4        5
        ..
4495    12
4496     5
4497     7
4498     7
4499    10
Name: AgeCategory, Length: 4500, dtype: int64

In [None]:
oe = OrdinalEncoder(categories = aage_order)


In [46]:
df['Race'].value_counts()

Race
White                             3171
Black                             1088
Other                               86
Hispanic                            71
American Indian/Alaskan Native      51
Asian                               33
Name: count, dtype: int64

In [51]:
df['Race'].unique().tolist()

['White',
 'Black',
 'Asian',
 'American Indian/Alaskan Native',
 'Other',
 'Hispanic']

In [47]:
df['GenHealth'].value_counts()

GenHealth
Good         1427
Very good    1416
Excellent     728
Fair          670
Poor          259
Name: count, dtype: int64

In [49]:
df['GenHealth'].unique().tolist()

['Very good', 'Fair', 'Good', 'Poor', 'Excellent']