In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the heart-disease CSV into a DataFrame and discard the "Unnamed: 0"
# column (presumably a row index written out when the CSV was saved -- TODO confirm).
heart_disease_dataset = pd.read_csv(
    'heart disease classification dataset.csv'
).drop(columns=["Unnamed: 0"])

# Preview the first five rows.
print(heart_disease_dataset.head())

   age     sex  cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0   63    male   3     145.0  233.0    1        0    150.0      0      2.3   
1   37    male   2     130.0  250.0    0        1    187.0      0      3.5   
2   41  female   1     130.0  204.0    0        0    172.0      0      1.4   
3   56    male   1     120.0  236.0    0        1    178.0      0      0.8   
4   57  female   0       NaN  354.0    0        1    163.0      1      0.6   

   slope  ca  thal target  
0      0   0     1    yes  
1      0   0     2    yes  
2      2   0     2    yes  
3      2   0     2    yes  
4      2   0     2    yes  


In [3]:
# Count the missing (NaN) entries in every column before imputation.
missing_per_column = heart_disease_dataset.isna().sum()
print(missing_per_column)

age         0
sex         0
cp          0
trestbps    4
chol        1
fbs         0
restecg     0
thalach     5
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [18]:
#Handling the missing values
from sklearn.impute import SimpleImputer

impute = SimpleImputer(missing_values=np.nan, strategy='mean')

# Mean-impute trestbps and chol one column at a time. The imputer is re-fitted
# for each column, so after the loop `impute` holds the statistics for "chol".
for column_name in ["trestbps", "chol"]:
    heart_disease_dataset[[column_name]] = impute.fit_transform(
        heart_disease_dataset[[column_name]]
    )

# thalach is filled with the column mean truncated to an integer (Series.mean
# skips NaN). The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and does not update the parent frame under pandas Copy-on-Write.
thalach_fill_value = int(heart_disease_dataset["thalach"].mean())
heart_disease_dataset["thalach"] = heart_disease_dataset["thalach"].fillna(thalach_fill_value)

# Every column should now report zero missing values.
print(heart_disease_dataset.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [19]:
# Convert the string-valued "sex" and "target" columns to integer codes.
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted per column; once the loop ends, `enc` is the
# encoder that was fitted on "target".
for column_name in ("sex", "target"):
    enc = LabelEncoder()
    heart_disease_dataset[column_name] = enc.fit_transform(heart_disease_dataset[column_name])

# Resulting codes for sex:    female -> 0, male -> 1
print(heart_disease_dataset[["sex"]])

# Resulting codes for target: no -> 0, yes -> 1
print(heart_disease_dataset[["target"]])

     sex
0      1
1      1
2      0
3      1
4      0
..   ...
298    0
299    1
300    1
301    1
302    0

[303 rows x 1 columns]
     target
0         1
1         1
2         1
3         1
4         1
..      ...
298       0
299       0
300       0
301       0
302       0

[303 rows x 1 columns]


In [20]:
# Print the whole DataFrame to confirm that the sex and target columns now
# hold integer codes alongside the imputed numeric columns.
print(heart_disease_dataset)

     age  sex  cp    trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3  145.000000  233.0    1        0    150.0      0      2.3   
1     37    1   2  130.000000  250.0    0        1    187.0      0      3.5   
2     41    0   1  130.000000  204.0    0        0    172.0      0      1.4   
3     56    1   1  120.000000  236.0    0        1    178.0      0      0.8   
4     57    0   0  131.712375  354.0    0        1    163.0      1      0.6   
..   ...  ...  ..         ...    ...  ...      ...      ...    ...      ...   
298   57    0   0  140.000000  241.0    0        1    123.0      1      0.2   
299   45    1   3  110.000000  264.0    0        1    132.0      0      1.2   
300   68    1   0  144.000000  193.0    1        1    141.0      0      3.4   
301   57    1   0  131.712375  131.0    0        1    115.0      1      1.2   
302   57    0   1  130.000000  236.0    0        0    174.0      0      0.0   

     slope  ca  thal  target  
0        0   0     1

In [21]:
#Scaling all the values between 0-1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform learns each column's min/max and rescales in a single step;
# `scaler` stays fitted afterwards. The result is a plain NumPy array.
scaled_values = scaler.fit_transform(heart_disease_dataset)

# Rebuild a DataFrame using the source frame's own column labels and row index
# rather than a hand-typed list, so the labels cannot drift out of sync with
# the data if the set of columns changes.
heart_disease_dataset_scaled = pd.DataFrame(
    scaled_values,
    columns=heart_disease_dataset.columns,
    index=heart_disease_dataset.index,
)
print(heart_disease_dataset_scaled)

          age  sex        cp  trestbps      chol  fbs  restecg   thalach  \
0    0.708333  1.0  1.000000  0.481132  0.244292  1.0      0.0  0.603053   
1    0.166667  1.0  0.666667  0.339623  0.283105  0.0      0.5  0.885496   
2    0.250000  0.0  0.333333  0.339623  0.178082  0.0      0.0  0.770992   
3    0.562500  1.0  0.333333  0.245283  0.251142  0.0      0.5  0.816794   
4    0.583333  0.0  0.000000  0.355777  0.520548  0.0      0.5  0.702290   
..        ...  ...       ...       ...       ...  ...      ...       ...   
298  0.583333  0.0  0.000000  0.433962  0.262557  0.0      0.5  0.396947   
299  0.333333  1.0  1.000000  0.150943  0.315068  0.0      0.5  0.465649   
300  0.812500  1.0  0.000000  0.471698  0.152968  1.0      0.5  0.534351   
301  0.583333  1.0  0.000000  0.355777  0.011416  0.0      0.5  0.335878   
302  0.583333  0.0  0.333333  0.339623  0.251142  0.0      0.0  0.786260   

     exang   oldpeak  slope    ca      thal  target  
0      0.0  0.370968    0.0  0.00

In [22]:
#Separating features and labels
# Features: every column from 'trestbps' through 'thal' (label-based, inclusive slice).
# NOTE(review): this slices the unscaled frame and leaves out age, sex and cp --
# confirm whether heart_disease_dataset_scaled and the full feature set were intended.
features = heart_disease_dataset.loc[:, 'trestbps':'thal']

# Labels: the encoded target column, kept as a one-column DataFrame.
# `labels` was printed below without being defined in this notebook, which
# raises NameError on a fresh kernel.
labels = heart_disease_dataset[['target']]

print(features, end='\n\n\n')
print(labels, end='\n\n\n')

       trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal
0    145.000000  233.0    1        0    150.0      0      2.3      0   0     1
1    130.000000  250.0    0        1    187.0      0      3.5      0   0     2
2    130.000000  204.0    0        0    172.0      0      1.4      2   0     2
3    120.000000  236.0    0        1    178.0      0      0.8      2   0     2
4    131.712375  354.0    0        1    163.0      1      0.6      2   0     2
..          ...    ...  ...      ...      ...    ...      ...    ...  ..   ...
298  140.000000  241.0    0        1    123.0      1      0.2      1   0     3
299  110.000000  264.0    0        1    132.0      0      1.2      1   0     3
300  144.000000  193.0    1        1    141.0      0      3.4      1   2     3
301  131.712375  131.0    0        1    115.0      1      1.2      1   1     3
302  130.000000  236.0    0        0    174.0      0      0.0      1   1     2

[303 rows x 10 columns]


     target
0         1
1