# Data preprocessing

## Outlier Analysis

In [82]:
# Import packages
import pandas as pd
from matplotlib import pyplot as plt
import os
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [83]:
# Two paths specified; one for pc and another for colab

data_path = f'{Path(os.getcwd()).parent}/data/heart.csv'
# data_path = '/content/heart.csv'

In [84]:
# Read dataset
# Info and describe
df = pd.read_csv(data_path)
df.head() 

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [86]:
data = df.drop('HeartDisease', axis=1) 
label = df['HeartDisease'] 

Numeric features need to be scaled, and categorical features need to be encoded.

In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 79.0+ KB


In [88]:
# Scaling, normalization, etc
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [89]:
scaler = StandardScaler() 

In [90]:
scaled_data = scaler.fit_transform(data[numeric_features])

scaled_column_names = [f'scaled_{col}' for col in numeric_features]

preprocessed_df = pd.DataFrame(scaled_data, columns=scaled_column_names)

In [91]:
preprocessed_df

Unnamed: 0,scaled_Age,scaled_RestingBP,scaled_Cholesterol,scaled_MaxHR,scaled_Oldpeak
0,-1.433140,0.410909,0.825070,1.382928,-0.832432
1,-0.478484,1.491752,-0.171961,0.754157,0.105664
2,-1.751359,-0.129513,0.770188,-1.525138,-0.832432
3,-0.584556,0.302825,0.139040,-1.132156,0.574711
4,0.051881,0.951331,-0.034755,-0.581981,-0.832432
...,...,...,...,...,...
913,-0.902775,-1.210356,0.596393,-0.188999,0.293283
914,1.536902,0.627078,-0.053049,0.164684,2.357094
915,0.370100,-0.129513,-0.620168,-0.857069,0.293283
916,0.370100,-0.129513,0.340275,1.461525,-0.832432


In [92]:
cat_cols = set(data.columns).difference(set(numeric_features)) 

In [93]:
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[list(cat_cols)])

# Get the new column names from the encoder
encoded_columns = encoder.get_feature_names_out(list(cat_cols))  

In [94]:
encoded_columns

array(['ExerciseAngina_N', 'ExerciseAngina_Y', 'Sex_F', 'Sex_M',
       'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'ST_Slope_Down', 'ST_Slope_Flat',
       'ST_Slope_Up', 'RestingECG_LVH', 'RestingECG_Normal',
       'RestingECG_ST', 'FastingBS_0', 'FastingBS_1'], dtype=object)

In [95]:
preprocessed_df[encoded_columns] = encoded_data.toarray() 

In [96]:
preprocessed_df['HeartDisease'] = label

In [99]:
preprocessed_df

Unnamed: 0,scaled_Age,scaled_RestingBP,scaled_Cholesterol,scaled_MaxHR,scaled_Oldpeak,ExerciseAngina_N,ExerciseAngina_Y,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,FastingBS_0,FastingBS_1,HeartDisease
0,-1.433140,0.410909,0.825070,1.382928,-0.832432,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
1,-0.478484,1.491752,-0.171961,0.754157,0.105664,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
2,-1.751359,-0.129513,0.770188,-1.525138,-0.832432,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0
3,-0.584556,0.302825,0.139040,-1.132156,0.574711,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,0.051881,0.951331,-0.034755,-0.581981,-0.832432,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,-1.210356,0.596393,-0.188999,0.293283,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
914,1.536902,0.627078,-0.053049,0.164684,2.357094,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1
915,0.370100,-0.129513,-0.620168,-0.857069,0.293283,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
916,0.370100,-0.129513,0.340275,1.461525,-0.832432,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1


In [100]:
preprocessed_df.to_csv('../data/preprocessed_data.csv') 

In [98]:
# Saved preprocessed data to data/ directory with filename preprocessed_data.csv