**Processing and Transforming Data**

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read the raw CSV into the working DataFrame
raw_csv_path = '../../data/raw/heart.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Display the loaded DataFrame as this cell's output
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
# Count the missing entries in each column
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [5]:
# Count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(0)

In [6]:
# Handling Outliers
# A row is dropped when any of the selected numerical columns has an
# absolute z-score above Z_THRESHOLD.
Z_THRESHOLD = 3


def Z_Score(x):
    """Standardise x: subtract its mean, then divide by its standard deviation."""
    return (x - x.mean()) / x.std()


# Numerical columns, leaving out the HeartDisease and FastingBS columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(["HeartDisease", "FastingBS"])

# Z-scores live in their own frame, so df never carries temporary helper
# columns and nothing has to be dropped again afterwards.
z_scores = df[num_cols].apply(Z_Score)

# check for outliers: True for every row with at least one extreme z-score
is_outlier = (z_scores.abs() > Z_THRESHOLD).any(axis=1)
outliers = df[is_outlier]

# .copy() detaches the filtered frame from the original, so the column
# assignments in later cells write to an independent frame, not a slice.
df = df[~is_outlier].copy()

In [7]:
# Write the current frame to CSV, leaving the index out of the file
processed_csv_path = '../../data/processed/heart_processed.csv'
df.to_csv(processed_csv_path, index=False)

In [8]:
# Encoding Sex
# Keep in sync with the "Sex" entry of the `encodings` dict saved to encodings.json.
sex_codes = {"M": 1, "F": 0}

# .map() turns any value without a code into NaN, so fail loudly instead
# (this also catches an accidental second run of this cell).
unmapped = set(df["Sex"].unique()) - set(sex_codes)
assert not unmapped, f"Unmapped Sex values: {unmapped}"

# assign() builds a new frame whose column takes the dtype of the mapped codes,
# rather than writing the codes into the existing text column via .loc[:, ...].
df = df.assign(Sex=df["Sex"].map(sex_codes))

In [9]:
# Encoding ChestPainType
# Keep in sync with the "ChestPainType" entry of the `encodings` dict saved to encodings.json.
chest_pain_codes = {"ATA": 0, "NAP": 1, "ASY": 2, "TA": 3}

# .map() turns any value without a code into NaN, so check the observed
# categories first (this also catches an accidental second run of this cell).
categories = df["ChestPainType"].unique()
unmapped = set(categories) - set(chest_pain_codes)
assert not unmapped, f"Unmapped ChestPainType values: {unmapped}"

# assign() builds a new frame whose column takes the dtype of the mapped codes,
# rather than writing the codes into the existing text column via .loc[:, ...].
df = df.assign(ChestPainType=df["ChestPainType"].map(chest_pain_codes))

In [10]:
# Encoding RestingECG
# Keep in sync with the "RestingECG" entry of the `encodings` dict saved to encodings.json.
resting_ecg_codes = {"Normal": 0, "ST": 1, "LVH": 2}

# .map() turns any value without a code into NaN, so check the observed
# categories first (this also catches an accidental second run of this cell).
categories = df["RestingECG"].unique()
unmapped = set(categories) - set(resting_ecg_codes)
assert not unmapped, f"Unmapped RestingECG values: {unmapped}"

# assign() builds a new frame whose column takes the dtype of the mapped codes,
# rather than writing the codes into the existing text column via .loc[:, ...].
df = df.assign(RestingECG=df["RestingECG"].map(resting_ecg_codes))

In [11]:
# Encoding ExerciseAngina
# Keep in sync with the "ExerciseAngina" entry of the `encodings` dict saved to encodings.json.
exercise_angina_codes = {"N": 0, "Y": 1}

# .map() turns any value without a code into NaN, so fail loudly instead
# (this also catches an accidental second run of this cell).
unmapped = set(df["ExerciseAngina"].unique()) - set(exercise_angina_codes)
assert not unmapped, f"Unmapped ExerciseAngina values: {unmapped}"

# assign() builds a new frame whose column takes the dtype of the mapped codes,
# rather than writing the codes into the existing text column via .loc[:, ...].
df = df.assign(ExerciseAngina=df["ExerciseAngina"].map(exercise_angina_codes))

In [12]:
# Encoding ST_Slope
# Keep in sync with the "ST_Slope" entry of the `encodings` dict saved to encodings.json.
st_slope_codes = {"Up": 0, "Flat": 1, "Down": 2}

# .map() turns any value without a code into NaN, so check the observed
# categories first (this also catches an accidental second run of this cell).
categories = df["ST_Slope"].unique()
unmapped = set(categories) - set(st_slope_codes)
assert not unmapped, f"Unmapped ST_Slope values: {unmapped}"

# assign() builds a new frame whose column takes the dtype of the mapped codes,
# rather than writing the codes into the existing text column via .loc[:, ...].
df = df.assign(ST_Slope=df["ST_Slope"].map(st_slope_codes))

In [13]:
# Write the encoded frame to CSV, leaving the index out of the file
encoded_csv_path = '../../data/processed/heart_processed_encoded.csv'
df.to_csv(encoded_csv_path, index=False)

In [14]:
# Label -> integer code tables, one per categorical column
encodings = {
    "Sex": {
        "M": 1,
        "F": 0,
    },
    "ChestPainType": {
        "ATA": 0,
        "NAP": 1,
        "ASY": 2,
        "TA": 3,
    },
    "RestingECG": {
        "Normal": 0,
        "ST": 1,
        "LVH": 2,
    },
    "ExerciseAngina": {
        "N": 0,
        "Y": 1,
    },
    "ST_Slope": {
        "Up": 0,
        "Flat": 1,
        "Down": 2,
    },
}

# Serialise the tables first, then write the text out as a JSON file
encodings_text = json.dumps(encodings, indent=4)
with open('../../data/processed/encodings.json', 'w') as out_file:
    out_file.write(encodings_text)