In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Load dataset

In [14]:
# Read the raw records from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer="heart.csv")

## Drop duplicates

In [15]:
# Remove rows that repeat an earlier row across every column, keeping the
# first occurrence. The surviving rows keep their original index labels.
df = df.drop_duplicates(subset=None, keep="first")

## Define features and target

In [16]:
# Separate the label column from the predictors; df itself is not modified.
y = df["target"]
X = df.drop(columns="target")


## Define categorical and numerical columns

In [17]:
# Columns that the preprocessor one-hot encodes.
categorical_features = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]

# Columns that the preprocessor standardizes.
numerical_features = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]

## Preprocessor: scale numericals, one-hot encode categoricals

In [18]:
# Route each column group through its own transformer: standardize the
# numeric columns and one-hot encode the categorical ones, dropping the
# first level of every categorical column.
#
# sparse_threshold=0 forces the combined output to be a dense array. With
# the default threshold (0.3) ColumnTransformer switches to a SciPy sparse
# matrix whenever the stacked output's density falls below it, so the
# result type would depend on the data; the later
# pd.DataFrame(X_processed, columns=all_features) cell expects a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ],
    sparse_threshold=0,
)

## Fit and transform


In [19]:
# Fit the preprocessor on X and transform X in the same call.
# NOTE(review): this fits on every row of X; if a train/test split is made
# later, the fit should probably use the training portion only — confirm.
X_processed = preprocessor.fit_transform(X, y=None)


## Get new column names

In [20]:
# Ask the fitted one-hot encoder (registered under the name "cat") for the
# names of the indicator columns it produced.
encoded_cols = (
    preprocessor
    .named_transformers_["cat"]
    .get_feature_names_out(categorical_features)
)

# Numeric names come first, matching the order in which the transformers
# were listed when the preprocessor was built.
all_features = [*numerical_features, *encoded_cols]


## Convert to DataFrame


In [21]:
# Wrap the transformed array in a labelled frame. No index is passed, so
# the frame gets a fresh 0..n-1 RangeIndex.
X_processed_df = pd.DataFrame(data=X_processed, columns=all_features)

## Add target back

In [22]:
# y still carries the index labels left over after de-duplication, while
# X_processed_df has a fresh RangeIndex; renumber y so the two line up
# row-for-row before placing them side by side.
processed_df = pd.concat(
    objs=[X_processed_df, y.reset_index(drop=True)],
    axis="columns",
)

In [23]:
# Preview the first five rows, then report the frame's (rows, columns) shape.
print(processed_df.head(n=5))
print("\nFinal shape:", processed_df.shape)

        age  trestbps      chol   thalach   oldpeak  sex_1  cp_1  cp_2  cp_3  \
0 -0.267966 -0.376556 -0.667728  0.806035 -0.037124    1.0   0.0   0.0   0.0   
1 -0.157260  0.478910 -0.841918  0.237495  1.773958    1.0   0.0   0.0   0.0   
2  1.724733  0.764066 -1.403197 -1.074521  1.342748    1.0   0.0   0.0   0.0   
3  0.728383  0.935159 -0.841918  0.499898 -0.899544    1.0   0.0   0.0   0.0   
4  0.839089  0.364848  0.919336 -1.905464  0.739054    0.0   0.0   0.0   0.0   

   fbs_1  ...  slope_1  slope_2  ca_1  ca_2  ca_3  ca_4  thal_1  thal_2  \
0    0.0  ...      0.0      1.0   0.0   1.0   0.0   0.0     0.0     0.0   
1    1.0  ...      0.0      0.0   0.0   0.0   0.0   0.0     0.0     0.0   
2    0.0  ...      0.0      0.0   0.0   0.0   0.0   0.0     0.0     0.0   
3    0.0  ...      0.0      1.0   1.0   0.0   0.0   0.0     0.0     0.0   
4    1.0  ...      1.0      0.0   0.0   0.0   1.0   0.0     0.0     1.0   

   thal_3  target  
0     1.0       0  
1     1.0       0  
2     1.