## Use the Heart Disease [dataset](https://github.com/cksajil/DSAIRP25/blob/main/datasets/heart_disease.csv) to answer the following questions

## 1. Find the 5 most important features for predicting the target column

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


# Load the heart-disease data and split it into predictors (X) and label (y).
df = pd.read_csv("heart_disease.csv")
target_col = "target"
y = df[target_col]
X = df.drop(columns=[target_col])

# Fit a random forest; the fixed seed keeps the importance ranking reproducible.
rf = RandomForestClassifier(random_state=42).fit(X, y)

# Pair each importance score with its column name, then keep the five largest.
feature_importance = pd.Series(rf.feature_importances_, index=X.columns)
ranked_importance = feature_importance.sort_values(ascending=False)
top_5 = ranked_importance.iloc[:5]

print("Top 5 Important Features:", top_5, sep="\n")


Top 5 Important Features:
cp         0.134201
thalach    0.120473
ca         0.116755
oldpeak    0.116151
thal       0.097043
dtype: float64


## 2. Apply Box-Cox transformations to the relevant features

In [4]:
from scipy.stats import boxcox

target_col = "target"

# Box-Cox is applied to the predictors only; the label is re-attached afterwards.
X = df.drop(columns=[target_col])
# "number" is pandas' own selector for all numeric dtypes, so this cell does
# not rely on numpy having been imported as `np` somewhere else in the kernel.
X_num = X.select_dtypes(include="number")

# NOTE(review): every numeric column is transformed, including low-cardinality
# coded columns (e.g. binary flags), where a power transform has little meaning.
# Consider restricting this to the continuous features.

X_boxcox = X_num.copy()
lambda_values = {}

for col in X_num.columns:
    min_val = X_num[col].min()

    # Box-Cox is only defined for strictly positive data, so columns that
    # contain zeros or negatives are shifted to start at 1 before transforming.
    if min_val <= 0:
        shifted_data = X_num[col] - min_val + 1
    else:
        shifted_data = X_num[col]

    # With no lambda given, boxcox also returns the fitted lambda for the column.
    transformed_data, lam = boxcox(shifted_data)
    X_boxcox[col] = transformed_data
    lambda_values[col] = lam

# Put the untransformed label back next to the transformed predictors.
X_boxcox[target_col] = df[target_col]

print("Box-Cox Lambda Values:")
for feature, lam in lambda_values.items():
    print(f"{feature}: {lam:.4f}")

print("\nTransformed Data Sample:")
print(X_boxcox.head())


Box-Cox Lambda Values:
age: 1.5268
sex: 3.7508
cp: -0.4294
trestbps: -0.7397
chol: -0.1009
fbs: -9.5812
restecg: -0.0472
thalach: 2.1716
exang: -3.0309
oldpeak: -0.4507
slope: 1.6416
ca: -1.2242
thal: 1.8489

Transformed Data Sample:
          age       sex   cp  trestbps      chol       fbs   restecg  \
0  272.372422  3.322484  0.0  1.313871  4.138423  0.000000  0.681934   
1  280.429390  3.322484  0.0  1.316927  4.113095  0.104235  0.000000   
2  429.185698  3.322484  0.0  1.317822  4.022191  0.000000  0.681934   
3  347.725370  3.322484  0.0  1.318334  4.113095  0.000000  0.681934   
4  356.482692  0.000000  0.0  1.316553  4.325815  0.104235  0.681934   

        thalach     exang   oldpeak     slope        ca      thal  target  
0  31304.528038  0.000000  0.595324  3.089014  0.604008  6.477434       0  
1  26281.499672  0.289564  1.044062  0.000000  0.000000  6.477434       0  
2  16473.093059  0.289564  0.973148  0.000000  0.000000  6.477434       0  
3  28540.973582  0.000000  0.

## 3. Bin the Age column and add the result as a new column to the dataset

In [5]:
# Left-closed age bands: [0, 30), [30, 40), [40, 50), [50, 60), [60, inf).
# The last edge is open-ended so the "60+" label really covers every age from
# 60 upwards; a finite cap of 100 would turn ages of 100 or more into NaN.
bins = [0, 30, 40, 50, 60, float("inf")]
labels = ["<30", "30-40", "40-50", "50-60", "60+"]

# right=False makes each band include its lower edge, so age 40 is "40-50".
df["age_binned"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)

print(df[["age", "age_binned"]].head())


   age age_binned
0   52      50-60
1   53      50-60
2   70        60+
3   61        60+
4   62        60+


## 4. Find the most orthogonal feature to the 'chol' feature

In [6]:
# "number" is pandas' own selector for all numeric dtypes, so this cell does
# not rely on numpy having been imported as `np` somewhere else in the kernel.
numeric_df = df.select_dtypes(include="number")

# Correlation of chol with every other candidate column. chol itself is
# dropped, and so is the label column "target", which is not a feature;
# errors="ignore" keeps this working if the label column is absent.
chol_corr = numeric_df.corr()["chol"].drop(["chol", "target"], errors="ignore")

# "Most orthogonal" is taken as the correlation closest to zero in magnitude.
most_orthogonal_feature = chol_corr.abs().idxmin()
most_orthogonal_value = chol_corr[most_orthogonal_feature]

print("Most orthogonal feature to 'chol':", most_orthogonal_feature)
print("Correlation value:", most_orthogonal_value)


Most orthogonal feature to 'chol': slope
Correlation value: -0.014247867919343115
