In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
df = pd.read_csv('./heart.csv')
df.head()

In [None]:
TARGET_COL = "HeartDisease"
df.columns = [c.strip() for c in df.columns]

# NaN the abnormal 
for col in ["Cholesterol", "RestingBP"]:
    if col in df.columns:
        df[col] = df[col].replace(0, np.nan)
if "Oldpeak" in df.columns:
    df.loc[df["Oldpeak"] < 0, "Oldpeak"] = np.nan

#for check 
print(f"count of RestiongBP <=0 :  {(df['RestingBP'] <= 0).sum()}")
print(f"count of Cholesterol <=0 :  {(df['Cholesterol'] <= 0).sum()}")
print(f"count of Oldpeak <0 :  {(df['Oldpeak'] < 0).sum()}")

# recognize column type
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != TARGET_COL]
cat_cols = [c for c in df.columns if (not pd.api.types.is_numeric_dtype(df[c])) and c != TARGET_COL]

print("Target:", TARGET_COL)
print("Numeric features:", num_cols)
print("Categorical features:", cat_cols)

count of RestiongBP <=0 :  0
count of Cholesterol <=0 :  0
count of Oldpeak <0 :  0
Target: HeartDisease
Numeric features: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
Categorical features: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [18]:
num_cols.remove('FastingBS')  # it is categorical, remove it from numerical
cat_cols.append('FastingBS')  # add it into categorical column

In [None]:
desc = df[num_cols].describe().T
print("\n[Univariate] Numeric describe:\n", desc)

In [None]:
# numerical column distribution
for c in num_cols:
    plt.figure()
    df[c].dropna().hist(bins=30)
    plt.title(f"Distribution of {c}")
    plt.xlabel(c)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

In [None]:
for c in cat_cols:
    vc = df[c].value_counts(dropna=False)
    print(f"\n[Univariate] Categorical {c} value counts:\n{vc}")
    plt.figure()
    vc.plot(kind="bar")
    plt.title(f"Count of {c}")
    plt.xlabel(c)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
corr_pearson  = df[num_cols].corr(method="pearson")
corr_spearman = df[num_cols].corr(method="spearman")

plt.figure(figsize=(11, 9))
sns.heatmap(corr_pearson, cmap="coolwarm", vmin=-1, vmax=1, square=True, cbar_kws={"shrink": .8})
plt.title("Feature–Feature Correlation (Pearson, numeric-only)")
plt.tight_layout()
plt.show()

plt.figure(figsize=(11, 9))
sns.heatmap(corr_spearman, cmap="coolwarm", vmin=-1, vmax=1, square=True, cbar_kws={"shrink": .8})
plt.title("Feature–Feature Correlation (Spearman, numeric-only)")
plt.tight_layout()
plt.show()

In [26]:
def top_k_pairs(corr_df, k=10):
    c = corr_df.abs().where(~np.eye(corr_df.shape[0], dtype=bool))  # remove diagonal
    pairs = (
        c.unstack()
         .dropna()
         .sort_values(ascending=False)
    )
    # Deduplicate symmetrical pairs (A,B) and (B,A)
    seen = set()
    top = []
    for (a, b), v in pairs.items():
        key = tuple(sorted([a, b]))
        if a == b or key in seen:
            continue
        seen.add(key)
        top.append((a, b, corr_df.loc[a, b], abs(corr_df.loc[a, b])))
        if len(top) >= k:
            break
    return pd.DataFrame(top, columns=["Feature A", "Feature B", "Pearson", "|Pearson|"])

top_pairs = top_k_pairs(corr_pearson, k=12)
print("\n[Top correlated feature pairs by |Pearson|] (numeric-only)")
print(top_pairs.to_string(index=False))


[Top correlated feature pairs by |Pearson|] (numeric-only)
  Feature A   Feature B   Pearson  |Pearson|
        Age       MaxHR -0.382045   0.382045
    Oldpeak         Age  0.263086   0.263086
  RestingBP         Age  0.263084   0.263084
      MaxHR     Oldpeak -0.180993   0.180993
    Oldpeak   RestingBP  0.151950   0.151950
      MaxHR   RestingBP -0.109693   0.109693
Cholesterol   RestingBP  0.095939   0.095939
        Age Cholesterol  0.058758   0.058758
    Oldpeak Cholesterol  0.056824   0.056824
Cholesterol       MaxHR -0.019856   0.019856
