In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
#Import the dataset and print the path
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirzahasnine/heart-disease-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/heart-disease-dataset


In [4]:
#Check whether it was properly imported
os.listdir(path)

NameError: name 'os' is not defined

In [5]:
# Build the full path to the CSV file
file_path = os.path.join(path, "heart_disease.csv")

# Read the CSV into a pandas DataFrame
df = pd.read_csv(file_path)

# Preview the first 5 rows
df.head()

NameError: name 'os' is not defined

## Data Cleaning Log

### Standardization
- Column names normalized (trimmed, lowercase, spaces to underscores).
- Outcome column renamed to `heart_stroke`.

### Encoding and Types
- Binary columns normalized to 0/1 (`currentSmoker`, `BPMeds`, `prevalentStroke`, `prevalentHyp`, `diabetes`, `heart_stroke`).
- Categorical fields standardized:
  - `gender` → `Male` / `Female`
  - `education` → lowercase categories
- Numeric columns coerced to numeric dtypes (`age`, `cigsPerDay`, `totChol`, `sysBP`, `diaBP`, `BMI`, `heartRate`, `glucose`).

### Duplicates
- Exact duplicate rows removed (removed: 0).

### Missing Values
- Missingness observed (percent): glucose 9.16, education 2.48, bpmeds 1.25, totchol 1.18, cigsperday 0.68, bmi 0.45, heartrate 0.02.
- Imputation rules:
  - `education`: mode
  - `bpmeds`: mode
  - numeric (`glucose`, `totchol`, `bmi`, `heartrate`, `cigsperday`): median
  - integrity rule: `currentsmoker == 0` ⇒ `cigsperday = 0`
- Post imputation missingness: 0 across all columns.

### Outliers
- Plausibility range checks applied; flagged: `sysbp` (1), `totchol` (1).
- Implausible values set to NaN and re imputed (median).



In [6]:
# =========================
# Standardize column names
# =========================
df = df.copy()

df.columns = (
    df.columns
      .astype(str)
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
      .str.replace("__", "_", regex=False)
)

print("Columns after standardization:")
print(df.columns.tolist())

# =========================
#Rename target column to a stable name
# =========================
# Unify any variant of the outcome column into: heart_stroke
target_candidates = [c for c in df.columns if c.replace("_", "") == "heartstroke"]
if len(target_candidates) == 1:
    df = df.rename(columns={target_candidates[0]: "heart_stroke"})
elif "heart_stroke" not in df.columns:
    print("Target column not found. Check df.columns and update the rename logic.")

print("Target column:", "heart_stroke" if "heart_stroke" in df.columns else "NOT FOUND")


# =========================
# Turn the textdata into numericals
# =========================
binary_map = {
    "yes": 1, "no": 0,
    "y": 1, "n": 0,
    "true": 1, "false": 0,
    "1": 1, "0": 0
}

def normalize_binary(series: pd.Series) -> pd.Series:
    if series.dtype == "O":
        s = series.astype(str).str.strip().str.lower()
        out = s.map(binary_map)
        out[series.isna()] = np.nan
        return out
    return series

binary_cols = [
    "currentsmoker",
    "bpmeds",
    "prevalentstroke",
    "prevalenthyp",
    "diabetes",
    "heart_stroke"
]

for col in binary_cols:
    if col in df.columns:
        df[col] = normalize_binary(df[col])

print("Binary columns unique values:")
for col in binary_cols:
    if col in df.columns:
        print(col, ":", pd.unique(df[col]))


# =========================
# Clean all the categorical text fields
# =========================
if "gender" in df.columns:
    df["gender"] = df["gender"].astype(str).str.strip().str.title()
    df.loc[~df["gender"].isin(["Male", "Female"]), "gender"] = np.nan

if "education" in df.columns:
    df["education"] = df["education"].astype(str).str.strip().str.lower()
    df.loc[df["education"].isin(["nan", "none", ""]), "education"] = np.nan

print("Gender unique values:", df["gender"].dropna().unique() if "gender" in df.columns else "N/A")
print("Education unique values:", df["education"].dropna().unique() if "education" in df.columns else "N/A")


# =========================
# Properly clean the numeric fields and convert missing ad invalid to NaN
# =========================
numeric_cols = [
    "age", "cigsperday", "totchol", "sysbp", "diabp",
    "bmi", "heartrate", "glucose"
]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

print("\nNumeric dtypes snapshot:")
print(df[[c for c in numeric_cols if c in df.columns]].dtypes)


# =========================
# Remove exact duplicate rows
# =========================
dup_count = df.duplicated().sum()
df = df.drop_duplicates()

print("Exact duplicates removed:", dup_count)
print("Shape after duplicate removal:", df.shape)


NameError: name 'df' is not defined

In [None]:
# =========================
# Missing value handling
# =========================
df = df.copy()

# Education: mode imputation
if df["education"].isna().any():
    edu_mode = df["education"].mode(dropna=True).iloc[0]
    df["education"] = df["education"].fillna(edu_mode)

# BPMeds: mode imputation (binary)
if df["bpmeds"].isna().any():
    bp_mode = df["bpmeds"].mode(dropna=True).iloc[0]
    df["bpmeds"] = df["bpmeds"].fillna(bp_mode)

# Numeric median imputations
median_impute_cols = ["glucose", "totchol", "bmi", "heartrate", "cigsperday"]
for col in median_impute_cols:
    if col in df.columns and df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

# Smoking integrity rule: non-smokers should have 0 cigs/day
if set(["currentsmoker", "cigsperday"]).issubset(df.columns):
    df.loc[df["currentsmoker"] == 0, "cigsperday"] = 0

# =========================
# Check for missing values
# =========================
missing_after = (df.isna().mean() * 100).sort_values(ascending=False)
missing_after = missing_after[missing_after > 0]
missing_after



In [None]:
# =========================
# Check for outliers
# =========================
ranges = {
    "age": (18, 100),
    "sysbp": (70, 260),
    "diabp": (40, 150),
    "bmi": (10, 70),
    "heartrate": (30, 220),
    "glucose": (40, 400),
    "totchol": (80, 600),
    "cigsperday": (0, 80)
}

outlier_counts = {}
for col, (lo, hi) in ranges.items():
    if col in df.columns:
        outlier_counts[col] = int(((df[col] < lo) | (df[col] > hi)).sum())

outlier_counts

# =========================
# Outlier handling. Values not making sense only
# =========================
ranges = {
    "age": (18, 100),
    "sysbp": (70, 260),
    "diabp": (40, 150),
    "bmi": (10, 70),
    "heartrate": (30, 220),
    "glucose": (40, 400),
    "totchol": (80, 600),
    "cigsperday": (0, 80)
}

# Flag and null implausible values
for col, (lo, hi) in ranges.items():
    if col in df.columns:
        mask = (df[col] < lo) | (df[col] > hi)
        df.loc[mask, col] = np.nan

# Re-impute only the columns affected by outliers (median)
for col in ["sysbp", "totchol"]:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Recheck outliers
outlier_counts = {}
for col, (lo, hi) in ranges.items():
    if col in df.columns:
        outlier_counts[col] = int(((df[col] < lo) | (df[col] > hi)).sum())

outlier_counts


## Feature Engineering

### Derived Features Created
- `age_bucket2`: age grouped into clinically interpretable bands (`<40`, `40–49`, `50–59`, `60+`).
- `bp_category`: blood pressure staging proxy from `sysbp` and `diabp` (`normal`, `elevated`, `stage_1_htn`, `stage_2_htn`).
- `pulse_pressure`: `sysbp - diabp` (arterial stiffness proxy).
- `map`: `diabp + (pulse_pressure / 3)` (mean arterial pressure proxy).
- `bmi_category`: BMI grouped into (`underweight`, `normal`, `overweight`, `obese`).
- Risk flags:
  - `high_chol_flag`: `totchol >= 240`
  - `high_glucose_flag`: `glucose >= 126`
  - `high_bp_flag`: `sysbp >= 140` or `diabp >= 90`
- `risk_score_simple`: composite count of selected cardiometabolic risk indicators.
- Smoking features:
  - `smoker_flag`: mirrors `currentsmoker`
  - `heavy_smoker_flag`: `currentsmoker == 1` and `cigsperday >= 20`

### Integrity Checks
- Validated `sysbp >= diabp` for derived BP features; 1 invalid record was flagged and excluded from pulse pressure/MAP derivation.

### Early Signal Summary (Outcome: `heart_stroke`)
- Strong monotonic increase in outcome rate across:
  - `age_bucket2`: <40 (0.041) → 60+ (0.277)
  - `bp_category`: normal (0.086) → stage_2_htn (0.255)
  - `risk_score_simple`: score 0 (0.081) → score 5 (0.684)
- Higher central tendency in engineered BP metrics for cases:
  - Median `pulse_pressure`: 53 (stroke) vs 46 (no stroke)
  - Median `map`: 103.67 (stroke) vs 96.67 (no stroke)



In [None]:
# =========================
#  Feature engineering
# =========================

# Age buckets (clinically interpretable bands)
df["age_bucket"] = pd.cut(
    df["age"],
    bins=[0, 39, 49, 59, 69, 120],
    labels=["<40", "40-49", "50-59", "60-69", "70+"],
    right=True
)

# Blood pressure category (hypertension staging proxy from sysbp/diabp)
def bp_category(row):
    sbp, dbp = row["sysbp"], row["diabp"]
    if pd.isna(sbp) or pd.isna(dbp):
        return np.nan
    if (sbp < 120) and (dbp < 80):
        return "normal"
    if (120 <= sbp < 130) and (dbp < 80):
        return "elevated"
    if (130 <= sbp < 140) or (80 <= dbp < 90):
        return "stage_1_htn"
    if (sbp >= 140) or (dbp >= 90):
        return "stage_2_htn"
    return np.nan

df["bp_category"] = df.apply(bp_category, axis=1)

# Pulse pressure (arterial stiffness proxy)
df["pulse_pressure"] = df["sysbp"] - df["diabp"]

# Mean arterial pressure (MAP) proxy
df["map"] = df["diabp"] + (df["pulse_pressure"] / 3)

# BMI category (risk grouping proxy)
df["bmi_category"] = pd.cut(
    df["bmi"],
    bins=[0, 18.5, 25, 30, 100],
    labels=["underweight", "normal", "overweight", "obese"],
    right=False
)

# High cholesterol flag (simple risk flag)
df["high_chol_flag"] = (df["totchol"] >= 240).astype(int)

# High glucose flag (proxy for hyperglycemia; threshold commonly used in screening contexts)
df["high_glucose_flag"] = (df["glucose"] >= 126).astype(int)

# High BP flag (proxy using sys/dia thresholds)
df["high_bp_flag"] = ((df["sysbp"] >= 140) | (df["diabp"] >= 90)).astype(int)

# Composite cardiometabolic risk flag (count style)
risk_components = ["prevalenthyp", "diabetes", "high_chol_flag", "high_glucose_flag", "high_bp_flag"]
df["risk_score_simple"] = df[risk_components].sum(axis=1)

# Smoking intensity features
df["smoker_flag"] = df["currentsmoker"].astype(int)
df["heavy_smoker_flag"] = ((df["currentsmoker"] == 1) & (df["cigsperday"] >= 20)).astype(int)

# Quick verification of new feature columns
new_features = [
    "age_bucket", "bp_category", "pulse_pressure", "map", "bmi_category",
    "high_chol_flag", "high_glucose_flag", "high_bp_flag", "risk_score_simple",
    "smoker_flag", "heavy_smoker_flag"
]
print("Engineered features created:", [c for c in new_features if c in df.columns])
df[new_features].head()


In [None]:
# =========================
# BP integrity: sysbp should be >= diabp
# =========================
bp_bad = df["sysbp"] < df["diabp"]
print("Rows with sysbp < diabp:", int(bp_bad.sum()))

# Null derived BP features for invalid BP rows
df.loc[bp_bad, ["pulse_pressure", "map"]] = np.nan

# Recompute derived BP features from valid BP rows
df.loc[~bp_bad, "pulse_pressure"] = df.loc[~bp_bad, "sysbp"] - df.loc[~bp_bad, "diabp"]
df.loc[~bp_bad, "map"] = df.loc[~bp_bad, "diabp"] + (df.loc[~bp_bad, "pulse_pressure"] / 3)

# Quick recheck
print(df["pulse_pressure"].describe())


In [None]:
# =========================
# Engineered feature signal vs target
# =========================

# Age bucket with merged tail
df["age_bucket2"] = pd.cut(
    df["age"],
    bins=[0, 39, 49, 59, 120],
    labels=["<40", "40-49", "50-59", "60+"],
    right=True
)

# Stroke rate by engineered categorical features (silences FutureWarning)
for c in ["age_bucket2", "bp_category", "bmi_category"]:
    print(f"\nHeart stroke rate by {c}:")
    print(df.groupby(c, observed=True)["heart_stroke"].mean().sort_values(ascending=False))

# Stroke rate by composite risk score
print("\nHeart stroke rate by risk_score_simple:")
print(df.groupby("risk_score_simple")["heart_stroke"].mean())

# Median comparison for engineered continuous features
print("\nMedian pulse_pressure and MAP by heart_stroke:")
print(df.groupby("heart_stroke")[["pulse_pressure", "map"]].median())


## Univariate & Multivariate Analysis (by `heart_stroke`)

### Target distribution
- `heart_stroke = 0`: 3594
- `heart_stroke = 1`: 644
- Prevalence: 0.152

### Univariate summaries
**Numeric (central tendency and dispersion)**
- Age: median 49, IQR 14
- SBP: median 128, IQR 27
- DBP: median 82, IQR 14.875
- Total cholesterol: median 234, IQR 56
- BMI: median 25.4, IQR 4.9575
- Glucose: median 78, IQR 13
- Pulse pressure: median 47, IQR 16
- MAP: median 97.33, IQR 17.5

**Categorical (counts)**
- Gender: Female 2419, Male 1819
- Education: uneducated 1825, primaryschool 1253, graduate 687, postgraduate 473
- Age buckets: 40–49 (1660), 50–59 (1333), 60+ (690), <40 (555)
- BP category: stage_1_htn (1804), normal (1033), stage_2_htn (991), elevated (410)
- BMI category: normal (1869), overweight (1773), obese (539), underweight (57)

### Cross tabulations (stroke rate by category)
- Age (`age_bucket2`): <40 (0.041), 40–49 (0.101), 50–59 (0.197), 60+ (0.277)
- Blood pressure (`bp_category`): normal (0.086), elevated (0.124), stage_1_htn (0.139), stage_2_htn (0.255)
- BMI category: normal (0.121), underweight (0.140), overweight (0.171), obese (0.195)
- Gender: Female (0.124), Male (0.189)
- High BP flag: 0 (0.106), 1 (0.234)
- Prevalent hypertension: 0 (0.109), 1 (0.247)
- Diabetes: 0 (0.146), 1 (0.367)
- High glucose flag: 0 (0.146), 1 (0.453) *(small subgroup: 86 total)*
- High cholesterol flag: 0 (0.131), 1 (0.178)
- Heavy smoker flag: 0 (0.140), 1 (0.182)
- Smoker flag: 0 (0.145), 1 (0.159)

### Multivariate cross tabulations (stroke rate)
- `age_bucket2 x bp_category`: stroke rate increases with both age band and BP stage, with the highest rates in `60+` and `stage_2_htn`.
- `gender x risk_score_simple`: stroke rate increases with risk score for both genders; rates are consistently higher among males across risk score levels.


In [None]:
# =========================
# Univariate & multivariate analysis
# =========================

# -------------------------
# 1) Target distribution
# -------------------------
df["heart_stroke"].value_counts().sort_index()


# -------------------------
# 2) Univariate: numeric (central tendency + dispersion)
# -------------------------
num_cols = [
    "age","cigsperday","totchol","sysbp","diabp","bmi","heartrate","glucose",
    "pulse_pressure","map"
]
num_cols = [c for c in num_cols if c in df.columns]

num_summary = df[num_cols].describe().T
num_summary["iqr"] = num_summary["75%"] - num_summary["25%"]
num_summary[["count","mean","std","min","25%","50%","75%","max","iqr"]]


# -------------------------
# 3) Univariate: categorical (frequency tables)
# -------------------------
cat_cols = [
    "gender","education","age_bucket2","bp_category","bmi_category",
    "smoker_flag","heavy_smoker_flag",
    "high_chol_flag","high_glucose_flag","high_bp_flag",
    "prevalentstroke","prevalenthyp","diabetes","bpmeds"
]
cat_cols = [c for c in cat_cols if c in df.columns]

cat_freq = {c: df[c].value_counts(dropna=False) for c in cat_cols}
cat_freq


# -------------------------
# 4) Cross-tabulations by target (counts + stroke rate)
# -------------------------
# Count cross-tabs
crosstabs = {c: pd.crosstab(df[c], df["heart_stroke"], dropna=False) for c in cat_cols}

# Stroke rate by category
stroke_rates = {c: df.groupby(c, observed=True)["heart_stroke"].mean().sort_values(ascending=False) for c in cat_cols}

crosstabs, stroke_rates


## Correlation & Association Analysis

### Correlation matrix (numeric features)
- Spearman correlation was used for the numeric correlation matrix to capture monotonic relationships and reduce sensitivity to skewness and outliers common in clinical variables.
- A clear BP-related correlation cluster is present: `sysbp` and `diabp` are strongly positively correlated, and both are strongly correlated with derived BP metrics (`map`, `pulse_pressure`).
- `map` shows the strongest correlations with blood pressure variables, consistent with its construction from `sysbp` and `diabp`.
- `age` shows a positive association with blood pressure-related measures, aligning with increasing cardiovascular risk with age.
- `cigsperday` shows weak correlations with most numeric clinical measures, suggesting limited linear/monotonic association at the aggregate level.

### Association with the target (`heart_stroke`)
- Visual comparisons indicate higher stroke prevalence among older individuals, supported by the stroke-rate-by-age-band plot and the upward shift in the age boxplot for `heart_stroke = 1`.
- Categorical risk groupings (age bands, BP staging, BMI categories, and composite risk indicators) display meaningful separation in stroke rates across levels.

### Key relationship visualizations
- Stroke rate increases monotonically across age bands (`<40` → `60+`).
- The age distribution for `heart_stroke = 1` is shifted upward relative to `heart_stroke = 0`, indicating age as a strong differentiator between cases and non-cases.



In [5]:
# =========================
# Correlation & Association (target = heart_stroke)
# =========================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Output folder for Kaggle downloads
OUT_DIR = "/kaggle/working/plots"
os.makedirs(OUT_DIR, exist_ok=True)

def save_plot(filename: str):
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, filename), dpi=300, bbox_inches="tight")
    plt.show()


# =========================
# 1) Correlation matrix (numeric features)
# =========================
num_cols = [
    "age","cigsperday","totchol","sysbp","diabp","bmi","heartrate","glucose",
    "pulse_pressure","map"
]
num_cols = [c for c in num_cols if c in df.columns]

corr = df[num_cols].corr(method="spearman")

plt.figure(figsize=(10, 8))
plt.imshow(corr.values)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha="right")
plt.yticks(range(len(corr.index)), corr.index)
plt.colorbar(label="Spearman correlation")
plt.title("Spearman correlation matrix (numeric features)")
save_plot("corr_matrix_spearman.png")

corr


# =========================
# 2) Point-biserial correlation (numeric vs binary target)
# =========================
pb_rows = []
y = df["heart_stroke"].astype(int)

for c in num_cols:
    x = df[c]
    mask = x.notna() & y.notna()
    r, p = stats.pointbiserialr(y[mask], x[mask].astype(float))
    pb_rows.append({"feature": c, "point_biserial_r": r, "p_value": p})

pb_table = pd.DataFrame(pb_rows).sort_values("p_value")
pb_table


# =========================
# 3) Chi-square tests (categorical vs target)
# =========================
cat_cols = [
    "gender","education","age_bucket2","bp_category","bmi_category",
    "smoker_flag","heavy_smoker_flag",
    "high_chol_flag","high_glucose_flag","high_bp_flag",
    "prevalentstroke","prevalenthyp","diabetes","bpmeds"
]
cat_cols = [c for c in cat_cols if c in df.columns]

chi_rows = []
for c in cat_cols:
    ct = pd.crosstab(df[c], df["heart_stroke"], dropna=False)
    chi2, p, dof, expected = stats.chi2_contingency(ct)
    chi_rows.append({"feature": c, "chi2": chi2, "dof": dof, "p_value": p})

chi_table = pd.DataFrame(chi_rows).sort_values("p_value")
chi_table


# =========================
# 4) Visualizations of key relationships
# =========================

# 4.1 Numeric vs target: boxplot for top point-biserial feature
top_num = pb_table.iloc[0]["feature"]
df.boxplot(column=top_num, by="heart_stroke")
plt.title(f"{top_num} by heart_stroke")
plt.suptitle("")
plt.xlabel("heart_stroke (0=no, 1=yes)")
plt.ylabel(top_num)
save_plot(f"boxplot_{top_num}_by_target.png")

# 4.2 Categorical vs target: stroke rate bar chart for top chi-square feature
top_cat = chi_table.iloc[0]["feature"]
stroke_rate = df.groupby(top_cat, observed=True)["heart_stroke"].mean().sort_values(ascending=False)

stroke_rate.plot(kind="bar")
plt.ylabel("stroke rate")
plt.title(f"Stroke rate by {top_cat}")
save_plot(f"stroke_rate_{top_cat}.png")


NameError: name 'df' is not defined