In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib.backends.backend_pdf import PdfPages

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable project/data directory so the notebook runs elsewhere.
data = pd.read_csv("C:/Users/adesh/Heart-Disease-Prediction-Using-Random-Forest/data/heart.csv", encoding='utf-8')

# Path for the PDF file: a 'results' folder one level above the working directory
pdf_path = os.path.join("..", "results", "visualization.pdf")

# Make sure the output directory exists before PdfPages tries to write into it;
# otherwise a fresh checkout without '../results' fails with FileNotFoundError.
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

# Basic stats (kept outside the PDF context: they only print to stdout)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
data.info()
print(data.describe())
print(data.isnull().sum())


def _save_page(pdf, fig, **savefig_kwargs):
    """Append `fig` as a new page of `pdf`, then close it to release memory."""
    pdf.savefig(fig, **savefig_kwargs)
    plt.close(fig)


# Create the PDF for saving visualizations (one figure per page)
with PdfPages(pdf_path) as pdf:

    # Correlation heatmap
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title("Correlation Heatmap")
    _save_page(pdf, fig)

    # Distribution of target variable
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='target', data=data, hue='target', palette="Set2", legend=False, ax=ax)
    ax.set_title("Heart Disease Presence (1) vs Absence (0)")
    _save_page(pdf, fig)

    # Boxplot for cholesterol levels by heart disease status
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='target', y='chol', data=data, hue='target', palette="muted", legend=False, ax=ax)
    ax.set_title("Cholesterol Levels by Heart Disease Status")
    _save_page(pdf, fig)

    # Age distribution by target
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=data, x="age", hue="target", multiple="stack", palette="coolwarm", bins=20, ax=ax)
    ax.set_title("Age Distribution by Heart Disease Status")
    _save_page(pdf, fig)

    # Scatter plot for age vs. max heart rate
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x='age', y='thalach', hue='target', data=data, palette="coolwarm", alpha=0.8, ax=ax)
    ax.set_title("Age vs. Max Heart Rate (Thalach)")
    _save_page(pdf, fig)

    # Pair plot for selected features
    # pairplot is a figure-level function: it builds its own figure, so the
    # title and the save must target that figure rather than a fresh one.
    selected_features = ['age', 'chol', 'thalach', 'trestbps', 'target']
    pair_grid = sns.pairplot(data[selected_features], hue="target", palette="coolwarm", diag_kind="kde")
    pair_grid.figure.suptitle("Pair Plot for Key Features", y=1.02)
    # The suptitle sits at y=1.02, i.e. above the canvas; a tight bounding box
    # is needed so it is not cropped from the saved page.
    _save_page(pdf, pair_grid.figure, bbox_inches="tight")

    # Violin plot for cholesterol levels by target
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='target', y='chol', data=data, hue='target', palette="muted", legend=False, ax=ax)
    ax.set_title("Violin Plot of Cholesterol Levels by Heart Disease Status")
    _save_page(pdf, fig)

    # Feature distribution: trestbps
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=data, x="trestbps", hue="target", multiple="stack", palette="coolwarm", bins=20, ax=ax)
    ax.set_title("Blood Pressure Distribution by Heart Disease Status")
    _save_page(pdf, fig)

print(f"All visualizations are saved in '{pdf_path}'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.366337    0.683168    0.966997  131.62376