# Exploratory Data Analysis - Heart Disease Dataset

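This notebook performs exploratory data analysis (EDA) on the Heart Disease UCI dataset: a structural overview of the data, the class balance of the target, the distributions of the numerical features, feature correlations, and box plots of the numerical features split by target.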


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category so library warnings do not clutter cell output
warnings.simplefilter('ignore')

# Plot defaults shared by all figures: seaborn grid theme and a 12x8 inch canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})


In [None]:
# Read the heart-disease CSV (path is relative to the notebook's directory),
# report its dimensions and column names, then preview the first rows
df = pd.read_csv('../data/heart_disease.csv')
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {list(df.columns)}",
    sep="\n",
)
df.head()


In [None]:
# Structural overview: column dtypes / non-null counts, per-column missing
# values, and summary statistics (rendered as the cell's rich output)
divider = "="*50

print("Dataset Info:")
print(divider)
df.info()

print("\n" + divider)
print("\nMissing Values:")
print(df.isna().sum())

print("\n" + divider)
print("\nBasic Statistics:")
df.describe()


In [None]:
# Class balance
# value_counts() orders its result by frequency (most common class first), not
# by class label. The bar labels below are a fixed ['No Disease', 'Disease']
# list, so the counts are sorted by label (0 = no disease, 1 = disease) first;
# otherwise the labels and colours are swapped whenever class 1 is the majority.
target_counts = df['target'].value_counts()
class_counts = target_counts.sort_index()

plt.figure(figsize=(8, 6))
plt.bar(['No Disease', 'Disease'], class_counts.values, color=['skyblue', 'salmon'])
plt.title('Class Distribution (Target Variable)', fontsize=16, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Heart Disease Status', fontsize=12)
# Annotate each bar with its count, nudged slightly above the bar top
for i, v in enumerate(class_counts.values):
    plt.text(i, v + 5, str(v), ha='center', fontsize=12)
plt.tight_layout()
plt.savefig('../screenshots/class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Class distribution:\n{target_counts}")
# Label-based lookups: ratio of class 0 (no disease) to class 1 (disease)
print(f"\nClass balance ratio: {class_counts.loc[0] / class_counts.loc[1]:.2f}")


In [None]:
# Histograms for numerical features: one panel per column on a 2x3 grid
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()  # flatten the 2-D grid so panels can be addressed by position

for ax, feature in zip(axes, numerical_cols):
    label = feature.capitalize()
    ax.hist(df[feature], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{label} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Five features on a six-panel grid: drop the unused last panel
fig.delaxes(axes[5])
plt.tight_layout()
plt.savefig('../screenshots/feature_histograms.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Correlation heatmap
# Compute the correlation matrix once and reuse it for both the heatmap and the
# target ranking. Only numeric/boolean columns are passed to corr(): on
# pandas >= 2.0, DataFrame.corr() raises if a non-numeric column is present.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Hide the upper triangle (including the diagonal); it mirrors the lower one
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../screenshots/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Top correlations with target (the target's own 1.0 self-correlation is included)
target_corr = correlation_matrix['target'].sort_values(ascending=False)
print("\nTop correlations with target:")
print(target_corr)


In [None]:
# Box plots for numerical features by target: one panel per column, each split
# into the two target groups. Uses numerical_cols from the histogram cell.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()

for ax, feature in zip(axes, numerical_cols):
    label = feature.capitalize()
    df.boxplot(column=feature, by='target', ax=ax)
    ax.set_title(f'{label} by Heart Disease', fontsize=12, fontweight='bold')
    ax.set_xlabel('Heart Disease (0=No, 1=Yes)')
    ax.set_ylabel(label)
    # Grouped boxplots add a figure-level title; blank it out
    fig.suptitle('')

# Five features on a six-panel grid: drop the unused last panel
fig.delaxes(axes[5])
plt.tight_layout()
plt.savefig('../screenshots/boxplots_by_target.png', dpi=300, bbox_inches='tight')
plt.show()


## EDA Summary

Key findings:
1. Dataset contains 303 samples with 13 features and 1 target variable
2. Class distribution is relatively balanced
3. Some features show strong correlation with the target
4. Any missing values reported in the overview above must be handled before modelling
5. Feature scaling will be important for distance-based algorithms
