In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

# Load dataset
# The CSV location can be overridden with the HEART_DATA_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'HEART_DATA_PATH',
    r'C:\Users\USER\Documents\Heart_Disease Kaggle\Data\heart.csv',
)
df = pd.read_csv(DATA_PATH)

print("="*80)
print("✓ PHASE 3: EXPLORATORY DATA ANALYSIS")
print("="*80)

# Encode categorical for correlation
# Work on a copy so the raw frame `df` keeps its original string categories
# for the plotting cells that follow.
df_corr = df.copy()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']

for col in categorical_cols:
    # Each column gets its own encoder; the resulting integer codes are labels,
    # not magnitudes, so correlations for multi-level categoricals are only a
    # rough indication of association.
    le = LabelEncoder()
    df_corr[col] = le.fit_transform(df_corr[col])

# Correlation with target
# Sorted from most positive to most negative; the target's own entry is kept.
target_corr = df_corr.corr()['HeartDisease'].sort_values(ascending=False)

print(f"\nFeature Correlations with Heart Disease:")
print(target_corr)

print(f"\n✓ Phase 3 started successfully!")


In [None]:
import os

# Create the folder that receives every saved figure (no-op if it is already there).
os.makedirs('visualizations', exist_ok=True)

# Pairwise correlations across all columns of the label-encoded frame.
correlation_matrix = df_corr.corr()

# Annotated heatmap; the diverging palette is centred on zero correlation.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
fig.savefig('visualizations/04_correlation_heatmap.png', dpi=300)
plt.show()


In [None]:
# Compare means
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# The class masks do not depend on the column, so build them once.
no_disease_mask = df['HeartDisease'] == 0
disease_mask = df['HeartDisease'] == 1

for col in numerical_cols:
    no_disease = df.loc[no_disease_mask, col].mean()
    disease = df.loc[disease_mask, col].mean()
    print(f"{col}: No Disease={no_disease:.2f}, Disease={disease:.2f}")

# Box plots
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
panels = axes.ravel()
for ax, col in zip(panels, numerical_cols):
    sns.boxplot(data=df, x='HeartDisease', y=col, ax=ax)
    ax.set_title(f'{col} by HeartDisease')
# The 2x3 grid has more panels than features; hide the leftovers so the saved
# image does not contain an empty set of axes.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('visualizations/05_numerical_vs_target.png', dpi=300)
plt.show()


In [None]:
# Count plots
# Re-declared here so this cell does not depend on the list defined in the
# loading cell.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
panels = axes.ravel()
for ax, col in zip(panels, categorical_cols):
    # One bar group per category level, split by target class.
    sns.countplot(data=df, x=col, hue='HeartDisease', ax=ax)
    ax.set_title(f'{col} by HeartDisease')
# The 2x3 grid has more panels than features; hide the leftovers so the saved
# image does not contain an empty set of axes.
for ax in panels[len(categorical_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('visualizations/06_categorical_vs_target.png', dpi=300)
plt.show()


In [None]:
# Outlier function
def find_outliers_iqr(data):
    """Flag values lying outside the 1.5 * IQR fences of ``data``.

    Parameters
    ----------
    data : object exposing ``.quantile`` and element-wise comparisons
        (the notebook passes a pandas Series).

    Returns
    -------
    Boolean mask aligned with ``data``; True where a value is below
    ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR``.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    spread = upper_quartile - lower_quartile

    below_fence = data < lower_quartile - 1.5*spread
    above_fence = data > upper_quartile + 1.5*spread
    return below_fence | above_fence

# Check outliers
# Compute each column's IQR mask once and reuse it for both the printed
# counts and the scatter plots below.
outlier_masks = {col: find_outliers_iqr(df[col]) for col in numerical_cols}

for col, outliers in outlier_masks.items():
    outlier_count = outliers.sum()
    print(f"{col}: {outlier_count} outliers")

# Visualize
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
panels = axes.ravel()
# x-axis is the row position (0..n-1), independent of the frame's index labels.
positions = np.arange(len(df))
for ax, col in zip(panels, numerical_cols):
    outliers = outlier_masks[col]
    ax.scatter(positions, df[col], alpha=0.5)
    # Index the 1-D position array with the boolean mask so x and y are both
    # plain 1-D sequences of equal length (np.where would hand back a tuple).
    ax.scatter(positions[outliers.to_numpy()], df[col][outliers],
               color='red', label='Outliers')
    ax.set_title(f'{col} - Outliers')
    ax.legend()
# The 2x3 grid has more panels than features; hide the leftovers so the saved
# image does not contain an empty set of axes.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('visualizations/07_outlier_detection.png', dpi=300)
plt.show()


In [None]:
# Calculate statistics
for col in numerical_cols:
    skew = df[col].skew()
    kurt = df[col].kurtosis()
    print(f"{col}: Skewness={skew:.4f}, Kurtosis={kurt:.4f}")

# Distribution plots
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
panels = axes.ravel()
for ax, col in zip(panels, numerical_cols):
    df[col].hist(bins=30, edgecolor='black', ax=ax, alpha=0.7)
    # The histogram shows counts while the KDE is a density, so the curve is
    # drawn against its own secondary y-axis.
    df[col].plot(kind='kde', ax=ax, secondary_y=True)
    ax.set_title(f'{col} Distribution')
# The 2x3 grid has more panels than features; hide the leftovers so the saved
# image does not contain an empty set of axes.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('visualizations/08_distribution_analysis.png', dpi=300)
plt.show()


In [None]:
# Method 1: Mutual Information
from sklearn.feature_selection import mutual_info_classif

X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Encode X for MI
X_encoded = X.copy()
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])

# Mark the label-encoded columns as discrete. With the default
# discrete_features='auto' a dense matrix is treated as all-continuous, which
# would estimate MI for category codes as if they were real-valued.
discrete_mask = X_encoded.columns.isin(categorical_cols)

# random_state fixes the estimator's internal noise so scores are repeatable.
mi_scores = mutual_info_classif(
    X_encoded, y, discrete_features=discrete_mask, random_state=42
)
mi_importance = pd.DataFrame({
    'Feature': X.columns,
    'MI_Score': mi_scores
}).sort_values('MI_Score', ascending=False)

print("\nMutual Information Feature Importance:")
print(mi_importance)

# Visualize
# Draw on an explicit Axes: DataFrame.plot() without `ax` opens its own figure,
# which would leave a blank (10, 6) figure behind and ignore the chosen size.
fig, ax = plt.subplots(figsize=(10, 6))
mi_importance.plot(x='Feature', y='MI_Score', kind='barh', legend=False, ax=ax)
# Rows are sorted descending; invert the axis so the top feature is at the top.
ax.invert_yaxis()
ax.set_title('Feature Importance (Mutual Information)')
ax.set_xlabel('MI Score')
fig.tight_layout()
fig.savefig('visualizations/09_feature_importance.png', dpi=300)
plt.show()


In [None]:
# Hypothesis 1: Age differs by disease
# Two-sample t-test on the ages of the HeartDisease == 0 and == 1 groups.
no_disease_age = df.loc[df['HeartDisease'] == 0, 'Age']
disease_age = df.loc[df['HeartDisease'] == 1, 'Age']
age_test = stats.ttest_ind(no_disease_age, disease_age)
t_stat, p_val = age_test
print(f"H1 - Age difference: p-value = {p_val:.6f}")

# Hypothesis 2: ChestPainType associated with disease
# Chi-square test on the ChestPainType x HeartDisease contingency table.
chest_pain_table = pd.crosstab(df['ChestPainType'], df['HeartDisease'])
chi2, p_val, _, _ = stats.chi2_contingency(chest_pain_table)
print(f"H2 - ChestPainType: p-value = {p_val:.6f}")

# Add more hypotheses for other key features


In [None]:
# Pairplot for top features
# Take the four highest-ranked features from the MI table and append the
# target so it is available as the hue variable.
top_features = mi_importance['Feature'].head(4).tolist() + ['HeartDisease']

pairplot_options = dict(
    hue='HeartDisease',
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
    palette='Set2',
)
sns.pairplot(df[top_features], **pairplot_options)
plt.tight_layout()
plt.savefig('visualizations/10_pairwise_relationships.png', dpi=300)
plt.show()


In [None]:
import os

# Prepare report content safely
# Fall back to placeholder text when the mutual-information cell has not been
# run in this kernel (or produced an empty table).
mi_available = 'mi_importance' in globals()

if mi_available:
    mi_head = mi_importance.head(5).to_string()
else:
    mi_head = 'No MI importance available'

if mi_available and not mi_importance.empty:
    strongest = mi_importance.iloc[0]['Feature']
else:
    strongest = 'N/A'

divider = '=' * 80

report = f"""
PHASE 3 EDA REPORT
{divider}

Top Features by Importance:
{mi_head}

Key Findings:
1. {strongest} is the strongest predictor
2. Multiple features show significant correlation
3. Outliers are within expected ranges
4. Data is ready for preprocessing

Recommendations:
- Use top 5 features as priority
- Encode categorical variables in Phase 4
- Handle zeros in RestingBP and Cholesterol
- Consider feature scaling for distance-based models

Next: Phase 4 - Data Preprocessing
"""

print(report)

# Save report (ensure directory exists and use UTF-8)
os.makedirs('reports', exist_ok=True)
with open('reports/phase3_eda_report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(report)

print('\n✓ Report saved: reports/phase3_eda_report.txt')


# Phase 3 — Exploratory Data Analysis (EDA) Summary

## Overview

This document summarizes the Phase 3 Exploratory Data Analysis (EDA) performed on the Heart Disease dataset (`Data/heart.csv`). The goal of Phase 3 was to understand feature distributions, relationships with the target (`HeartDisease`), detect data quality issues, and surface the most informative features to guide preprocessing and modeling.

## Dataset

- Source file: `Data/heart.csv`
- Total records: see notebook output (printed in `Phase3.ipynb`)
- Target: `HeartDisease` (binary)

## Key Findings

- Target distribution is approximately balanced — good for modeling without heavy class reweighting.
- Some numerical features contain zero values where zeros are likely invalid (e.g., `RestingBP`, `Cholesterol`). Treat these as missing during preprocessing.
- Correlation analysis and mutual information ranking highlight a small set of high-importance features to prioritize during modeling.
- A few numerical features show skew and outliers; consider winsorization or robust scaling before using distance-based models.

## Visualizations (generated)

- `visualizations/01_target_distribution.png` — count and pie chart for `HeartDisease`.
- `visualizations/02_numerical_distributions.png` — histograms for numerical features.
- `visualizations/03_categorical_distributions.png` — bar charts for categorical features.
- `visualizations/04_correlation_heatmap.png` — full feature correlation heatmap.
- `visualizations/05_numerical_vs_target.png` — boxplots of numerical features vs target.
- `visualizations/06_categorical_vs_target.png` — countplots of categorical features split by target.
- `visualizations/07_outlier_detection.png` — scatter + outlier highlights using IQR.
- `visualizations/08_distribution_analysis.png` — distributions and KDEs per numerical column.
- `visualizations/09_feature_importance.png` — mutual information feature importance.
- `visualizations/10_pairwise_relationships.png` — pairwise plots for top features.

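If any of files `04`–`10` are missing, re-run `Notebooks/Phase3.ipynb`; the notebook creates the `visualizations/` folder itself if it does not already exist. Files `01`–`03` are not written by the Phase 3 cells, so regenerate them from the notebook that produces them.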

## Data Quality Issues to Fix (Phase 4)

- Replace or impute zeros in `RestingBP` and `Cholesterol` (suggestion: median imputation grouped by relevant categories, or `KNNImputer`).
- Encode categorical variables (`Sex`, `ChestPainType`, `RestingECG`, `ST_Slope`, `ExerciseAngina`) using one-hot or target encoding depending on modeling choice.
- Address skew in features such as `Cholesterol` and `Oldpeak` with log or power transforms, when appropriate.

## Feature Selection Recommendations

- Use mutual information and correlation results to shortlist the top features (see `mi_importance` in the notebook).
- Consider recursive feature elimination (RFE) or model-based selection (e.g., tree-based feature importance) during Phase 4.

## Reproducibility — How to re-run the analysis

1. Ensure dependencies are installed (see `requirements.txt`).
2. From the workspace root, run the Phase 3 notebook (example using nbconvert):

```powershell
python -m pip install -r requirements.txt
python -m nbconvert --to notebook --execute "Notebooks\Phase3.ipynb" --output "Notebooks\Phase3_executed.ipynb"
```

3. Or open `Notebooks/Phase3.ipynb` in Jupyter / VS Code and run cells interactively.

## Files generated by Phase 3

- `reports/phase3_eda_report.txt` — textual EDA report.
- `visualizations/*` — charts listed above.
