In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Quick smoke test: read the CSV and echo a handful of headline numbers
# (dimensions, missing cells, duplicate rows, target counts).
df = pd.read_csv(r"C:\Users\USER\Documents\Heart_Disease Kaggle\Data\heart.csv")

row_total, column_total = df.shape
missing_cells = df.isnull().sum().sum()
repeated_rows = df.duplicated().sum()
divider = "=" * 80

print(divider)
print("✓ PHASE 2: DATA EXPLORATION")
print(divider)
print(f"\nDataset: {row_total} rows × {column_total} columns")
print(f"Missing values: {missing_cells}")
print(f"Duplicates: {repeated_rows}")
print("\nTarget Distribution:")
print(df['HeartDisease'].value_counts())
print("\n✓ Phase 2 started successfully!")

In [None]:
# ==============================================================================
# PHASE 2: DATA COLLECTION & EXPLORATION
# Heart Disease Prediction Project
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Pandas display options: no column cap, at most 100 rows, width left as None.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Plot styling shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Session banner with the wall-clock time at which the run started.
started_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
banner_rule = "=" * 80
print("\n".join([
    banner_rule,
    "PHASE 2: DATA COLLECTION & EXPLORATION",
    "Heart Disease Prediction Project",
    banner_rule,
    f"\nDate: {started_at}",
    "Status: Starting data exploration",
]))


In [None]:
# ==============================================================================
# STEP 1: LOAD THE DATASET
# ==============================================================================
# Reads the heart-disease CSV into `df`, then reports where it was read from,
# its dimensions, and its in-memory footprint.

print("\n" + "="*80)
print("STEP 1: LOADING DATASET")
print("="*80)

# Load the CSV file.
# NOTE(review): this is an absolute, machine-specific path; it is kept unchanged
# so existing runs keep working, but a project-relative path would be portable.
data_path = r"C:\Users\USER\Documents\Heart_Disease Kaggle\Data\heart.csv"
df = pd.read_csv(data_path)

print("\n✓ Dataset loaded successfully!")
print("  File: heart.csv")
# Report the path that was actually read rather than claiming "Current directory".
print(f"  Location: {data_path}")
# df.shape is a (rows, columns) tuple; index it so each count is printed once.
print(f"  Size: {df.shape[0]} rows × {df.shape[1]} columns")

# Save info for later
dataset_shape = df.shape
print(f"\n  Memory usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")


In [None]:
# ==============================================================================
# STEP 2: INITIAL DATASET INSPECTION
# ==============================================================================
# Shows both ends of the frame, then lists column names and dtypes.

major_rule = "=" * 80
minor_rule = "-" * 80

print("\n" + major_rule)
print("STEP 2: DATASET OVERVIEW")
print(major_rule)

# Head and tail previews share the same caption-then-frame layout.
for caption, preview in (
    ("\nFirst 5 rows of the dataset:", df.head()),
    ("\nLast 5 rows of the dataset:", df.tail()),
):
    print(caption)
    print(preview)

# Numbered list of column names
print("\n" + minor_rule)
print("COLUMN INFORMATION")
print(minor_rule)
print(f"\nColumn Names ({len(df.columns)} total):")
for i, col in enumerate(df.columns, start=1):
    print(f"  {i:2d}. {col}")

# dtype of every column
print("\n" + minor_rule)
print("DATA TYPES")
print(minor_rule)
print(df.dtypes)


In [None]:
# ==============================================================================
# STEP 3: DATA QUALITY CHECK
# ==============================================================================
# Reports missing values per column, duplicated rows, and overall dimensions.

heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("STEP 3: DATA QUALITY CHECK")
print(heavy_rule)

# --- Missing values ---------------------------------------------------------
print("\n" + light_rule)
print("MISSING VALUES")
print(light_rule)

missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100

# One row per column, then keep only the columns that actually have gaps.
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_data.values,
    'Missing_Percent': missing_percent.values
}).loc[lambda table: table['Missing_Count'] > 0]

if missing_summary.empty:
    print("\n✓ No missing values found in the dataset!")
else:
    print("\nColumns with missing values:")
    print(missing_summary.to_string(index=False))

# --- Duplicates -------------------------------------------------------------
print("\n" + light_rule)
print("DUPLICATE ROWS")
print(light_rule)

duplicate_count = df.duplicated().sum()
print(f"\nTotal duplicate rows: {duplicate_count}")

if duplicate_count == 0:
    print("✓ No duplicate rows found!")
else:
    print(f"! Found {duplicate_count} duplicate rows")
    print("\nFirst few duplicates:")
    # keep=False marks every member of a duplicated group, not just the repeats.
    every_duplicate = df.duplicated(keep=False)
    print(df[every_duplicate].head())

# --- Dimensions -------------------------------------------------------------
print("\n" + light_rule)
print("DATASET DIMENSIONS")
print(light_rule)
row_count, column_count = df.shape
print(f"\nTotal rows: {row_count}")
print(f"Total columns: {column_count}")
print(f"Total cells: {row_count * column_count:,}")

In [None]:
# ==============================================================================
# STEP 4: DESCRIPTIVE STATISTICS
# ==============================================================================
# Prints pandas' describe() table, then a per-column breakdown for every
# numeric column (location, spread, quartiles, skewness, kurtosis).

thick_rule = "=" * 80
thin_rule = "-" * 80

print("\n" + thick_rule)
print("STEP 4: DESCRIPTIVE STATISTICS")
print(thick_rule)

# Summary table
print("\n" + thin_rule)
print("NUMERICAL FEATURES SUMMARY")
print(thin_rule)
print("\nDescriptive Statistics:")
print(df.describe())

# Column-by-column detail
print("\n" + thin_rule)
print("DETAILED STATISTICS")
print(thin_rule)

for col in df.select_dtypes(include=[np.number]).columns:
    column_values = df[col]
    lower_quartile = column_values.quantile(0.25)
    upper_quartile = column_values.quantile(0.75)

    # (prefix, value) pairs; every value is rendered with four decimals.
    four_decimal_rows = (
        ("  Mean:      ", column_values.mean()),
        ("  Median:    ", column_values.median()),
        ("  Std Dev:   ", column_values.std()),
        ("  Min:       ", column_values.min()),
        ("  Max:       ", column_values.max()),
        ("  Q1 (25%):  ", lower_quartile),
        ("  Q3 (75%):  ", upper_quartile),
        ("  IQR:       ", upper_quartile - lower_quartile),
        ("  Skewness:  ", column_values.skew()),
        ("  Kurtosis:  ", column_values.kurtosis()),
    )

    print(f"\n{col}:")
    print(f"  Count:     {column_values.count()}")
    for prefix, value in four_decimal_rows:
        print(f"{prefix}{value:.4f}")


In [None]:
# ==============================================================================
# STEP 5: CATEGORICAL FEATURES ANALYSIS
# ==============================================================================
# For each object-dtype column: distinct values, raw counts and percentages.

step_rule = "=" * 80
print("\n" + step_rule)
print("STEP 5: CATEGORICAL FEATURES ANALYSIS")
print(step_rule)

# Object-dtype columns are treated as the categorical features.
categorical_cols = df.select_dtypes(include=['object']).columns

print(f"\nCategorical columns ({len(categorical_cols)}):")
for col in categorical_cols:
    category_values = df[col]
    absolute_counts = category_values.value_counts()
    percentage_counts = category_values.value_counts(normalize=True) * 100

    print(f"\n{col}:")
    print(f"  Unique values: {category_values.nunique()}")
    print(f"  Values: {category_values.unique().tolist()}")
    print("  Value counts:")
    print(absolute_counts)
    print("  Distribution (%):")
    print(percentage_counts)


In [None]:
# ==============================================================================
# STEP 6: TARGET VARIABLE ANALYSIS
# ==============================================================================
# Prints class counts and percentages for the target column and grades the
# class balance from the majority/minority ratio.

wide_rule = "=" * 80
dash_rule = "-" * 80

print("\n" + wide_rule)
print("STEP 6: TARGET VARIABLE ANALYSIS (HeartDisease)")
print(wide_rule)

target = 'HeartDisease'
target_values = df[target]

print(f"\n{target} Distribution:")
print(dash_rule)

# Counts ordered by class label rather than by frequency
value_counts = target_values.value_counts().sort_index()
print("\nAbsolute counts:")
print(value_counts)

target_dist = target_values.value_counts(normalize=True) * 100
print("\nPercentage distribution:")
print(target_dist.sort_index())

# Per-class totals; a class that never occurs counts as 0.
no_disease_count = value_counts.get(0, 0)
disease_count = value_counts.get(1, 0)
total = len(df)

print("\n" + dash_rule)
print("TARGET VARIABLE BREAKDOWN")
print(dash_rule)
print(f"\nNo Heart Disease (0): {no_disease_count} ({no_disease_count/total*100:.2f}%)")
print(f"Heart Disease (1):    {disease_count} ({disease_count/total*100:.2f}%)")
print(f"Total:                {total}")

# The ratio is only defined when both classes are present.
if no_disease_count > 0 and disease_count > 0:
    majority = max(no_disease_count, disease_count)
    minority = min(no_disease_count, disease_count)
    ratio = majority / minority
    print(f"\nClass imbalance ratio: {ratio:.2f}:1")

    # The first upper bound the ratio falls under selects the message.
    graded_messages = (
        (1.1, "✓ Classes are well-balanced (excellent for modeling)"),
        (1.5, "✓ Classes are reasonably balanced"),
        (2.0, "! Classes have slight imbalance (manageable)"),
    )
    verdict = next(
        (message for upper_bound, message in graded_messages if ratio < upper_bound),
        "! Classes have significant imbalance (requires attention)",
    )
    print(verdict)

# Visualization
print("\nTarget distribution plot saved below...")


In [None]:
# ==============================================================================
# STEP 7: CHECK FOR PROBLEMATIC VALUES
# ==============================================================================
# Counts zeros and negative values in every numeric column and flags the
# columns where a zero is treated as a likely missing value.

banner_line = "=" * 80
section_line = "-" * 80

print("\n" + banner_line)
print("STEP 7: CHECKING FOR PROBLEMATIC VALUES")
print(banner_line)

numeric_cols = df.select_dtypes(include=[np.number]).columns

print("\nZero value counts in numerical features:")
print(section_line)

# Columns in which this notebook treats a zero as a likely missing value.
zero_means_missing = ('RestingBP', 'Cholesterol')
zeros_per_column = (df[numeric_cols] == 0).sum()

for col in numeric_cols:
    zero_count = zeros_per_column[col]
    zero_percent = (zero_count / len(df)) * 100

    if zero_count == 0:
        continue

    print(f"\n{col}:")
    print(f"  Zero values: {zero_count} ({zero_percent:.2f}%)")
    if col in zero_means_missing:
        print(f"  ⚠️  WARNING: {col} has {zero_count} zeros - likely missing values!")
    else:
        print(f"  Note: {zero_count} zeros in {col}")

# Negative values
print("\n" + section_line)
print("NEGATIVE VALUES CHECK")
print(section_line)

negatives_per_column = (df[numeric_cols] < 0).sum()
negative_found = False
for col in numeric_cols:
    neg_count = negatives_per_column[col]
    if neg_count == 0:
        continue
    negative_found = True
    print(f"\n{col}: {neg_count} negative values")
    print(f"  Range: {df[col].min()} to {df[col].max()}")

if not negative_found:
    print("\n✓ No negative values found (expected)")


In [None]:
# ==============================================================================
# STEP 8: FEATURE DICTIONARY & SUMMARY
# ==============================================================================
# Builds `feature_summary`, a dict keyed by column name whose values are small
# dicts of descriptive facts, then prints it. The per-type helpers below replace
# twelve near-identical dict literals.

print("\n" + "="*80)
print("STEP 8: COMPLETE FEATURE SUMMARY")
print("="*80)


def _mode_or_na(series):
    """Return the most frequent value of `series`, or "N/A" if it has none.

    Series.mode() returns a Series (there can be ties); only the first entry is
    reported so the summary prints a scalar instead of a whole Series repr.
    """
    modes = series.mode()
    return modes.iloc[0] if len(modes) > 0 else "N/A"


def _sorted_values(series):
    """Return the distinct non-null values of `series` as a sorted list."""
    # NaN is dropped so sorting cannot fail on mixed str/float values; this
    # also matches nunique(), which ignores NaN by default.
    return sorted(series.dropna().unique().tolist())


def _numeric_summary(series, description, flag_zeros=False):
    """Summarise a numeric column: min, max, mean and number of zeros.

    When `flag_zeros` is true a 'Status' entry is added that warns if any zero
    is present (used for columns where zero is not a plausible measurement).
    """
    zero_total = (series == 0).sum()
    summary = {
        'Type': 'Numerical',
        'Description': description,
        'Min': series.min(),
        'Max': series.max(),
        'Mean': series.mean(),
        'Zeros': zero_total,
    }
    if flag_zeros:
        summary['Status'] = '⚠️ Has zeros' if zero_total > 0 else '✓'
    return summary


def _categorical_summary(series, description):
    """Summarise a categorical column: cardinality, values and modal value."""
    return {
        'Type': 'Categorical',
        'Description': description,
        'Unique_Values': series.nunique(),
        'Values': _sorted_values(series),
        'Mode': _mode_or_na(series),
    }


def _binary_summary(series, description, type_label='Binary'):
    """Summarise a two-valued column: cardinality, values and value counts."""
    return {
        'Type': type_label,
        'Description': description,
        'Unique_Values': series.nunique(),
        'Values': _sorted_values(series),
        'Distribution': series.value_counts().to_dict(),
    }


# Target summary plus an explicit "<class 0 count> vs <class 1 count>" string.
heart_disease_counts = df['HeartDisease'].value_counts()
heart_disease_summary = _binary_summary(
    df['HeartDisease'], 'Presence of heart disease', type_label='Binary (TARGET)'
)
heart_disease_summary['Class_Balance'] = (
    f"{heart_disease_counts.get(0, 0)} vs {heart_disease_counts.get(1, 0)}"
)

# Insertion order below is the order in which features are printed.
feature_summary = {
    'Age': _numeric_summary(df['Age'], 'Patient age in years'),
    'Sex': _categorical_summary(df['Sex'], 'Gender (M/F)'),
    'ChestPainType': _categorical_summary(df['ChestPainType'], 'Type of chest pain'),
    'RestingBP': _numeric_summary(
        df['RestingBP'], 'Resting blood pressure (mm Hg)', flag_zeros=True
    ),
    'Cholesterol': _numeric_summary(
        df['Cholesterol'], 'Serum cholesterol (mg/dl)', flag_zeros=True
    ),
    'FastingBS': _binary_summary(df['FastingBS'], 'Fasting blood sugar > 120 mg/dl'),
    'RestingECG': _categorical_summary(df['RestingECG'], 'Resting ECG results'),
    'MaxHR': _numeric_summary(df['MaxHR'], 'Maximum heart rate achieved (bpm)'),
    'ExerciseAngina': _binary_summary(df['ExerciseAngina'], 'Exercise-induced angina'),
    'Oldpeak': _numeric_summary(df['Oldpeak'], 'ST depression from baseline (mm)'),
    'ST_Slope': _categorical_summary(df['ST_Slope'], 'Slope of ST segment'),
    'HeartDisease': heart_disease_summary,
}

# Print summary
for feature, info in feature_summary.items():
    print(f"\n{feature}:")
    for key, value in info.items():
        print(f"  {key}: {value}")


In [None]:
# ==============================================================================
# STEP 9: EXPLORATORY VISUALIZATIONS
# ==============================================================================
# Saves three figures to the working directory: the target distribution,
# histograms of the numeric features, and bar charts of the categorical ones.

print("\n" + "="*80)
print("STEP 9: CREATING VISUALIZATIONS")
print("="*80)

# 1. Target Variable Distribution
class_labels = ['No Disease', 'Disease']
class_colors = ['#2ecc71', '#e74c3c']

# value_counts() orders by frequency, so the first entry is whichever class is
# more common. Reindex to [0, 1] so counts always line up with the hard-coded
# labels and colours above (a class that never occurs is plotted as 0).
target_class_counts = df['HeartDisease'].value_counts().reindex([0, 1], fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
target_class_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Heart Disease Distribution (Count)', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Heart Disease (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(class_labels, rotation=0)

# Pie chart
target_class_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%',
                         colors=class_colors, labels=class_labels)
axes[1].set_title('Heart Disease Distribution (%)', fontsize=12, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.savefig('01_target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Target distribution plot saved: 01_target_distribution.png")

# 2. Numerical Features Distribution (exclude target variable)
numerical_features = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'HeartDisease']

# Size the grid from the feature count (ceiling division) so extra numeric
# columns cannot index past the last axis; 6 features still give a 3x2 grid.
numeric_grid_cols = 2
numeric_grid_rows = max(1, -(-len(numerical_features) // numeric_grid_cols))
fig, axes = plt.subplots(numeric_grid_rows, numeric_grid_cols,
                         figsize=(14, 4 * numeric_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.hist(df[col], bins=30, color='steelblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Remove extra subplots
for ax in axes[len(numerical_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.savefig('02_numerical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Numerical distributions plot saved: 02_numerical_distributions.png")

# 3. Categorical Features Distribution
categorical_features = df.select_dtypes(include=['object']).columns

# Same sizing rule with three columns; 5 features still give a 2x3 grid.
categorical_grid_cols = 3
categorical_grid_rows = max(1, -(-len(categorical_features) // categorical_grid_cols))
fig, axes = plt.subplots(categorical_grid_rows, categorical_grid_cols,
                         figsize=(15, 5 * categorical_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, categorical_features):
    df[col].value_counts().plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Remove extra subplots so no empty frame is left in the saved figure.
for ax in axes[len(categorical_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.savefig('03_categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Categorical distributions plot saved: 03_categorical_distributions.png")


In [None]:
# ==============================================================================
# STEP 10: GENERATE DATA QUALITY REPORT
# ==============================================================================
# Builds a plain-text report from the loaded frame, prints it, and writes it to
# reports/phase2_data_exploration_report.txt. Status lines (missing values,
# duplicates, zeros, class balance) are derived from the data rather than
# hard-coded, so the report cannot contradict what the data actually contains.

import os

print("\n" + "="*80)
print("STEP 10: DATA QUALITY REPORT")
print("="*80)

rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# --- Data quality figures ---------------------------------------------------
n_rows, n_cols = df.shape
total = len(df)
total_cells = n_rows * n_cols
missing_total = int(df.isnull().sum().sum())
# Guard the divisions below so an empty frame yields 0% instead of an error.
missing_pct = missing_total / total_cells * 100 if total_cells else 0.0
duplicate_total = int(df.duplicated().sum())
restingbp_zeros = int((df['RestingBP'] == 0).sum())
cholesterol_zeros = int((df['Cholesterol'] == 0).sum())
has_suspect_zeros = restingbp_zeros > 0 or cholesterol_zeros > 0

# --- Target balance ---------------------------------------------------------
target_counts = df['HeartDisease'].value_counts().sort_index()
no_disease = target_counts.get(0, 0)
disease = target_counts.get(1, 0)
no_disease_pct = no_disease / total * 100 if total else 0.0
disease_pct = disease / total * 100 if total else 0.0

# Thresholds mirror the class-balance assessment printed in Step 6.
if no_disease > 0 and disease > 0:
    balance_ratio = max(no_disease, disease) / min(no_disease, disease)
    if balance_ratio < 1.1:
        balance_tag, balance_text = "[EXCELLENT]", "well-balanced"
    elif balance_ratio < 1.5:
        balance_tag, balance_text = "[OK]", "reasonably balanced"
    elif balance_ratio < 2.0:
        balance_tag, balance_text = "[WARNING]", "slightly imbalanced (manageable)"
    else:
        balance_tag, balance_text = "[WARNING]", "significantly imbalanced (requires attention)"
    balance_note = f" ({balance_ratio:.2f}:1)"
else:
    # The ratio is undefined when a class has no records.
    balance_tag, balance_text, balance_note = "[WARNING]", "missing one class", ""

# --- Key findings, each derived from the figures above -----------------------
if missing_total == 0:
    finding_missing = "[OK] Dataset is clean with no missing values"
else:
    finding_missing = f"[WARNING] Dataset contains {missing_total} missing values"

if duplicate_total == 0:
    finding_duplicates = "[OK] No duplicate records found"
else:
    finding_duplicates = f"[WARNING] {duplicate_total} duplicate records found"

finding_balance = f"{balance_tag} Target variable is {balance_text}{balance_note}"

if has_suspect_zeros:
    finding_zeros = "[WARNING] RestingBP and Cholesterol contain zeros (likely missing values)"
    zero_action = "Handle zeros in preprocessing phase"
else:
    finding_zeros = "[OK] No zeros found in RestingBP or Cholesterol"
    zero_action = "None"

report = f"""
PHASE 2: DATA EXPLORATION REPORT
Heart Disease Prediction Project

DATASET SUMMARY
{rule}
Total Records:              {n_rows:,}
Total Features:             {n_cols}
Total Cells:                {total_cells:,}
Dataset Size:               {df.memory_usage(deep=True).sum() / 1024:.2f} KB

FEATURE BREAKDOWN
{rule}
Numerical Features:         {len(df.select_dtypes(include=[np.number]).columns)} columns
Categorical Features:       {len(df.select_dtypes(include=['object']).columns)} columns
Binary Features:            2 (FastingBS, ExerciseAngina)
Target Variable:            1 (HeartDisease)

DATA QUALITY
{rule}
Missing Values:             {missing_total} ({missing_pct:.2f}%)
Duplicate Rows:             {duplicate_total}
Data Type Issues:           None detected [OK]

PROBLEMATIC VALUES DETECTED
{rule}
RestingBP with zeros:       {restingbp_zeros} records {"[WARNING]" if restingbp_zeros > 0 else "[OK]"}
Cholesterol with zeros:     {cholesterol_zeros} records {"[WARNING]" if cholesterol_zeros > 0 else "[OK]"}
Action Required:            {zero_action}

TARGET VARIABLE ANALYSIS
{rule}
"""

report += f"""No Heart Disease (0):       {no_disease} ({no_disease_pct:.2f}%)
Heart Disease (1):          {disease} ({disease_pct:.2f}%)
Class Balance Status:       {balance_tag} {balance_text.capitalize()}{balance_note}

NUMERICAL FEATURES SUMMARY
{rule}
"""

# Add numerical stats
for col in df.select_dtypes(include=[np.number]).columns:
    report += f"\n{col:20} Mean: {df[col].mean():8.2f}  Std: {df[col].std():8.2f}  Range: [{df[col].min():8.2f}, {df[col].max():8.2f}]"

report += f"""

CATEGORICAL FEATURES SUMMARY
{rule}
"""

for col in df.select_dtypes(include=['object']).columns:
    # mode() can be empty (e.g. an all-null column); fall back to "N/A".
    column_modes = df[col].mode()
    mode_val = column_modes[0] if len(column_modes) > 0 else "N/A"
    report += f"\n{col:20} Unique: {df[col].nunique():3}  Mode: {mode_val}"

report += f"""

KEY FINDINGS
{rule}
1. {finding_missing}
2. {finding_duplicates}
3. {finding_balance}
4. {finding_zeros}
5. [OK] All feature types are correctly identified
6. [OK] Ready for preprocessing phase

RECOMMENDATIONS
{rule}
Phase 3 Actions:
  1. Create comprehensive EDA with visualizations
  2. Analyze feature correlations
  3. Identify outliers and distributions
  4. Plan feature engineering

Phase 4 Actions:
  1. Handle zeros in RestingBP and Cholesterol
  2. Encode categorical variables
  3. Scale numerical features
  4. Split train-test data

PHASE 2 STATUS: COMPLETE
{rule}
Ready for Phase 3: Exploratory Data Analysis (EDA)
"""

print(report)

# Save report (UTF-8 because the text contains non-ASCII rule characters).
os.makedirs('reports', exist_ok=True)
with open('reports/phase2_data_exploration_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("\n✓ Report saved: reports/phase2_data_exploration_report.txt")


In [None]:
# ==============================================================================
# STEP 11: SAVE YOUR WORK
# ==============================================================================
# Writes a short session summary to reports/phase2_session_summary.txt. The
# quality-check and class-balance lines are computed from `df` so the summary
# reflects the data that was actually loaded.

import os

print("\n" + "="*80)
print("STEP 11: SAVING PHASE 2 OUTPUTS")
print("="*80)

# Save raw data to data folder (if not already there)
# df.to_csv('data/heart_raw.csv', index=False)
# print("✓ Raw data saved: data/heart_raw.csv")

# --- Quality-check results ----------------------------------------------------
summary_missing_total = int(df.isnull().sum().sum())
summary_duplicate_total = int(df.duplicated().sum())

if summary_missing_total == 0:
    missing_status = "[OK] PASS (None)"
else:
    missing_status = f"[WARNING] FAIL ({summary_missing_total} missing values)"

if summary_duplicate_total == 0:
    duplicate_status = "[OK] PASS (None)"
else:
    duplicate_status = f"[WARNING] FAIL ({summary_duplicate_total} duplicate rows)"

# --- Class balance (same thresholds as the Step 6 assessment) -----------------
summary_class_counts = df['HeartDisease'].value_counts()
summary_negatives = summary_class_counts.get(0, 0)
summary_positives = summary_class_counts.get(1, 0)

if summary_negatives > 0 and summary_positives > 0:
    summary_ratio = max(summary_negatives, summary_positives) / min(summary_negatives, summary_positives)
    if summary_ratio < 1.1:
        distribution_text, balance_grade = "Well-balanced", "[EXCELLENT]"
    elif summary_ratio < 1.5:
        distribution_text, balance_grade = "Reasonably balanced", "[OK]"
    elif summary_ratio < 2.0:
        distribution_text, balance_grade = "Slight imbalance", "[WARNING]"
    else:
        distribution_text, balance_grade = "Significant imbalance", "[WARNING]"
    distribution_text += f" ({summary_ratio:.2f}:1)"
else:
    # The ratio is undefined when one class has no records.
    distribution_text, balance_grade = "Only one class present", "[WARNING]"

# Create a session summary
session_summary = f"""
PHASE 2 SESSION SUMMARY
================================================================================
Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Status: Phase 2 Complete [OK]

DATASET LOADED
  File: heart.csv
  Rows: {df.shape[0]}
  Columns: {df.shape[1]}

DATA QUALITY CHECKS
  Missing Values: {missing_status}
  Duplicates: {duplicate_status}
  Data Types: [OK] PASS (Correct)

TARGET VARIABLE
  HeartDisease Distribution: {distribution_text}
  Class Balance: {balance_grade}

VISUALIZATIONS CREATED
  01_target_distribution.png
  02_numerical_distributions.png
  03_categorical_distributions.png

FILES GENERATED
  reports/phase2_data_exploration_report.txt

READY FOR
  Phase 3: Exploratory Data Analysis (EDA)

NEXT STEPS
  1. Review visualizations created in this phase
  2. Create phase3_eda_analysis.ipynb
  3. Perform detailed feature analysis
  4. Identify patterns and relationships
  5. Document insights for modeling
"""

# Create the output directory here too, so this cell does not depend on the
# report cell having been run first.
os.makedirs('reports', exist_ok=True)
with open('reports/phase2_session_summary.txt', 'w', encoding='utf-8') as f:
    f.write(session_summary)

print("✓ Session summary saved: reports/phase2_session_summary.txt")

print("\n" + "="*80)
print("PHASE 2 COMPLETE! [OK]")
print("="*80)
print("\nGenerated Files:")
print("  • 01_target_distribution.png")
print("  • 02_numerical_distributions.png")
print("  • 03_categorical_distributions.png")
print("  • reports/phase2_data_exploration_report.txt")
print("  • reports/phase2_session_summary.txt")
print("\nReady to proceed to Phase 3: Exploratory Data Analysis")


# Phase 2 — Data Collection & Exploration — README

Summary
-------
This document describes Phase 2 of the Heart Disease prediction project: Data Collection & Exploration. The primary notebook is `Notebooks/Phase2.ipynb`. Use this README to reproduce the Phase 2 outputs, find generated artifacts, and follow recommended next steps.

Quick start
-----------
1. Activate your project environment (use the project's virtual environment).
2. Install dependencies:

```powershell
python -m pip install -r requirements.txt
```

3. Run Phase 2 notebook interactively, or execute headless:

```powershell
python -m nbconvert --to notebook --execute "Notebooks\Phase2.ipynb" --output "Notebooks\Phase2_executed.ipynb"
```

What the notebook does
----------------------
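- Loads `Data/heart.csv` and prints a dataset summary and basic statistics. Note: the notebook currently reads the file from a hard-coded absolute Windows path, so update the `pd.read_csv(...)` path to match your machine before running.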
- Runs data quality checks (missing values, duplicates, problematic zeros and negatives).
- Produces descriptive statistics, distribution plots, and categorical summaries.
- Generates visualizations and saves a textual data-exploration report to `reports/phase2_data_exploration_report.txt`.

Where outputs are saved
----------------------
- Visualizations: saved as `01_target_distribution.png`, `02_numerical_distributions.png`, and `03_categorical_distributions.png` in the notebook's working directory.
- Reports: `reports/phase2_data_exploration_report.txt` and `reports/phase2_session_summary.txt`.

Notes and troubleshooting
-------------------------
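- The notebook creates the `reports/` directory before writing the data-exploration report; plots are written to the working directory. If a cell fails because an output directory does not exist (for example, cells were run out of order, or you modified the notebook to save plots under `visualizations/`), create the directories manually: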

```powershell
mkdir visualizations
mkdir reports
```

- On Windows, file write encoding defaults can cause errors for special characters; notebooks explicitly write reports using UTF-8 to avoid this.

Recommendations (next steps)
---------------------------
1. Address data-quality issues discovered in Phase 2:
   - Impute or handle zeros in `RestingBP` and `Cholesterol` (treat as missing).
   - Remove or flag duplicates if found.
2. Prepare a preprocessing plan in Phase 3:
   - Encode categorical features (`Sex`, `ChestPainType`, `RestingECG`, `ST_Slope`, `ExerciseAngina`).
   - Scale/transform skewed numerical features (e.g., `Cholesterol`, `Oldpeak`).
3. Split the data into train/validation/test with stratification on `HeartDisease` and a fixed random seed.
4. Run baseline models to validate predictive signal.

Files to inspect
----------------
- `Notebooks/Phase2.ipynb` — main data exploration notebook.
- `reports/phase2_data_exploration_report.txt` — generated textual report.
- `reports/phase2_session_summary.txt` — brief session summary.
- `Docs/Phase2_README.md` — this document; see the corresponding Phase 3 docs if you proceed to Phase 3 EDA.
