# Phase 1.1: Disease ML Features - Exploratory Data Analysis
**DNA Gene Mapping Project - ML Phase**  
**Author:** Sharique Mohammad  
**Date:** February 2026  

## Objective
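Deep exploration of disease associations, complexity patterns, and clinical features in the `gold.disease_ml_features` table (~4.1M variants). For speed, this notebook analyses a 10% sample of the table, so all counts reported below are sample counts.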

## Key Questions
1. What is the distribution of pathogenic/benign/VUS variants?
2. How are diseases annotated across different databases (OMIM, MONDO, Orphanet)?
3. What is the disease complexity landscape (monogenic vs polygenic)?
4. Which genes have the highest disease associations?
5. What features correlate most with pathogenicity?

## Expected Deliverables
- 15+ visualizations
- Disease EDA report (txt)
- Missing value analysis (csv)
- Correlation matrix (csv)

## Setup and Imports

In [None]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import os
from pathlib import Path
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Load environment variables (read below via os.getenv) using python-dotenv
load_dotenv()

# Output locations. The project root is taken to be two levels above the
# current working directory, so the notebook must be run from its own folder.
PROJECT_ROOT = Path().absolute().parent.parent
FIGURES_DIR = PROJECT_ROOT / 'data' / 'analytical' / 'figures' / 'disease_eda'
REPORTS_DIR = PROJECT_ROOT / 'data' / 'analytical' / 'reports'

# Create BOTH output directories up front: REPORTS_DIR must exist before the
# CSV and report writes later in the notebook, otherwise they fail on a
# fresh checkout.
for output_dir in (FIGURES_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Setup complete")
print(f"Figures: {FIGURES_DIR}")
print(f"Reports: {REPORTS_DIR}")

In [None]:
# Database connection
# Settings are read from the environment, with local-development defaults for
# everything except the password.
from urllib.parse import quote_plus

POSTGRES_HOST = os.getenv('POSTGRES_HOST', 'localhost')
POSTGRES_PORT = os.getenv('POSTGRES_PORT', '5432')
POSTGRES_DB = os.getenv('POSTGRES_DB', 'genome_db')
POSTGRES_USER = os.getenv('POSTGRES_USER', 'postgres')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')

# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# password cannot corrupt the URL. When no password is configured, leave the
# password part out instead of sending the literal string "None".
credentials = quote_plus(POSTGRES_USER)
if POSTGRES_PASSWORD:
    credentials += f":{quote_plus(POSTGRES_PASSWORD)}"

conn_str = f"postgresql://{credentials}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
engine = create_engine(conn_str)

# NOTE: create_engine() is lazy - no connection is opened until the first
# query, so connection errors surface in the data-loading cell, not here.
print("Database connection established")

## 1. Data Loading & Overview

In [None]:
# Load disease_ml_features (10% sample for fast exploration)
print("Loading disease_ml_features table (10% sample)...")

# REPEATABLE pins the sampling seed so that re-running the notebook selects
# the same sample (as long as the table is unchanged), which keeps the saved
# figures and reports reproducible. SYSTEM sampling picks whole storage
# pages, so the returned row count is only approximately 10%.
SAMPLE_SEED = 42
query = f"""
SELECT * FROM gold.disease_ml_features
TABLESAMPLE SYSTEM (10) REPEATABLE ({SAMPLE_SEED})
"""
# Wrap the raw SQL in text() so it is an executable SQLAlchemy construct
# regardless of the installed pandas / SQLAlchemy combination.
df = pd.read_sql(text(query), engine)

print(f"\nData loaded: {len(df):,} rows, {len(df.columns)} columns")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("Full dataset estimate: ~4.1M rows")

In [None]:
# Convert all TEXT columns to proper types
print("Converting TEXT columns to proper types...")

# Numeric columns that are stored as TEXT
numeric_cols = [
    'disease_count', 'omim_disease_count', 'mondo_disease_count',
    'gene_disease_diversity', 'gene_clinical_utility_score',
    'gene_pathogenic_count', 'gene_total_variants'
]

# Only convert columns that are actually present, and remember which ones
# were converted so the summary below reports real work rather than the
# length of the wish-list above.
converted_numeric = [col for col in numeric_cols if col in df.columns]
for col in converted_numeric:
    # errors='coerce': unparseable values become NaN instead of raising
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Boolean columns stored as TEXT, identified by their is_/has_ name prefix
boolean_cols = [col for col in df.columns if col.startswith(('is_', 'has_'))]

for col in boolean_cols:
    # Anything other than a case-insensitive "true" maps to False. This
    # includes NULLs, so missingness in these columns is lost from here on.
    df[col] = df[col].astype(str).str.lower() == 'true'

print(f"Converted {len(converted_numeric)} numeric columns")
print(f"Converted {len(boolean_cols)} boolean columns")
print("\nData types after conversion:")
print(df.dtypes.value_counts())

In [None]:
# Preview the first five records of the loaded sample; as the last
# expression in the cell, the frame is rendered by the notebook.
print("First 5 rows:")
df.head(n=5)

In [None]:
# Column data types: a per-dtype tally followed by a numbered column listing
dtype_by_column = df.dtypes

print("Column data types:")
print(dtype_by_column.value_counts())
print("\nColumn list:")
for position, (name, dtype) in enumerate(dtype_by_column.items(), start=1):
    print(f"{position:2d}. {name:<50} {dtype}")

In [None]:
# Missing values analysis
print("Missing Values Analysis:")

# Count nulls once and derive the percentage from the same Series.
# NOTE: the is_*/has_* columns were coerced to plain booleans in the
# type-conversion cell, which maps NULL to False, so they always report
# zero missing values here.
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'column': df.columns,
    'missing_count': null_counts,
    'missing_pct': (null_counts / len(df) * 100).round(2)
}).sort_values('missing_pct', ascending=False)

# Only columns that have at least one missing value (reused by the report cell)
missing_filtered = missing[missing['missing_pct'] > 0]
print(f"\nColumns with missing values: {len(missing_filtered)}")
print(missing_filtered.head(20))

# Save missing values summary; make sure the target directory exists first
# so the write does not fail on a fresh checkout.
missing_csv_path = REPORTS_DIR / 'disease_missing_values.csv'
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
missing.to_csv(missing_csv_path, index=False)
print(f"\nSaved: {missing_csv_path}")

In [None]:
# Basic statistics: overall shape plus distinct gene / chromosome counts
row_count, column_count = df.shape

print("Dataset Statistics:")
print(f"Total variants: {row_count:,}")
print(f"Total columns: {column_count}")

gene_count = df['gene_name'].nunique()
print(f"Unique genes: {gene_count:,}")

chromosome_count = df['chromosome'].nunique()
print(f"Unique chromosomes: {chromosome_count}")

## 2. Target Variable Analysis 

In [None]:
# Clinical significance distribution
print("Clinical Significance Distribution:")
print("="*60)


def _count_true(frame, column):
    """Return how many rows of *frame* hold a true value in *column*.

    Works for real booleans and for TEXT values such as 'true'/'True'
    (compared case-insensitively). A column that is absent counts as 0.
    """
    if column not in frame.columns:
        return 0
    return int((frame[column].astype(str).str.lower() == 'true').sum())


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


pathogenic_count = _count_true(df, 'is_pathogenic')
benign_count = _count_true(df, 'is_benign')
vus_count = _count_true(df, 'is_vus')

# `total` is the denominator reused by later cells for every percentage
total = len(df)
for label, count in (
    ('Pathogenic:', pathogenic_count),
    ('Benign:', benign_count),
    ('VUS:', vus_count),
):
    print(f"{label:<13}{count:>10,} ({_percent(count, total):>5.1f}%)")

# Class imbalance ratio (majority / minority between pathogenic and benign).
# `ratio` is only defined when both classes are present in the sample.
if pathogenic_count > 0 and benign_count > 0:
    ratio = max(pathogenic_count, benign_count) / min(pathogenic_count, benign_count)
    print(f"\nClass imbalance ratio: {ratio:.2f}:1")

In [None]:
# Visualize clinical significance distribution
fig, ax = plt.subplots(figsize=(10, 6))

categories = ['Pathogenic', 'Benign', 'VUS']
counts = [pathogenic_count, benign_count, vus_count]
colors = ['#e74c3c', '#27ae60', '#95a5a6']

bars = ax.bar(categories, counts, color=colors, alpha=0.7, edgecolor='black')

# Add count labels on bars (percentage of the loaded sample; 0.0 when the
# sample is empty so the label never divides by zero)
for bar, count in zip(bars, counts):
    height = bar.get_height()
    share = count / total * 100 if total else 0.0
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}\n({share:.1f}%)',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_ylabel('Number of Variants', fontsize=12, fontweight='bold')
# The chart shows the loaded sample, not the full table, so the subtitle
# reports the actual number of rows plotted instead of a fixed "4.1M".
ax.set_title(f'Clinical Significance Distribution\n{total:,} Sampled Variants',
             fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / '01_clinical_significance_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Saved: {FIGURES_DIR / '01_clinical_significance_distribution.png'}")

In [None]:
# Disease complexity distribution (skipped when the column is absent)
if 'disease_complexity' in df.columns:
    print("\nDisease Complexity Distribution:")
    complexity_dist = df['disease_complexity'].value_counts()
    print(complexity_dist)

    # Pie chart: one wedge per complexity category, sized by its row count
    pie_path = FIGURES_DIR / '02_disease_complexity_pie.png'
    colors = ['#3498db', '#e74c3c', '#f39c12']
    label_style = {'fontsize': 11, 'fontweight': 'bold'}

    fig, ax = plt.subplots(figsize=(10, 8))
    wedges, texts, autotexts = ax.pie(
        complexity_dist.values,
        labels=complexity_dist.index,
        autopct='%1.1f%%',
        colors=colors,
        startangle=90,
        textprops=label_style,
    )
    ax.set_title('Disease Complexity Distribution', fontsize=14, fontweight='bold', pad=20)

    fig.tight_layout()
    fig.savefig(pie_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {pie_path}")

In [None]:
# Disease association patterns: histogram and box plot of diseases per variant
if 'disease_count' in df.columns:
    print("\nDisease Count Distribution:")

    observed_counts = df['disease_count'].dropna()
    figure_path = FIGURES_DIR / '03_disease_count_distribution.png'

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Left panel: histogram
    hist_ax.hist(observed_counts, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel('Number of Diseases per Variant', fontsize=11, fontweight='bold')
    hist_ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    hist_ax.set_title('Disease Count Distribution', fontsize=12, fontweight='bold')
    hist_ax.grid(alpha=0.3)

    # Right panel: box plot
    box_style = dict(facecolor='lightblue', alpha=0.7)
    median_style = dict(color='red', linewidth=2)
    box_ax.boxplot(observed_counts, vert=True, patch_artist=True,
                   boxprops=box_style, medianprops=median_style)
    box_ax.set_ylabel('Number of Diseases', fontsize=11, fontweight='bold')
    box_ax.set_title('Disease Count Box Plot', fontsize=12, fontweight='bold')
    box_ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {figure_path}")

    # Summary statistics (describe() ignores missing values on its own)
    print(f"\nDisease Count Statistics:")
    print(df['disease_count'].describe())

## 3. Disease Database Coverage 

In [None]:
# Database coverage analysis: number of sampled variants flagged per database
print("Disease Database Coverage:")
print("="*60)

# A database whose flag column is absent is reported with a count of 0
if 'has_omim_disease' in df.columns:
    omim_coverage = df['has_omim_disease'].sum()
else:
    omim_coverage = 0

if 'has_mondo_disease' in df.columns:
    mondo_coverage = df['has_mondo_disease'].sum()
else:
    mondo_coverage = 0

if 'has_orphanet_disease' in df.columns:
    orphanet_coverage = df['has_orphanet_disease'].sum()
else:
    orphanet_coverage = 0

for db_label, db_count in (
    ("OMIM:", omim_coverage),
    ("MONDO:", mondo_coverage),
    ("Orphanet:", orphanet_coverage),
):
    print(f"{db_label:<11}{db_count:>10,} ({db_count/total*100:>5.1f}%)")

In [None]:
# Multi-database overlap (Venn-like analysis); needs all three flag columns
required_flags = ['has_omim_disease', 'has_mondo_disease', 'has_orphanet_disease']
if set(required_flags).issubset(df.columns):

    # Build each membership mask once and reuse it for every combination
    in_omim, in_mondo, in_orphanet = (df[flag] == True for flag in required_flags)
    not_omim, not_mondo, not_orphanet = (df[flag] == False for flag in required_flags)

    # Exclusive membership and the full three-way intersection
    omim_only = (in_omim & not_mondo & not_orphanet).sum()
    mondo_only = (not_omim & in_mondo & not_orphanet).sum()
    orphanet_only = (not_omim & not_mondo & in_orphanet).sum()
    all_three = (in_omim & in_mondo & in_orphanet).sum()

    print("\nDatabase Overlap Analysis:")
    print(f"OMIM only:        {omim_only:>10,}")
    print(f"MONDO only:       {mondo_only:>10,}")
    print(f"Orphanet only:    {orphanet_only:>10,}")
    print(f"All three DBs:    {all_three:>10,}")

    # Bar chart of the per-database totals computed in the coverage cell
    chart_path = FIGURES_DIR / '04_database_coverage.png'
    databases = ['OMIM', 'MONDO', 'Orphanet']
    coverage_counts = [omim_coverage, mondo_coverage, orphanet_coverage]
    colors = ['#e74c3c', '#3498db', '#2ecc71']

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(databases, coverage_counts, color=colors, alpha=0.7, edgecolor='black')

    # Annotate each bar with its count and share of the sample
    for bar, count in zip(bars, coverage_counts):
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, bar.get_height(),
                f'{count:,}\n({count/total*100:.1f}%)',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

    ax.set_ylabel('Number of Variants', fontsize=12, fontweight='bold')
    ax.set_title('Disease Database Coverage', fontsize=14, fontweight='bold', pad=20)
    ax.grid(axis='y', alpha=0.3)

    fig.tight_layout()
    fig.savefig(chart_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {chart_path}")

In [None]:
# Disease annotation quality
if 'disease_is_well_annotated' in df.columns:
    # This column lacks the is_/has_ prefix, so it is still TEXT at this point
    annotated_flags = df['disease_is_well_annotated'].astype(str).str.lower().eq('true')
    well_annotated = annotated_flags.sum()
    print(f"\nWell-annotated variants: {well_annotated:,} ({well_annotated/total*100:.1f}%)")

if 'disease_db_coverage' in df.columns:
    print("\nDatabase Coverage Score Distribution:")
    coverage_scores = pd.to_numeric(df['disease_db_coverage'], errors='coerce')
    # Frequency of each score, ordered by score rather than by frequency
    coverage_counts = coverage_scores.value_counts().sort_index()
    print(coverage_counts)

## 4. Gene-Level Disease Features 

In [None]:
# Top genes by disease associations
if 'gene_name' in df.columns and 'gene_disease_diversity' in df.columns:
    print("Top 20 Genes by Disease Diversity:")

    # One score per gene: first() takes the first non-null value in each
    # group. NOTE(review): assumes the score is constant within a gene -
    # confirm against the feature table definition.
    top_genes = df.groupby('gene_name')['gene_disease_diversity'].first().sort_values(ascending=False).head(20)
    print(top_genes)

    # Bar chart
    fig, ax = plt.subplots(figsize=(12, 8))

    top_genes.plot(kind='barh', ax=ax, color='steelblue', alpha=0.7, edgecolor='black')
    # barh draws the first row at the bottom; flip the axis so the
    # highest-ranked gene appears at the top, matching the printed table.
    ax.invert_yaxis()

    ax.set_xlabel('Disease Diversity Score', fontsize=11, fontweight='bold')
    ax.set_ylabel('Gene', fontsize=11, fontweight='bold')
    ax.set_title('Top 20 Genes by Disease Associations', fontsize=13, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / '05_top_genes_disease_diversity.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {FIGURES_DIR / '05_top_genes_disease_diversity.png'}")

In [None]:
# Gene clinical utility scores: histogram and box plot over sampled rows
if 'gene_clinical_utility_score' in df.columns:
    print("\nGene Clinical Utility Score Distribution:")

    utility_scores = df['gene_clinical_utility_score'].dropna()
    figure_path = FIGURES_DIR / '06_gene_clinical_utility.png'

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Left panel: histogram
    hist_ax.hist(utility_scores, bins=50, color='coral', alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel('Clinical Utility Score', fontsize=11, fontweight='bold')
    hist_ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    hist_ax.set_title('Gene Clinical Utility Distribution', fontsize=12, fontweight='bold')
    hist_ax.grid(alpha=0.3)

    # Right panel: box plot
    box_style = dict(facecolor='lightcoral', alpha=0.7)
    median_style = dict(color='darkred', linewidth=2)
    box_ax.boxplot(utility_scores, vert=True, patch_artist=True,
                   boxprops=box_style, medianprops=median_style)
    box_ax.set_ylabel('Clinical Utility Score', fontsize=11, fontweight='bold')
    box_ax.set_title('Clinical Utility Box Plot', fontsize=12, fontweight='bold')
    box_ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {figure_path}")

    # Summary statistics (describe() ignores missing values on its own)
    print(f"\nClinical Utility Statistics:")
    print(df['gene_clinical_utility_score'].describe())

In [None]:
# Gene priority tiers: number of sampled rows per tier
if 'gene_priority_tier' in df.columns:
    print("\nGene Priority Tier Distribution:")
    tier_dist = df['gene_priority_tier'].value_counts()
    print(tier_dist)

    # Bar chart, in value_counts order (most frequent tier first)
    tier_chart_path = FIGURES_DIR / '07_gene_priority_tiers.png'
    axis_label_style = {'fontsize': 11, 'fontweight': 'bold'}

    fig, ax = plt.subplots(figsize=(10, 6))
    tier_dist.plot(kind='bar', ax=ax, color='mediumpurple', alpha=0.7, edgecolor='black')

    ax.set_xlabel('Priority Tier', **axis_label_style)
    ax.set_ylabel('Number of Variants', **axis_label_style)
    ax.set_title('Gene Priority Tier Distribution', fontsize=13, fontweight='bold')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

    fig.tight_layout()
    fig.savefig(tier_chart_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {tier_chart_path}")

## 5. Feature Distributions 

In [None]:
# Numeric features analysis
# All numeric columns after type conversion (not used further in this cell)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

key_numeric_features = [
    'disease_count',
    'omim_disease_count',
    'gene_clinical_utility_score',
    'gene_disease_diversity',
    'gene_pathogenic_count',
    'gene_total_variants'
]

# Filter to existing columns (this list is reused by the correlation cell)
key_numeric_features = [col for col in key_numeric_features if col in df.columns]

if key_numeric_features:
    print(f"Analyzing {len(key_numeric_features)} key numeric features:")

    # Multi-plot histograms on a grid that is 3 panels wide
    n_features = len(key_numeric_features)
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of Axes, so flatten() is valid
    # for any feature count. Wrapping the result in a list when only one
    # feature is present would instead yield a list holding the whole
    # 3-panel array, and the first .hist() call would fail.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows*4), squeeze=False)
    axes = axes.flatten()

    for panel, col in zip(axes, key_numeric_features):
        panel.hist(df[col].dropna(), bins=50, color='skyblue', alpha=0.7, edgecolor='black')
        panel.set_xlabel(col, fontsize=10, fontweight='bold')
        panel.set_ylabel('Frequency', fontsize=10, fontweight='bold')
        panel.set_title(f'{col} Distribution', fontsize=11, fontweight='bold')
        panel.grid(alpha=0.3)

    # Hide empty subplots left over in the last grid row
    for panel in axes[n_features:]:
        panel.axis('off')

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / '08_numeric_features_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {FIGURES_DIR / '08_numeric_features_distributions.png'}")

In [None]:
# Boolean features analysis: count of True rows for each selected flag
boolean_cols = [
    col
    for col in (
        'is_disease_associated', 'is_multi_disease_gene', 'is_clinically_actionable',
        'is_research_candidate', 'has_drug_development_potential',
    )
    if col in df.columns  # keep only the flags present in the table
]

if boolean_cols:
    print(f"\nAnalyzing {len(boolean_cols)} boolean features:")

    # True-count per flag; the report cell reads this dict later
    bool_counts = {col: df[col].sum() for col in boolean_cols}
    for flag, n_true in bool_counts.items():
        print(f"{flag}: {n_true:,} ({n_true/total*100:.1f}%)")

    # Horizontal bar chart with human-readable labels
    chart_path = FIGURES_DIR / '09_boolean_features.png'
    names = [col.replace('_', ' ').title() for col in boolean_cols]
    values = list(bool_counts.values())

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.barh(names, values, color='lightgreen', alpha=0.7, edgecolor='black')

    # Write the count and share of the sample at the end of each bar
    for bar, val in zip(bars, values):
        y_center = bar.get_y() + bar.get_height()/2.
        ax.text(bar.get_width(), y_center,
                f'{val:,} ({val/total*100:.1f}%)',
                ha='left', va='center', fontsize=10, fontweight='bold')

    ax.set_xlabel('Count', fontsize=11, fontweight='bold')
    ax.set_title('Boolean Feature Distributions', fontsize=13, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    fig.tight_layout()
    fig.savefig(chart_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {chart_path}")

## 6. Correlation Analysis 

In [None]:
# Correlation matrix for numeric features (needs at least two of them)
if len(key_numeric_features) > 1:
    print("Calculating correlation matrix...")

    corr_matrix = df[key_numeric_features].corr()

    # Save correlation matrix
    csv_path = REPORTS_DIR / 'disease_correlations.csv'
    corr_matrix.to_csv(csv_path)
    print(f"Saved: {csv_path}")

    # Heatmap
    heatmap_path = FIGURES_DIR / '10_correlation_heatmap.png'
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
        ax=ax,
    )
    ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)

    fig.tight_layout()
    fig.savefig(heatmap_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved: {heatmap_path}")

    # High correlations: visit each unordered feature pair exactly once by
    # walking the strict upper triangle of the matrix in row-major order
    print("\nHighly correlated pairs (|r| > 0.9):")
    feature_names = corr_matrix.columns
    upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
    high_corr = [
        (feature_names[r], feature_names[c], corr_matrix.iloc[r, c])
        for r, c in zip(upper_rows, upper_cols)
        if abs(corr_matrix.iloc[r, c]) > 0.9
    ]

    if not high_corr:
        print("  None found")
    for feat1, feat2, corr_val in high_corr:
        print(f"  {feat1} <-> {feat2}: {corr_val:.3f}")

## 7. Summary Report

In [None]:
# Generate comprehensive EDA report
# Relies on values computed by earlier cells: df, total, the three clinical
# significance counts, the three database coverage counts, missing_filtered,
# and (optionally) all_three / bool_counts.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)  # the write below needs it
report_path = REPORTS_DIR / 'disease_eda_report.txt'
figure_files = sorted(FIGURES_DIR.glob('*.png'))  # list once, use twice
rule = "=" * 80


def _share(count):
    """Return *count* as a percentage of ``total``; 0.0 for an empty sample."""
    return count / total * 100 if total else 0.0


# Column with the highest share of missing values (missing_filtered is
# already sorted by missing_pct, descending)
if len(missing_filtered) > 0:
    worst_column = missing_filtered.iloc[0]['column']
    worst_pct = missing_filtered.iloc[0]['missing_pct']
else:
    worst_column, worst_pct = 'None', 0

# Collect only the insights whose inputs exist, then number them
# consecutively so the report never shows gaps such as "1." followed by "3.".
insights = []
if pathogenic_count > 0 and benign_count > 0:
    # Recomputed here so the report does not depend on `ratio` from another
    # cell. No fixed verdict is attached: whether the imbalance is
    # acceptable depends on the value itself.
    imbalance = max(pathogenic_count, benign_count) / min(pathogenic_count, benign_count)
    insights.append(f"Class imbalance: {imbalance:.2f}:1")
if 'all_three' in globals():
    insights.append(f"Multi-database coverage: {all_three:,} variants in all 3 DBs")
if 'bool_counts' in globals():
    insights.append(
        f"Clinical actionability: {bool_counts.get('is_clinically_actionable', 0):,} variants"
    )

report_lines = [
    f"{rule}\n",
    "DISEASE ML FEATURES - EXPLORATORY DATA ANALYSIS REPORT\n",
    f"{rule}\n\n",
    "Dataset Overview:\n",
    "  Note: all counts are computed on the loaded sample, not the full table\n",
    f"  Total variants: {len(df):,}\n",
    f"  Total features: {len(df.columns)}\n",
    f"  Unique genes: {df['gene_name'].nunique():,}\n",
    f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB\n\n",
    "Clinical Significance:\n",
    f"  Pathogenic: {pathogenic_count:,} ({_share(pathogenic_count):.1f}%)\n",
    f"  Benign: {benign_count:,} ({_share(benign_count):.1f}%)\n",
    f"  VUS: {vus_count:,} ({_share(vus_count):.1f}%)\n\n",
    "Database Coverage:\n",
    f"  OMIM: {omim_coverage:,} ({_share(omim_coverage):.1f}%)\n",
    f"  MONDO: {mondo_coverage:,} ({_share(mondo_coverage):.1f}%)\n",
    f"  Orphanet: {orphanet_coverage:,} ({_share(orphanet_coverage):.1f}%)\n\n",
    "Missing Values:\n",
    f"  Columns with missing data: {len(missing_filtered)}\n",
    f"  Highest missingness: {worst_column} ({worst_pct:.1f}%)\n\n",
    "Key Insights:\n",
]
report_lines += [f"  {i}. {insight}\n" for i, insight in enumerate(insights, 1)]

report_lines.append("\nGenerated Visualizations:\n")
report_lines += [f"  {i}. {fig_file.name}\n" for i, fig_file in enumerate(figure_files, 1)]

report_lines += [
    "\nNext Steps:\n",
    "  - Proceed to clinical features EDA\n",
    "  - Identify features for ML modeling\n",
    "  - Address missing values and class imbalance\n",
]

# Explicit encoding keeps the report identical across platforms
with open(report_path, 'w', encoding='utf-8') as f:
    f.writelines(report_lines)

print(f"\nEDA Report saved: {report_path}")
print("\n" + "="*80)
print("DISEASE ML FEATURES EDA COMPLETE")
print("="*80)
print(f"\nGenerated:")
print(f"  - {len(figure_files)} visualizations")
print(f"  - 1 EDA report")
print(f"  - 2 CSV files (missing values, correlations)")
print(f"\nNext: 02_clinical_ml_features_eda.ipynb")