# Phase 1.2: Clinical ML Features - Exploratory Data Analysis
**DNA Gene Mapping Project - ML Phase**  
**Author:** Sharique Mohammad  
**Date:** February 2026  

## Objective
Analyze clinical pathogenicity features, variant classification, review quality, and conservation scores.

## Data Source
- Table: `clinical_ml_features`
- Rows: ~4.1M variants (10% sample for EDA)
- Focus: Clinical significance, review status, conservation scores

## Deliverables
- 10 visualizations (PNG)
- Clinical EDA report (txt)
- Missing value analysis (csv)
- Correlation matrix (csv)

## Setup

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from pathlib import Path
import os
from dotenv import load_dotenv
import warnings
# Silence library warnings so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/matplotlib - consider narrowing it.
warnings.filterwarnings('ignore')

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('Set2')

# Paths (resolved two levels above the current working directory)
PROJECT_ROOT = Path().absolute().parent.parent
FIGURES_DIR = PROJECT_ROOT / 'data' / 'analytical' / 'figures' / 'clinical_eda'
REPORTS_DIR = PROJECT_ROOT / 'data' / 'analytical' / 'reports'

# Create BOTH output directories up front: later cells write CSV/TXT files
# into REPORTS_DIR and would fail if the directory did not exist yet.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Random seed
np.random.seed(42)

print("Setup complete")
print(f"Figures: {FIGURES_DIR}")
print(f"Reports: {REPORTS_DIR}")

In [None]:
# Database connection
from urllib.parse import quote_plus

load_dotenv()

POSTGRES_HOST = os.getenv('POSTGRES_HOST', 'localhost')
POSTGRES_PORT = os.getenv('POSTGRES_PORT', '5432')
POSTGRES_DB = os.getenv('POSTGRES_DB', 'genome_db')
POSTGRES_USER = os.getenv('POSTGRES_USER', 'postgres')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot corrupt the URL. When no password is
# configured, leave it out instead of sending the literal string "None".
credentials = quote_plus(POSTGRES_USER)
if POSTGRES_PASSWORD:
    credentials += f":{quote_plus(POSTGRES_PASSWORD)}"

conn_str = f"postgresql://{credentials}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"

# NOTE(review): create_engine() is documented as lazy - no connection is
# opened until the first query runs, so the message below only confirms that
# the engine object was built.
engine = create_engine(conn_str)

print("Database connection established")

## 1. Data Loading & Type Conversion

In [None]:
# Load clinical_ml_features (10% sample)
print("Loading clinical_ml_features (10% sample)...")

# REPEATABLE pins the seed of the server-side sampler so that re-running the
# notebook analyses the same rows (as long as the table has not changed).
# np.random.seed() has no influence on sampling done inside PostgreSQL.
SAMPLE_SEED = 42

query = f"""
SELECT * FROM gold.clinical_ml_features
TABLESAMPLE SYSTEM (10) REPEATABLE ({SAMPLE_SEED})
"""

df = pd.read_sql(query, engine)

print(f" Loaded: {len(df):,} rows, {len(df.columns)} columns")
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Clinical significance distribution
print("Clinical Significance Distribution:")
print("="*60)

# CRITICAL: Convert target columns to boolean FIRST
target_cols = ['target_is_pathogenic', 'target_is_benign', 'target_is_vus']

# Text spellings that count as True. '1.0' covers 0/1 flags that were loaded
# as floats. Everything else - false spellings, NULLs, unrecognised values -
# becomes False.
TRUE_TOKENS = {'true', 't', '1', '1.0', 'yes'}

for col in target_cols:
    if col in df.columns:
        normalized = df[col].astype(str).str.strip().str.lower()
        # isin() yields a real bool-dtype Series, so .sum() counts True rows
        df[col] = normalized.isin(TRUE_TOKENS)

# NOW do the sum (on actual booleans)
pathogenic_count = int(df['target_is_pathogenic'].sum()) if 'target_is_pathogenic' in df.columns else 0
benign_count = int(df['target_is_benign'].sum()) if 'target_is_benign' in df.columns else 0
vus_count = int(df['target_is_vus'].sum()) if 'target_is_vus' in df.columns else 0

total = len(df)

class_counts = [pathogenic_count, benign_count, vus_count]
sig_data = pd.DataFrame({
    'Category': ['Pathogenic', 'Benign', 'VUS'],
    'Count': class_counts,
    # Report 0% instead of raising ZeroDivisionError when the sample is empty
    'Percentage': [count / total * 100 if total else 0.0 for count in class_counts],
})

print(sig_data.to_string(index=False))

if pathogenic_count > 0 and benign_count > 0:
    ratio = max(pathogenic_count, benign_count) / min(pathogenic_count, benign_count)
    print(f"\nClass imbalance ratio: {ratio:.2f}:1")

In [None]:
# CRITICAL: Convert TEXT columns to proper types
print("Converting TEXT columns to proper data types...")

# Numeric columns stored as TEXT in PostgreSQL
numeric_cols = [
    'review_quality_score', 'number_submitters', 'review_confidence_score',
    'pathogenicity_score', 'conservation_score', 'combined_pathogenicity_risk',
    'phylop_score', 'cadd_score', 'gerp_score', 'phastcons_score',
    'gene_conservation_score', 'allele_frequency', 'gnomad_af', 'cadd_phred'
]
present_numeric_cols = [col for col in numeric_cols if col in df.columns]

for col in present_numeric_cols:
    # Unparseable text becomes NaN instead of raising
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Boolean columns - every column whose name starts with is_ / has_
boolean_cols = [col for col in df.columns if col.startswith(('is_', 'has_'))]

# Text spellings that count as True. '1.0' covers 0/1 flags that were loaded
# as floats. Everything else - false spellings, NULLs, unrecognised values -
# becomes False.
TRUE_TOKENS = {'true', 't', '1', '1.0', 'yes'}

for col in boolean_cols:
    normalized = df[col].astype(str).str.strip().str.lower()
    # isin() yields a real bool-dtype Series, so .sum() counts True rows
    df[col] = normalized.isin(TRUE_TOKENS)

print(f"Converted {len(present_numeric_cols)} numeric columns")
print(f"Converted {len(boolean_cols)} boolean columns")

# Verify conversions
print("\nVerification:")
print(f"  is_pathogenic TRUE count: {df['is_pathogenic'].sum() if 'is_pathogenic' in df.columns else 'N/A'}")
print(f"  is_benign TRUE count: {df['is_benign'].sum() if 'is_benign' in df.columns else 'N/A'}")
print(f"  is_vus TRUE count: {df['is_vus'].sum() if 'is_vus' in df.columns else 'N/A'}")

print("\nData types after conversion:")
print(df.dtypes.value_counts())

In [None]:
# DIAGNOSTIC: Verify conversion actually worked
print("POST-CONVERSION DIAGNOSTIC:")
print("="*60)

# Check target columns specifically
target_cols = ['target_is_pathogenic', 'target_is_benign', 'target_is_vus']
for col in target_cols:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Dtype: {df[col].dtype}")
        print(f"  Unique values: {df[col].unique()}")
        print(f"  Value counts:\n{df[col].value_counts()}")
        print(f"  TRUE count: {df[col].sum()}")

# Check review_status
if 'review_status' in df.columns:
    print(f"\nreview_status:")
    # Report the dtype of review_status itself, not of the variable left
    # over from the target-column loop above
    print(f"  Dtype: {df['review_status'].dtype}")
    print(f"  Value counts:\n{df['review_status'].value_counts()}")

# Check conservation scores
conservation_cols = ['phylop_score', 'cadd_score', 'gerp_score', 'phastcons_score']
for col in conservation_cols:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Dtype: {df[col].dtype}")
        print(f"  Non-null count: {df[col].notna().sum()}")
        # A mean is only meaningful when at least one value survived conversion
        if df[col].notna().any():
            print(f"  Mean: {df[col].mean():.3f}")
        else:
            print("  All NULL")

In [None]:
# Quick look at the top of the sampled frame
print("First 5 rows:")
head_rows = df.head()
display(head_rows)

In [None]:
# Missing value analysis: per-column null count and share of the sample
null_counts = df.isnull().sum()
null_share = (null_counts / len(df) * 100).round(2)

missing = pd.DataFrame(
    {
        'column': df.columns,
        'missing_count': null_counts,
        'missing_pct': null_share,
    }
)
# Worst-affected columns first
missing = missing.sort_values('missing_pct', ascending=False)

print("Missing Values Summary (Top 20):")
print(missing.head(20).to_string(index=False))

# Save
missing_csv_path = REPORTS_DIR / 'clinical_missing_values.csv'
missing.to_csv(missing_csv_path, index=False)
print(f"\nSaved: {missing_csv_path}")

## 2. Clinical Significance Analysis

In [None]:
# Clinical significance distribution
print("Clinical Significance Distribution:")
print("="*60)

# Report label -> boolean flag column (flags were converted to boolean in the
# type-conversion cell). A missing column contributes a count of 0.
flag_by_label = {
    'Pathogenic': 'is_pathogenic',
    'Likely Pathogenic': 'is_likely_pathogenic',
    'VUS': 'is_vus',
    'Likely Benign': 'is_likely_benign',
    'Benign': 'is_benign',
}
count_by_label = {
    label: (df[flag].sum() if flag in df.columns else 0)
    for label, flag in flag_by_label.items()
}

# Individual counts are kept as notebook-level names for later cells
pathogenic_count = count_by_label['Pathogenic']
likely_pathogenic_count = count_by_label['Likely Pathogenic']
vus_count = count_by_label['VUS']
likely_benign_count = count_by_label['Likely Benign']
benign_count = count_by_label['Benign']

total = len(df)

sig_data = pd.DataFrame({
    'Category': list(count_by_label),
    'Count': list(count_by_label.values()),
    'Percentage': [n_flagged/total*100 for n_flagged in count_by_label.values()],
})

print(sig_data.to_string(index=False))

In [None]:
# Visualization: 3-class significance distribution
# Build the 3-class table here from the target_* flags instead of reusing
# `sig_data`: by this point `sig_data` holds the 5-class breakdown, which does
# not match the three colours, the title or the output file name below.
# The target_* flags are expected to be boolean already (converted in the
# data-loading section); a missing column contributes a count of 0.
three_class_flags = {
    'Pathogenic': 'target_is_pathogenic',
    'Benign': 'target_is_benign',
    'VUS': 'target_is_vus',
}
n_rows = len(df)
three_class_counts = [
    int(df[flag].sum()) if flag in df.columns else 0
    for flag in three_class_flags.values()
]
sig3_data = pd.DataFrame({
    'Category': list(three_class_flags),
    'Count': three_class_counts,
    # 0% rather than ZeroDivisionError when the sample is empty
    'Percentage': [count / n_rows * 100 if n_rows else 0.0 for count in three_class_counts],
})

fig, ax = plt.subplots(figsize=(10, 6))

# One colour per class, in the same order as the rows of sig3_data
colors = ['#e74c3c', '#27ae60', '#95a5a6']
bars = ax.bar(sig3_data['Category'], sig3_data['Count'], color=colors, alpha=0.8, edgecolor='black')

# Add percentage labels on top of each bar
for bar, pct in zip(bars, sig3_data['Percentage']):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{pct:.1f}%',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_ylabel('Variant Count', fontsize=12, fontweight='bold')
ax.set_title('Clinical Significance Distribution (3 Classes)', fontsize=14, fontweight='bold')
# Thousands separators on the y axis
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x):,}'))
plt.tight_layout()

plt.savefig(FIGURES_DIR / '01_clinical_significance_3class.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Saved: {FIGURES_DIR / '01_clinical_significance_3class.png'}")

In [None]:
# Binary classification: Pathogenic vs Benign (excluding VUS)
# Uses whatever pathogenic_count / benign_count the preceding cells left behind.
binary_pathogenic, binary_benign = pathogenic_count, benign_count
binary_total = binary_pathogenic + binary_benign

if binary_total <= 0:
    print("\nNo pathogenic/benign variants for binary classification")
else:
    pathogenic_share = binary_pathogenic/binary_total*100
    benign_share = binary_benign/binary_total*100

    print(f"\nBinary Classification (Pathogenic vs Benign):")
    print(f"  Pathogenic: {binary_pathogenic:,} ({pathogenic_share:.1f}%)")
    print(f"  Benign:     {binary_benign:,} ({benign_share:.1f}%)")

    # The imbalance ratio is only defined when both classes are present
    if binary_pathogenic > 0 and binary_benign > 0:
        majority = max(binary_pathogenic, binary_benign)
        minority = min(binary_pathogenic, binary_benign)
        ratio = majority / minority
        print(f"  Class imbalance: {ratio:.2f}:1")

    # Pie chart
    binary_png = FIGURES_DIR / '02_binary_classification.png'
    fig, ax = plt.subplots(figsize=(8, 8))

    wedges, texts, autotexts = ax.pie(
        [binary_pathogenic, binary_benign],
        labels=['Pathogenic', 'Benign'],
        autopct='%1.1f%%',
        colors=['#e74c3c', '#27ae60'],
        startangle=90,
        textprops={'fontsize': 12, 'fontweight': 'bold'},
    )

    # White percentage labels stay readable on the coloured wedges
    for pct_label in autotexts:
        pct_label.set_color('white')

    ax.set_title('Binary Classification: Pathogenic vs Benign', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()

    plt.savefig(binary_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {binary_png}")

## 3. Review Quality Analysis

In [None]:
# Review status distribution: table of counts/percentages plus a bar chart
if 'review_status' in df.columns:
    print("Review Status Distribution:")
    status_values = df['review_status']
    review_dist = status_values.value_counts()
    review_pct = status_values.value_counts(normalize=True) * 100

    review_df = pd.DataFrame({
        'Review Status': review_dist.index,
        'Count': review_dist.values,
        'Percentage': review_pct.values.round(2),
    })

    print(review_df.to_string(index=False))

    # Horizontal bars, one per status, each annotated with its raw count
    status_png = FIGURES_DIR / '03_review_status.png'
    fig, ax = plt.subplots(figsize=(12, 6))

    bars = ax.barh(review_df['Review Status'], review_df['Count'], color='steelblue', alpha=0.7)

    for patch, n_variants in zip(bars, review_df['Count']):
        bar_end = patch.get_width()
        bar_middle = patch.get_y() + patch.get_height()/2.
        ax.text(bar_end, bar_middle, f'{int(n_variants):,}',
                ha='left', va='center', fontsize=10)

    ax.set_xlabel('Variant Count', fontsize=12, fontweight='bold')
    ax.set_title('Review Status Distribution', fontsize=14, fontweight='bold')
    plt.tight_layout()

    plt.savefig(status_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {status_png}")

In [None]:
# Review quality score distribution (0-4 scale): histogram + box plot + stats
if 'review_quality_score' in df.columns:
    print("\nReview Quality Score Distribution (0-4 scale):")

    quality_scores = df['review_quality_score'].dropna()
    median_quality = quality_scores.median()

    # Shared text styling for axis labels and panel titles
    label_style = {'fontsize': 11, 'fontweight': 'bold'}
    title_style = {'fontsize': 12, 'fontweight': 'bold'}
    quality_png = FIGURES_DIR / '04_review_quality_score.png'

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Histogram with the median marked
    hist_ax.hist(quality_scores, bins=20, color='coral', edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Review Quality Score', **label_style)
    hist_ax.set_ylabel('Frequency', **label_style)
    hist_ax.set_title('Review Quality Score Distribution', **title_style)
    hist_ax.axvline(median_quality, color='red', linestyle='--', linewidth=2,
                    label=f'Median: {median_quality:.1f}')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # Box plot
    box_ax.boxplot(quality_scores, vert=True, patch_artist=True,
                   boxprops=dict(facecolor='lightcoral', alpha=0.7),
                   medianprops=dict(color='darkred', linewidth=2))
    box_ax.set_ylabel('Review Quality Score', **label_style)
    box_ax.set_title('Review Quality Box Plot', **title_style)
    box_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(quality_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {quality_png}")

    # Statistics
    print(f"\nReview Quality Statistics:")
    print(quality_scores.describe())

    # High quality evidence: share of scored variants with a score of 2 or more
    high_quality = (quality_scores >= 2).sum()
    high_quality_pct = high_quality/len(quality_scores)*100
    print(f"\nHigh quality evidence (score >= 2): {high_quality:,} ({high_quality_pct:.1f}%)")

In [None]:
# Number of submitters analysis: histogram with median marker, then stats
if 'number_submitters' in df.columns:
    print("\nNumber of Submitters Distribution:")

    submitters = df['number_submitters'].dropna()
    median_submitters = submitters.median()
    label_style = {'fontsize': 11, 'fontweight': 'bold'}
    submitters_png = FIGURES_DIR / '05_number_submitters.png'

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.hist(submitters, bins=50, color='teal', edgecolor='black', alpha=0.7)
    ax.set_xlabel('Number of Submitters', **label_style)
    ax.set_ylabel('Variant Count', **label_style)
    ax.set_title('Distribution of Submitters per Variant', fontsize=12, fontweight='bold')
    ax.axvline(median_submitters, color='red', linestyle='--', linewidth=2,
               label=f'Median: {median_submitters:.0f}')
    ax.legend()
    ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(submitters_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {submitters_png}")

    print(f"\nSubmitters Statistics:")
    print(submitters.describe())

## 4. Conservation Scores Analysis

In [None]:
    # Conservation scores distribution
conservation_cols = ['phylop_score', 'cadd_score', 'gerp_score', 'phastcons_score']
available_conservation = [col for col in conservation_cols if col in df.columns]

if available_conservation:
    print(f"Analyzing {len(available_conservation)} conservation scores...")
    
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()
    
    for idx, col in enumerate(available_conservation[:4]):
        data = df[col].dropna()
        
        axes[idx].hist(data, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
        axes[idx].set_xlabel(col.replace('_', ' ').title(), fontsize=11, fontweight='bold')
        axes[idx].set_ylabel('Count', fontsize=11, fontweight='bold')
        axes[idx].set_title(f'{col.replace("_", " ").title()} Distribution', fontsize=12, fontweight='bold')
        axes[idx].axvline(data.median(), color='red', linestyle='--', linewidth=2,
                         label=f'Median: {data.median():.2f}')
        axes[idx].legend()
        axes[idx].grid(alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / '06_conservation_scores.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"Saved: {FIGURES_DIR / '06_conservation_scores.png'}")
    
    # Statistics
    print(f"\nConservation Score Statistics:")
    for col in available_conservation:
        data = df[col].dropna()
        print(f"\n{col}:")
        print(f"  Mean: {data.mean():.3f}")
        print(f"  Median: {data.median():.3f}")
        print(f"  Std: {data.std():.3f}")

In [None]:
# Conservation levels (highly conserved vs not conserved)
if 'is_highly_conserved' in df.columns:
    n_variants = len(df)
    highly_conserved = df['is_highly_conserved'].sum()
    if 'is_moderately_conserved' in df.columns:
        moderately_conserved = df['is_moderately_conserved'].sum()
    else:
        moderately_conserved = 0
    # Whatever is flagged neither highly nor moderately conserved
    not_conserved = n_variants - highly_conserved - moderately_conserved

    print(f"\nConservation Levels:")
    level_rows = (
        ('Highly conserved:', highly_conserved),
        ('Moderately conserved:', moderately_conserved),
        ('Not conserved:', not_conserved),
    )
    for level_name, level_count in level_rows:
        # Names are padded to the longest one so the counts line up
        print(f"  {level_name:<21} {level_count:,} ({level_count/n_variants*100:.1f}%)")

    # Pie chart
    levels_png = FIGURES_DIR / '07_conservation_levels.png'
    fig, ax = plt.subplots(figsize=(10, 8))

    colors = ['#e74c3c', '#f39c12', '#95a5a6']
    wedges, texts, autotexts = ax.pie(
        [highly_conserved, moderately_conserved, not_conserved],
        labels=['Highly Conserved', 'Moderately Conserved', 'Not Conserved'],
        autopct='%1.1f%%',
        colors=colors,
        startangle=90,
        textprops={'fontsize': 11, 'fontweight': 'bold'},
    )

    # White percentage labels stay readable on the coloured wedges
    for pct_label in autotexts:
        pct_label.set_color('white')

    ax.set_title('Conservation Level Distribution', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()

    plt.savefig(levels_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {levels_png}")

## 5. Pathogenicity Scoring

In [None]:
# Pathogenicity score distribution: histogram and cumulative histogram
if 'pathogenicity_score' in df.columns:
    print("Pathogenicity Score Analysis:")

    path_scores = df['pathogenicity_score'].dropna()
    median_path = path_scores.median()

    # Shared text styling for axis labels and panel titles
    label_style = {'fontsize': 11, 'fontweight': 'bold'}
    title_style = {'fontsize': 12, 'fontweight': 'bold'}
    path_png = FIGURES_DIR / '08_pathogenicity_score.png'

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, cdf_ax = axes

    # Histogram with the median marked
    hist_ax.hist(path_scores, bins=50, color='crimson', edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Pathogenicity Score', **label_style)
    hist_ax.set_ylabel('Count', **label_style)
    hist_ax.set_title('Pathogenicity Score Distribution', **title_style)
    hist_ax.axvline(median_path, color='darkred', linestyle='--', linewidth=2,
                    label=f'Median: {median_path:.2f}')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # Cumulative distribution (normalised cumulative histogram)
    cdf_ax.hist(path_scores, bins=100, cumulative=True, density=True,
                color='crimson', edgecolor='black', alpha=0.7)
    cdf_ax.set_xlabel('Pathogenicity Score', **label_style)
    cdf_ax.set_ylabel('Cumulative Probability', **label_style)
    cdf_ax.set_title('Cumulative Distribution', **title_style)
    cdf_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(path_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {path_png}")

    print(f"\nPathogenicity Score Statistics:")
    print(path_scores.describe())

In [None]:
# Combined pathogenicity risk (0-10 scale): histogram plus three risk bands
if 'combined_pathogenicity_risk' in df.columns:
    print("\nCombined Pathogenicity Risk (0-10 scale):")

    risk_scores = df['combined_pathogenicity_risk'].dropna()
    median_risk = risk_scores.median()
    label_style = {'fontsize': 11, 'fontweight': 'bold'}
    risk_png = FIGURES_DIR / '09_combined_risk.png'

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.hist(risk_scores, bins=20, color='orangered', edgecolor='black', alpha=0.7)
    ax.set_xlabel('Combined Risk Score (0-10)', **label_style)
    ax.set_ylabel('Variant Count', **label_style)
    ax.set_title('Combined Pathogenicity Risk Distribution', fontsize=12, fontweight='bold')
    ax.axvline(median_risk, color='darkred', linestyle='--', linewidth=2,
               label=f'Median: {median_risk:.1f}')
    ax.legend()
    ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(risk_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {risk_png}")

    # Risk categories: half-open bands [.., 3), [3, 7), [7, ..]
    n_scored = len(risk_scores)
    low_risk = risk_scores.lt(3).sum()
    medium_risk = (risk_scores.ge(3) & risk_scores.lt(7)).sum()
    high_risk = risk_scores.ge(7).sum()

    print(f"\nRisk Categories:")
    print(f"  Low (0-3):    {low_risk:,} ({low_risk/n_scored*100:.1f}%)")
    print(f"  Medium (3-7): {medium_risk:,} ({medium_risk/n_scored*100:.1f}%)")
    print(f"  High (7-10):  {high_risk:,} ({high_risk/n_scored*100:.1f}%)")

## 6. Feature Correlation

In [None]:
# Correlation analysis over a fixed list of key clinical features
from itertools import combinations

# Pairs whose absolute Pearson r exceeds this are reported as redundant
HIGH_CORR_THRESHOLD = 0.9

# Key features for correlation
key_features = [
    'review_quality_score', 'number_submitters', 'pathogenicity_score',
    'phylop_score', 'cadd_score', 'gerp_score', 'conservation_score',
    'combined_pathogenicity_risk'
]

available_features = [feature for feature in key_features if feature in df.columns]

# A correlation matrix needs at least two features
if len(available_features) > 1:
    print(f"Computing correlation matrix for {len(available_features)} features...")

    corr_matrix = df[available_features].corr()

    # Save
    corr_matrix.to_csv(REPORTS_DIR / 'clinical_correlations.csv')
    print(f"Saved: {REPORTS_DIR / 'clinical_correlations.csv'}")

    # Heatmap
    fig, ax = plt.subplots(figsize=(12, 10))

    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1,
                cbar_kws={"shrink": 0.8}, ax=ax)

    ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()

    plt.savefig(FIGURES_DIR / '10_correlation_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved: {FIGURES_DIR / '10_correlation_heatmap.png'}")

    # High correlations: visit each unordered feature pair exactly once.
    # A NaN coefficient never passes the threshold comparison.
    print("\nHighly Correlated Pairs (|r| > 0.9):")
    high_corr = [
        {
            'Feature 1': first,
            'Feature 2': second,
            'Correlation': corr_matrix.loc[first, second],
        }
        for first, second in combinations(corr_matrix.columns, 2)
        if abs(corr_matrix.loc[first, second]) > HIGH_CORR_THRESHOLD
    ]

    if high_corr:
        for pair in high_corr:
            print(f"  {pair['Feature 1']:<30} <-> {pair['Feature 2']:<30} (r={pair['Correlation']:.3f})")
    else:
        print("  None found")

## 7. Generate EDA Report

In [None]:
# Generate comprehensive report
report_path = REPORTS_DIR / 'clinical_eda_report.txt'
# Make sure the target directory exists before opening the file for writing
report_path.parent.mkdir(parents=True, exist_ok=True)

# Glob once so the list in the report and the count printed below agree
figures = sorted(FIGURES_DIR.glob('*.png'))

# Explicit encoding keeps the report identical across platforms
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("CLINICAL ML FEATURES - EXPLORATORY DATA ANALYSIS REPORT\n")
    f.write("="*80 + "\n\n")

    f.write("Dataset Overview:\n")
    f.write("-"*80 + "\n")
    f.write(f"Sample size: {len(df):,} variants (10% sample)\n")
    f.write(f"Total columns: {len(df.columns)}\n")
    f.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n\n")

    # The class count is taken from sig_data so the heading always matches
    # the rows written underneath it
    f.write(f"Clinical Significance ({len(sig_data)}-class):\n")
    f.write("-"*80 + "\n")
    for _, row in sig_data.iterrows():
        f.write(f"  {row['Category']:20} {int(row['Count']):>10,} ({row['Percentage']:>5.1f}%)\n")
    f.write("\n")

    f.write("Binary Classification:\n")
    f.write("-"*80 + "\n")
    if binary_total > 0:
        # Labels mirror the console output of the binary-classification cell;
        # the counts are the plain pathogenic / benign totals computed there
        f.write(f"  Pathogenic:      {binary_pathogenic:,} ({binary_pathogenic/binary_total*100:.1f}%)\n")
        f.write(f"  Benign:          {binary_benign:,} ({binary_benign/binary_total*100:.1f}%)\n")
        if binary_pathogenic > 0 and binary_benign > 0:
            ratio = max(binary_pathogenic, binary_benign) / min(binary_pathogenic, binary_benign)
            f.write(f"  Class imbalance: {ratio:.2f}:1\n")
    else:
        f.write("  No pathogenic/benign variants in sample (all VUS or conversion issue)\n")
    f.write("\n")

    f.write("Review Quality:\n")
    f.write("-"*80 + "\n")
    if 'review_quality_score' in df.columns:
        quality_scores = df['review_quality_score'].dropna()
        if len(quality_scores) > 0:
            high_quality = (quality_scores >= 2).sum()
            f.write(f"  Mean quality score: {quality_scores.mean():.2f}\n")
            f.write(f"  High quality (>=2): {high_quality:,} ({high_quality/len(quality_scores)*100:.1f}%)\n")
        else:
            # Avoid writing "nan" statistics when every score is missing
            f.write("  No review quality scores available\n")
    f.write("\n")

    f.write("Conservation:\n")
    f.write("-"*80 + "\n")
    if 'is_highly_conserved' in df.columns and len(df) > 0:
        # Recomputed here so the report does not depend on the
        # conservation-levels cell having been executed
        highly_conserved = df['is_highly_conserved'].sum()
        f.write(f"  Highly conserved: {highly_conserved:,} ({highly_conserved/len(df)*100:.1f}%)\n")
    f.write("\n")

    f.write("Visualizations Generated:\n")
    f.write("-"*80 + "\n")
    for fig_path in figures:
        f.write(f"  - {fig_path.name}\n")

    f.write("\n" + "="*80 + "\n")
    f.write("EDA COMPLETE\n")
    f.write("="*80 + "\n")
    f.write("\nKey Findings:\n")

    # Findings 1 and 2 are derived from the data so the report cannot state
    # something the sample does not support
    if len(sig_data) > 0 and sig_data['Count'].sum() > 0:
        largest_category = sig_data.loc[sig_data['Count'].idxmax(), 'Category']
        f.write(f"  1. {len(sig_data)}-class distribution shows {largest_category} as largest category\n")
    else:
        f.write("  1. No variants carry a clinical significance flag in this sample\n")

    if binary_total > 0:
        pathogenic_share = binary_pathogenic / binary_total * 100
        benign_share = binary_benign / binary_total * 100
        f.write(f"  2. Binary classification split (pathogenic/benign): {pathogenic_share:.0f}/{benign_share:.0f}\n")
    else:
        f.write("  2. Binary classification not possible - no pathogenic/benign variants\n")

    f.write("  3. Conservation scores show wide variance - good predictive signal\n")
    f.write("  4. Review quality varies - may need filtering for high-confidence training\n")
    f.write("\nNext Steps:\n")
    f.write("  - Review high correlation pairs for feature selection\n")
    f.write("  - Consider filtering low-quality reviews (score < 1)\n")
    f.write("  - Proceed to Phase 1.3: Pharmacogene ML Features EDA\n")

print(f"Report saved: {report_path}")
print("\n" + "="*80)
print("PHASE 1.2 COMPLETE - Clinical ML Features EDA")
print("="*80)
print(f"\nGenerated {len(figures)} visualizations")
print(f"Figures: {FIGURES_DIR}")
print(f"Reports: {REPORTS_DIR}")
print("\nNext: Phase 1.3 - Pharmacogene ML Features EDA")
print("Open: ml_phase/01_eda/03_pharmacogene_ml_features_eda.ipynb")