In [None]:
# ====================================================================
# ULTRA-ENRICHED STATISTICAL ANALYSIS
# DNA Gene Mapping Project - 100+ Features
# Author: Sharique Mohammad
# Date: 14 January 2026
# ====================================================================
# jupyter_notebooks/05_statistical_analysis_ULTRA_ENRICHED.ipynb
# ====================================================================

In [None]:
# Import necessary libraries

import os
import warnings
from pathlib import Path
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from dotenv import load_dotenv
from scipy import stats
from sqlalchemy import create_engine

In [None]:
# Notebook-wide runtime configuration

# Silence every warning category so cell output stays uncluttered
warnings.filterwarnings(action='ignore')
# Read a local .env file (if any) into the process environment
load_dotenv()

# Plot defaults: seaborn grid theme and a 12 x 6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('Libraries imported successfully')

In [None]:
# Setup Paths

# When the kernel runs inside jupyter_notebooks/, the project root is one level up;
# otherwise the current working directory itself is used as the root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'jupyter_notebooks' else _cwd
ANALYTICAL_DIR = PROJECT_ROOT.joinpath('data', 'analytical')
VIZ_DIR = ANALYTICAL_DIR.joinpath('visualizations')

# Ensure both output folders exist (no error if they are already there)
for _out_dir in (ANALYTICAL_DIR, VIZ_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')
print(f'Analytical directory: {ANALYTICAL_DIR}')
print(f'Visualizations directory: {VIZ_DIR}')

In [None]:
# Database Connection

# Connection settings are read from environment variables (populated from .env by
# load_dotenv earlier in the notebook). Every setting has a local default except
# the password, which is None when POSTGRES_PASSWORD is unset.
DB_CONFIG = {
    'host': os.getenv('POSTGRES_HOST', 'localhost'),
    'port': int(os.getenv('POSTGRES_PORT', 5432)),
    'database': os.getenv('POSTGRES_DATABASE', 'genome_db'),
    'user': os.getenv('POSTGRES_USER', 'postgres'),
    'password': os.getenv('POSTGRES_PASSWORD')
}

# Percent-encode the credentials so characters such as '@', ':' or '/' cannot
# corrupt the URL; an unset password becomes an empty string rather than the
# literal text "None".
_db_user = quote(DB_CONFIG['user'], safe='')
_db_password = quote(DB_CONFIG['password'] or '', safe='')

engine = create_engine(
    f"postgresql://{_db_user}:{_db_password}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Raw psycopg2 connection; the port is passed explicitly so it targets the same
# server as the SQLAlchemy engine when POSTGRES_PORT is not the default.
conn = psycopg2.connect(
    host=DB_CONFIG['host'],
    port=DB_CONFIG['port'],
    database=DB_CONFIG['database'],
    user=DB_CONFIG['user'],
    password=DB_CONFIG['password']
)

print('Database connection established')

In [None]:
# Load Ultra-Enriched Data from PostgreSQL

print('Loading ultra-enriched data from PostgreSQL gold layer...')
print('='*70)

# One full-table read per gold-layer table, issued in this order through the engine
df_gene_features, df_chromosome_features, df_gene_disease, df_ml_features = (
    pd.read_sql(f'SELECT * FROM gold.{table_name}', engine)
    for table_name in (
        'gene_features',
        'chromosome_features',
        'gene_disease_association',
        'ml_features',
    )
)

# Report row and column counts for each loaded frame
gene_rows, gene_cols = df_gene_features.shape
chrom_rows, chrom_cols = df_chromosome_features.shape
assoc_rows, assoc_cols = df_gene_disease.shape
ml_rows, ml_cols = df_ml_features.shape

print(f'\nLoaded {gene_rows:,} genes with {gene_cols} columns')
print(f'Loaded {chrom_rows} chromosomes with {chrom_cols} columns')
print(f'Loaded {assoc_rows:,} gene-disease associations with {assoc_cols} columns')
print(f'Loaded {ml_rows:,} ML features with {ml_cols} columns')

print('\n' + '='*70)
print('ULTRA-ENRICHED DATA LOADED SUCCESSFULLY')
print('='*70)

In [None]:
# Ultra-Enriched Features Overview

# Column-name groups used by the later analysis cells.

# Boolean-style functional flags (is_*) and description keyword flags (has_*)
protein_types = [
    'is_kinase',
    'is_phosphatase',
    'is_receptor',
    'is_enzyme',
    'is_transporter',
    'has_glycoprotein',
    'has_receptor_keyword',
    'has_enzyme_keyword',
    'has_kinase_keyword',
    'has_binding_keyword',
]

# Category labels for cellular location (these are values, not column names)
cellular_locations = [
    'Membrane',
    'Nuclear',
    'Mitochondrial',
    'Cytoplasmic',
    'Extracellular',
    'Endoplasmic Reticulum',
    'Golgi Apparatus',
    'Lysosomal',
    'Peroxisomal',
    'Unknown',
]

# Per-gene clinical metrics
clinical_scores = [
    'mutation_count',
    'pathogenic_count',
    'pathogenic_ratio',
    'avg_clinical_actionability',
    'avg_clinical_utility',
    'avg_mutation_severity',
    'druggability_score',
]

# Per-gene variant counts split by disease category
disease_categories = [
    'cancer_variant_count',
    'syndrome_variant_count',
    'hereditary_variant_count',
    'rare_disease_variant_count',
]

n_protein = len(protein_types)
n_locations = len(cellular_locations)
n_clinical = len(clinical_scores)
n_disease = len(disease_categories)

print("Feature categories defined:")
print(f"  Protein types: {n_protein} features")
print(f"  Cellular locations: {n_locations} categories")
print(f"  Clinical scores: {n_clinical} features")
print(f"  Disease categories: {n_disease} features")

In [None]:
# Data Cleaning and Preparation

print('\n' + '='*70)
print('DATA CLEANING AND PREPARATION')
print('='*70)

# Keep only genes with a chromosome assigned. The .copy() makes df_clean an
# independent frame so the column assignment below never touches df_gene_features.
df_clean = df_gene_features[df_gene_features['chromosome'].notna()].copy()
print(f'Removed {len(df_gene_features) - len(df_clean)} genes with NULL chromosome')
print(f'Working with {len(df_clean):,} genes')

# Check pathogenic_ratio: rebuild it when the column is absent or sums to zero
ratio_absent = 'pathogenic_ratio' not in df_clean.columns
if ratio_absent or df_clean['pathogenic_ratio'].sum() == 0:
    # The pathogenic tally is looked up under either name; other cells in this
    # notebook refer to it as pathogenic_count.
    numerator_col = next(
        (col for col in ('total_pathogenic', 'pathogenic_count') if col in df_clean.columns),
        None,
    )
    if numerator_col is not None and 'mutation_count' in df_clean.columns:
        if ratio_absent:
            print('\nWARNING: pathogenic_ratio column not found')
        else:
            print('\nWARNING: All pathogenic_ratio values are 0')
        print(f'Creating pathogenic_ratio from {numerator_col}/mutation_count')
        # Mask zero mutation counts before dividing so the result cannot contain
        # inf; genes without a usable ratio (zero or missing inputs) are set to 0.0.
        safe_denominator = df_clean['mutation_count'].where(df_clean['mutation_count'] != 0)
        df_clean['pathogenic_ratio'] = (df_clean[numerator_col] / safe_denominator).fillna(0.0)
    else:
        print('\nWARNING: pathogenic_ratio cannot be rebuilt - source columns not found')

print(f'\nData ready for analysis')

In [None]:
# Data Overview

print("="*80)
print("DATA OVERVIEW")
print("="*80)

# Overall dimensions of the cleaned gene table
n_genes, n_features = df_clean.shape
print(f"\nDataset shape: {df_clean.shape}")
print(f"  Genes: {n_genes:,}")
print(f"  Features: {n_features}")

# Columns the downstream cells rely on
key_cols = [
    'gene_name', 'chromosome', 'mutation_count', 'pathogenic_count',
    'pathogenic_ratio', 'disease_count', 'avg_clinical_actionability',
    'avg_clinical_utility', 'avg_mutation_severity',
    'is_kinase', 'is_receptor', 'is_enzyme',
    'primary_function', 'biological_process', 'cellular_location', 'druggability_score'
]

# Report presence of each key column, then keep the ones that exist
print(f"\nKey columns check:")
for col in key_cols:
    print(f"  ✓ {col}" if col in df_clean.columns else f"  ✗ {col} (MISSING)")
existing_key_cols = [col for col in key_cols if col in df_clean.columns]

# Null counts, restricted to the key columns that are actually present
if not existing_key_cols:
    print("\nNo key columns found")
else:
    print(f"\nMissing values in existing key columns:")
    missing = df_clean[existing_key_cols].isnull().sum()
    if (missing > 0).any():
        print(missing[missing > 0])
    else:
        print("  No missing values in key columns")

In [None]:
# Available Columns in Dataset

print("\n" + "="*80)
print("AVAILABLE COLUMNS IN DATASET")
print("="*80)

# Numbered, alphabetically sorted listing of every column
print(f"\nTotal columns: {len(df_clean.columns)}")
print("\nColumn names:")
for position, col in enumerate(sorted(df_clean.columns), start=1):
    print(f"  {position:2d}. {col}")

# Expected column groups whose presence is reported below
expected_functional = ['is_kinase', 'is_phosphatase', 'is_receptor', 'is_enzyme', 'is_transporter']
expected_clinical = ['mutation_count', 'pathogenic_count', 'pathogenic_ratio',
                     'avg_clinical_actionability', 'avg_clinical_utility', 'avg_mutation_severity']

# Same presence report for each group: a tick when the column exists, a cross otherwise
for heading, expected_cols in (
    ("\nFunctional protein columns:", expected_functional),
    ("\nClinical score columns:", expected_clinical),
):
    print(heading)
    for col in expected_cols:
        status = "✓" if col in df_clean.columns else "✗"
        print(f"  {status} {col}")

In [None]:
# Missing Value Analysis

print('\n' + '='*70)
print('MISSING VALUE ANALYSIS')
print('='*70)

# Null count per column, computed once and reused for the percentage
null_counts = df_clean.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df_clean.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df_clean) * 100).round(2)
})

# Keep only columns that have at least one null, worst offenders first
has_nulls = missing_data['Missing_Count'] > 0
missing_data = missing_data[has_nulls].sort_values('Missing_Count', ascending=False)

if len(missing_data) == 0:
    print('\nNo missing values found')
else:
    print('\nColumns with missing values (top 20):')
    print(missing_data.head(20))

In [None]:
# Functional Protein Type Distribution

print("="*70)
print("FUNCTIONAL PROTEIN TYPE DISTRIBUTION")
print("="*70)

# Flag columns to tally; each one is checked for existence before use
protein_types = [
    'is_kinase', 'is_phosphatase', 'is_receptor', 'is_enzyme', 'is_transporter',
    'has_glycoprotein', 'has_receptor_keyword', 'has_enzyme_keyword', 
    'has_kinase_keyword', 'has_binding_keyword'
]

# Number of flagged genes per protein-type column (column sum)
protein_counts = {}
for ptype in protein_types:
    if ptype in df_clean.columns:
        count = df_clean[ptype].sum()
        protein_counts[ptype] = count
        print(f"  {ptype}: {count:,}")
    else:
        print(f"  {ptype}: COLUMN NOT FOUND")

if protein_counts:
    plt.figure(figsize=(12, 6))
    plt.bar(range(len(protein_counts)), list(protein_counts.values()))
    plt.xticks(range(len(protein_counts)), list(protein_counts.keys()), rotation=45, ha='right')
    plt.ylabel('Number of Genes')
    plt.title('Distribution of Functional Protein Types')
    plt.tight_layout()
    plt.savefig(VIZ_DIR / 'protein_type_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Count each gene once even when several flags are set: summing the
    # per-flag totals would count a gene once per flag it carries.
    annotated_genes = int(df_clean[list(protein_counts)].eq(True).any(axis=1).sum())
    print(f"\nTotal genes with functional annotations: {annotated_genes:,}")
else:
    print("No protein type columns found")

In [None]:
# Mutation Metrics by Protein Type

print('\n' + '='*70)
print('AVERAGE MUTATIONS BY PROTEIN TYPE')
print('='*70)

# One record per protein-type flag that exists and has at least one flagged gene
protein_mutation_data = []
for flag_col in protein_types[:10]:
    if flag_col not in df_clean.columns:
        continue
    flagged = df_clean[df_clean[flag_col] == True]
    if len(flagged) == 0:
        continue
    mean_mutations = flagged['mutation_count'].mean()
    mean_ratio = flagged['pathogenic_ratio'].mean()
    # Display label: drop the "is_" text, underscores become spaces, title case
    display_name = flag_col.replace('is_', '').replace('_', ' ').title()
    print(f'{display_name:30} Mutations: {mean_mutations:7.1f}  Pathogenic: {mean_ratio:.4f}')
    protein_mutation_data.append({
        'Protein Type': display_name,
        'Avg Mutations': mean_mutations,
        'Avg Pathogenic Ratio': mean_ratio
    })

# Visualization: two side-by-side horizontal bar charts sharing the same layout
if protein_mutation_data:
    mutation_df = pd.DataFrame(protein_mutation_data)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    panel_specs = (
        ('Avg Mutations', 'steelblue', 'Average Mutation Count',
         'Average Mutations by Protein Type'),
        ('Avg Pathogenic Ratio', 'coral', 'Average Pathogenic Ratio',
         'Average Pathogenic Ratio by Protein Type'),
    )
    for ax, (value_col, bar_color, x_label, panel_title) in zip(axes, panel_specs):
        ax.barh(mutation_df['Protein Type'], mutation_df[value_col], color=bar_color)
        ax.set_xlabel(x_label)
        ax.set_title(panel_title)
        # First record at the top of the chart
        ax.invert_yaxis()

    plt.tight_layout()
    plt.savefig(VIZ_DIR / '02_protein_mutations.png', dpi=300, bbox_inches='tight')
    plt.show()
    print('\nSaved: 02_protein_mutations.png')

In [None]:
# Cellular Location Distribution

print("="*80)
print("CELLULAR LOCATION DISTRIBUTION")
print("="*80)

if 'cellular_location' not in df_clean.columns:
    # Nothing to analyse without the column; point at the script named in the message
    print("ERROR: cellular_location column not found!")
    print("Please run 02_gene_data_processing_COMPLETE.py first")
else:
    # Genes per location, most frequent first
    location_counts = df_clean['cellular_location'].value_counts()

    plt.figure(figsize=(10, 6))
    location_counts.plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title('Gene Distribution by Cellular Location', fontsize=14, fontweight='bold')
    plt.xlabel('Cellular Location', fontsize=12)
    plt.ylabel('Number of Genes', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(VIZ_DIR / 'cellular_location_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Text summary; percentages are relative to all genes in df_clean
    total_genes = len(df_clean)
    print(f"\nCellular Location Distribution:")
    for location, count in location_counts.items():
        pct = count / total_genes * 100
        print(f"  {location}: {count:,} genes ({pct:.1f}%)")

    print(f"\nTotal locations: {len(location_counts)}")


In [None]:
# Cellular Location Distribution
# Skip notice: printed only when the cellular_location column is really absent
# from df_clean, so it can no longer contradict a cell that ran the analysis.

if 'cellular_location' not in df_clean.columns:
    print("Cellular location analysis skipped - column doesn't exist in current schema")
    print("To enable this analysis, add cellular_location column to gene processing pipeline")

In [None]:
# Druggability Score Analysis

print("="*80)
print("DRUGGABILITY SCORE ANALYSIS")
print("="*80)

if 'druggability_score' not in df_clean.columns:
    print("druggability_score column not found")
else:
    drug_scores = df_clean['druggability_score']

    # Descriptive statistics of the score column
    print(f"\nDruggability Score Statistics:")
    print(drug_scores.describe())

    # Genes scoring strictly above 0.7, as a count and as a share of all genes
    highly_druggable = df_clean[drug_scores > 0.7]
    n_highly_druggable = len(highly_druggable)
    print(f"\nHighly druggable genes (score > 0.7): {n_highly_druggable:,}")
    print(f"Percentage: {n_highly_druggable / len(df_clean) * 100:.1f}%")

    # Most common primary functions among the highly druggable genes
    if 'primary_function' in df_clean.columns:
        print("\nTop druggable gene classes:")
        print(highly_druggable['primary_function'].value_counts().head(10))

    # Histogram of all scores with the 0.7 threshold marked
    plt.figure(figsize=(10, 6))
    plt.hist(drug_scores, bins=30, edgecolor='black')
    plt.xlabel('Druggability Score')
    plt.ylabel('Number of Genes')
    plt.title('Distribution of Druggability Scores')
    plt.axvline(x=0.7, color='r', linestyle='--', label='High Druggability Threshold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(VIZ_DIR / 'druggability_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Druggability vs Clinical Utility Scatter Plot

print("="*80)
print("DRUGGABILITY VS CLINICAL UTILITY")
print("="*80)

if not ('druggability_score' in df_clean.columns and 'avg_clinical_utility' in df_clean.columns):
    print("ERROR: Required columns not found!")
    print("Please run 02_gene_data_processing_COMPLETE.py and 05_feature_engineering_COMPLETE.py first")
else:
    # One point per gene
    plt.figure(figsize=(10, 6))
    plt.scatter(
        df_clean['druggability_score'],
        df_clean['avg_clinical_utility'],
        alpha=0.5,
        s=30,
        color='purple',
        edgecolor='black',
        linewidth=0.5,
    )
    plt.xlabel('Druggability Score', fontsize=12)
    plt.ylabel('Average Clinical Utility', fontsize=12)
    plt.title('Druggability vs Clinical Utility', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(VIZ_DIR / 'druggability_clinical_utility.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Off-diagonal entry of the 2x2 correlation matrix of the two columns
    pair_corr = df_clean[['druggability_score', 'avg_clinical_utility']].corr()
    correlation = pair_corr.iloc[0, 1]
    print(f"\nCorrelation between druggability and clinical utility: {correlation:.3f}")

    # Label by magnitude: above 0.3, above 0.1, or neither
    magnitude = abs(correlation)
    if magnitude > 0.3:
        verdict = f"Result: MODERATE correlation detected"
    elif magnitude > 0.1:
        verdict = f"Result: WEAK correlation detected"
    else:
        verdict = f"Result: NO meaningful correlation"
    print(verdict)


In [None]:
# Druggability vs Clinical Utility Scatter Plot
# Skip notice: printed only when the druggability_score column is really absent
# from df_clean, so it can no longer contradict a cell that drew the plot.

if 'druggability_score' not in df_clean.columns:
    print("Druggability scatter plot skipped - column doesn't exist in current schema")
    print("To enable this plot, add druggability_score column to feature engineering pipeline")

In [None]:
# Kinase-Cancer Association Analysis

print("="*80)
print("KINASE-CANCER ASSOCIATION ANALYSIS")
print("="*80)

if 'is_kinase' in df_clean.columns and 'cancer_variant_count' in df_clean.columns:
    # Rows whose is_kinase value is neither True nor False (e.g. NULL) land in neither group
    kinases = df_clean[df_clean['is_kinase'] == True]
    non_kinases = df_clean[df_clean['is_kinase'] == False]
    
    print(f"\nKinases: {len(kinases):,}")
    print(f"Non-kinases: {len(non_kinases):,}")
    
    print(f"\nAverage cancer variants:")
    print(f"  Kinases: {kinases['cancer_variant_count'].mean():.2f}")
    print(f"  Non-kinases: {non_kinases['cancer_variant_count'].mean():.2f}")
    
    # Statistical test on the non-null counts of each group
    kinase_counts = kinases['cancer_variant_count'].dropna()
    non_kinase_counts = non_kinases['cancer_variant_count'].dropna()
    
    if len(kinase_counts) == 0 or len(non_kinase_counts) == 0:
        # The test is undefined when either sample is empty
        print("\nMann-Whitney U test skipped: one of the groups has no observations")
    else:
        # Uses the notebook-level scipy.stats import; the two-sided alternative is
        # spelled out so the result does not depend on the SciPy version's default.
        stat, pval = stats.mannwhitneyu(
            kinase_counts,
            non_kinase_counts,
            alternative='two-sided'
        )
        
        print(f"\nMann-Whitney U test:")
        print(f"  Statistic: {stat:.2f}")
        print(f"  P-value: {pval:.4e}")
        
        if pval < 0.05:
            print("  Result: Significant association between kinases and cancer variants")
        else:
            print("  Result: No significant association")
else:
    print("Required columns not found")

In [None]:
# Distribution Analysis - Mutation Counts

print('\n' + '='*70)
print('DISTRIBUTION ANALYSIS - MUTATION COUNTS')
print('='*70)

# 2x2 grid: mutation count, pathogenic ratio, risk score, risk level
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Mutation Count Distribution
df_clean['mutation_count'].hist(bins=50, ax=axes[0, 0], color='skyblue', edgecolor='black')
axes[0, 0].set_title('Mutation Count Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Mutation Count')
axes[0, 0].set_ylabel('Frequency')

# Pathogenic Ratio Distribution
# Infinite or missing ratios (e.g. from a division by a zero mutation count) are
# dropped first, because a non-finite value makes the histogram range invalid.
finite_ratio = df_clean['pathogenic_ratio'].replace([np.inf, -np.inf], np.nan).dropna()
finite_ratio.hist(bins=50, ax=axes[0, 1], color='salmon', edgecolor='black')
axes[0, 1].set_title('Pathogenic Ratio Distribution', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Pathogenic Ratio')
axes[0, 1].set_ylabel('Frequency')

# Risk Score Distribution (optional column)
if 'risk_score' in df_clean.columns:
    df_clean['risk_score'].hist(bins=50, ax=axes[1, 0], color='lightgreen', edgecolor='black')
    axes[1, 0].set_title('Risk Score Distribution', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Risk Score')
    axes[1, 0].set_ylabel('Frequency')
else:
    # Hide the unused panel instead of saving a blank set of axes
    axes[1, 0].set_visible(False)

# Risk Level Distribution (optional column)
if 'risk_level' in df_clean.columns:
    df_clean['risk_level'].value_counts().plot(kind='bar', ax=axes[1, 1], color='orange', edgecolor='black')
    axes[1, 1].set_title('Risk Level Distribution', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Risk Level')
    axes[1, 1].set_ylabel('Count')
    axes[1, 1].tick_params(axis='x', rotation=0)
else:
    # Hide the unused panel instead of saving a blank set of axes
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.savefig(VIZ_DIR / '09_distribution_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: 09_distribution_analysis.png')

In [None]:
# Hypothesis Test 2 - High vs Low Druggability Clinical Utility

print("="*80)
print("HYPOTHESIS TEST: HIGH VS LOW DRUGGABILITY")
print("="*80)

if 'druggability_score' in df_clean.columns and 'avg_clinical_utility' in df_clean.columns:
    # Split by median druggability; genes exactly at the median go to the high
    # group, and genes with a missing score fall into neither group.
    median_drug = df_clean['druggability_score'].median()
    high_drug = df_clean[df_clean['druggability_score'] >= median_drug]
    low_drug = df_clean[df_clean['druggability_score'] < median_drug]
    
    print(f"\nH0: High and low druggability genes have same clinical utility")
    print(f"H1: High druggability genes have higher clinical utility")
    
    # Non-null utility values of each group, as fed to the test
    high_utility = high_drug['avg_clinical_utility'].dropna()
    low_utility = low_drug['avg_clinical_utility'].dropna()
    
    if len(high_utility) == 0 or len(low_utility) == 0:
        # Happens, for example, when every score equals the median (empty low group);
        # the test is undefined on an empty sample.
        print(f"\nTest skipped: one of the druggability groups has no clinical utility values")
        print(f"  High druggability genes: {len(high_drug):,}")
        print(f"  Low druggability genes: {len(low_drug):,}")
    else:
        # One-sided Mann-Whitney U test: is utility in the high group stochastically greater?
        stat, pval = stats.mannwhitneyu(
            high_utility,
            low_utility,
            alternative='greater'
        )
        
        # Group means computed once (mean() skips nulls, so dropna does not change them)
        high_mean = high_utility.mean()
        low_mean = low_utility.mean()
        
        print(f"\nResults:")
        print(f"  Median druggability threshold: {median_drug:.2f}")
        print(f"  High druggability genes: {len(high_drug):,}")
        print(f"  Low druggability genes: {len(low_drug):,}")
        print(f"  Test statistic: {stat:,.0f}")
        print(f"  P-value: {pval:.4e}")
        print(f"  High drug avg utility: {high_mean:.3f}")
        print(f"  Low drug avg utility: {low_mean:.3f}")
        print(f"  Difference: {high_mean - low_mean:.3f}")
        
        if pval < 0.05:
            print(f"\nConclusion: REJECT H0 (p < 0.05)")
            print(f"  High druggability genes have significantly higher clinical utility")
        else:
            print(f"\nConclusion: FAIL TO REJECT H0 (p >= 0.05)")
            print(f"  No significant difference in clinical utility")
else:
    print("ERROR: Required columns not found!")
    print("Please run 02_gene_data_processing_COMPLETE.py and 05_feature_engineering_COMPLETE.py first")


In [None]:
# Hypothesis Test 2 - High vs Low Druggability Clinical Utility
# Skip notice: printed only when the druggability_score column is really absent
# from df_clean, so it can no longer contradict a cell that ran the test.

if 'druggability_score' not in df_clean.columns:
    print("Druggability hypothesis test skipped - column doesn't exist in current schema")
    print("To enable this test, add druggability_score column to feature engineering pipeline")

In [None]:
# Hypothesis Test 3 - ANOVA: Mutation Counts by Functional Type

print('\n' + '='*70)
print('HYPOTHESIS TEST 3: ANOVA')
print('Do different functional types have different mutation counts?')
print('='*70)

# One sample of mutation counts per functional flag (first five protein_types
# entries), kept only when the flag column exists and has more than 10 genes.
# NOTE(review): a gene may carry several flags at once, so the groups can overlap,
# whereas one-way ANOVA assumes independent samples - confirm this is acceptable.
groups = []
labels = []

for ptype in protein_types[:5]:
    if ptype in df_clean.columns:
        group_data = df_clean[df_clean[ptype] == True]['mutation_count'].dropna()
        if len(group_data) > 10:
            groups.append(group_data)
            labels.append(ptype.replace('is_', '').replace('_', ' ').title())

if len(groups) >= 2:
    f_stat, p_value3 = stats.f_oneway(*groups)
    
    print('\nGroup Means:')
    for label, group in zip(labels, groups):
        print(f'  {label:30} Mean: {group.mean():7.1f} (n={len(group)})')
    
    print(f'\nF-statistic: {f_stat:.4f}')
    print(f'P-value: {p_value3:.6f}')
    
    if p_value3 < 0.05:
        print(f'\nConclusion: REJECT null hypothesis')
        print('Significant difference between functional groups')
    else:
        print(f'\nConclusion: FAIL TO REJECT null hypothesis')
        print('No significant difference')
else:
    # Report the skip explicitly instead of ending the cell without a result
    print(f'\nANOVA skipped: only {len(groups)} functional group(s) with more than 10 genes')

In [None]:
# Correlation Analysis

print("="*80)
print("CORRELATION ANALYSIS")
print("="*80)

# Candidate metrics for the correlation matrix
corr_cols = [
    'mutation_count', 'pathogenic_count', 'pathogenic_ratio',
    'disease_count', 'avg_clinical_actionability', 'avg_clinical_utility',
    'avg_mutation_severity', 'severe_mutation_ratio'
]

# Filter to only columns that exist
existing_corr_cols = [col for col in corr_cols if col in df_clean.columns]

if len(existing_corr_cols) > 1:
    corr_matrix = df_clean[existing_corr_cols].corr()
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig(VIZ_DIR / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"\nCorrelation matrix shape: {corr_matrix.shape}")
    print(f"\nStrongest correlations:")
    
    # Keep only the strict upper triangle so each pair appears once and the
    # diagonal of self-correlations is excluded
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    corr_pairs = corr_matrix.where(upper_mask)
    
    # Flatten and rank by absolute value, so a strong negative correlation counts
    # as "strong" too; the printed values keep their sign.
    corr_flat = corr_pairs.unstack().dropna()
    strongest_first = corr_flat.abs().sort_values(ascending=False).index
    corr_flat = corr_flat.loc[strongest_first]
    print(corr_flat.head(10))
else:
    # The save-results cell checks for None before writing the matrix to disk
    corr_matrix = None
    print("Not enough columns for correlation analysis")

In [None]:
# Disease Association Analysis

print('\n' + '='*70)
print('DISEASE ASSOCIATION ANALYSIS')
print('='*70)

# Rows per disease in the association table, largest first, top 20 only
rows_per_disease = df_gene_disease.groupby('disease').size()
top_diseases = rows_per_disease.sort_values(ascending=False).head(20)
print('\nTop 20 Diseases by Gene Count:')
print(top_diseases)

# Horizontal bar chart; the y axis is inverted so the largest bar is on top
plt.figure(figsize=(14, 10))
top_diseases.plot(kind='barh', color='teal')
plt.title('Top 20 Diseases by Number of Associated Genes', fontsize=14, fontweight='bold')
plt.xlabel('Number of Genes')
plt.ylabel('Disease')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(VIZ_DIR / '11_top_diseases.png', dpi=300, bbox_inches='tight')
plt.show()
print('\nSaved: 11_top_diseases.png')

# Association strength breakdown, only when that column is present
if 'association_strength' in df_gene_disease.columns:
    # Tally once and reuse for both the printout and the chart
    strength_counts = df_gene_disease['association_strength'].value_counts()
    print('\nAssociation Strength Distribution:')
    print(strength_counts)

    plt.figure(figsize=(10, 6))
    strength_counts.plot(kind='bar', color=['green', 'orange', 'red'])
    plt.title('Gene-Disease Association Strength Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Association Strength')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig(VIZ_DIR / '12_association_strength.png', dpi=300, bbox_inches='tight')
    plt.show()
    print('\nSaved: 12_association_strength.png')

In [None]:
# Summary Statistics Report

print("="*80)
print("SUMMARY STATISTICS REPORT")
print("="*80)

# Metrics to summarise
summary_cols = [
    'mutation_count', 'pathogenic_count', 'pathogenic_ratio',
    'disease_count', 'avg_clinical_actionability', 'avg_clinical_utility',
    'avg_mutation_severity'
]

# Restrict to the metrics present in df_clean
existing_summary_cols = [col for col in summary_cols if col in df_clean.columns]

if not existing_summary_cols:
    print("No summary columns found")
else:
    summary_stats = df_clean[existing_summary_cols].describe()

    print("\nSummary statistics:")
    print(summary_stats)

    # Write the describe() table next to the other analytical outputs
    summary_path = ANALYTICAL_DIR / 'summary_statistics.csv'
    summary_stats.to_csv(summary_path)
    print(f"\nSummary statistics saved to: {summary_path}")

print("\n" + "="*80)
print("STATISTICAL ANALYSIS COMPLETE")
print("="*80)

In [None]:
# Save Analysis Results

print('\n' + '='*70)
print('SAVING ANALYSIS RESULTS')
print('='*70)

# The correlation matrix is written only if the correlation cell ran and produced one
if 'corr_matrix' not in locals() or corr_matrix is None:
    print('Correlation matrix not available to save')
else:
    corr_csv_path = ANALYTICAL_DIR / "correlation_matrix_ultra_enriched.csv"
    corr_matrix.to_csv(corr_csv_path)
    print(f'Saved: {corr_csv_path}')

print('\n' + '='*70)
print('ULTRA-ENRICHED STATISTICAL ANALYSIS COMPLETE')
print('='*70)
print(f'\nVisualizations saved in: {VIZ_DIR}')
print(f'Summary reports saved in: {ANALYTICAL_DIR}')

# Closing summary; the gene and feature counts come from the final df_clean
n_genes_analyzed = len(df_clean)
n_features_analyzed = len(df_clean.columns)
print('\nKey Findings:')
print(f'  - Analyzed {n_genes_analyzed:,} genes with {n_features_analyzed} features')
print(f'  - Identified functional protein types')
print(f'  - Mapped disease categories')
print(f'  - Assessed clinical utility and mutation severity')

# Close the psycopg2 connection if the connection cell created one
if 'conn' in locals():
    conn.close()
    print('\nDatabase connection closed')