# 📊 VEP Visualization and Export Tutorial

This tutorial focuses on creating comprehensive visualizations and exporting results from Variant Effect Prediction (VEP) analysis using OmniGenBench.

## Overview

In this tutorial, we'll explore:
- Advanced visualization techniques for variant effect analysis
- Statistical analysis and interpretation of results
- Multiple export formats for downstream analysis
- Publication-ready plots and reports

## 1. Setup and Data Loading

First, let's set up our environment and generate synthetic sample VEP results to visualize. In a real workflow, you would load the results of a previous VEP analysis instead.

In [None]:
# Import visualization and analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from typing import Dict, List, Optional
from pathlib import Path
import json

# Statistical analysis
from scipy import stats
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Suppress all warnings so the tutorial output stays uncluttered
warnings.filterwarnings('ignore')

# Plotting defaults: reset Matplotlib's style, pick the seaborn palette,
# then apply the figure size and font size in a single rcParams update
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Status banner
for banner_line in (
    "✅ Visualization libraries imported successfully!",
    "📊 Ready for VEP result analysis and visualization",
):
    print(banner_line)

In [None]:
# Create sample VEP results for demonstration
# In practice, this would be loaded from previous analysis
def create_sample_vep_data(n_variants: int = 200, seed: int = 42,
                           threshold: float = 0.15) -> pd.DataFrame:
    """
    Create sample VEP analysis results for demonstration.

    Args:
        n_variants: Number of variants to simulate
        seed: Seed for the private random generator (same seed, same data)
        threshold: Cosine score above which a variant is predicted functional

    Returns:
        DataFrame with one row per simulated variant. The column set is the
        same even when ``n_variants`` is 0, so downstream column lookups
        keep working on an empty frame.
    """
    # A private generator keeps the data reproducible without reseeding
    # NumPy's global random state as a side effect of calling this function
    rng = np.random.RandomState(seed)

    # Variant types and their properties
    variant_types = ['SNV_transition', 'SNV_transversion', 'insertion', 'deletion']
    chromosomes = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY']
    bases = ['A', 'T', 'G', 'C']
    genomic_regions = ['exonic', 'intronic', 'intergenic', 'promoter', 'UTR']
    columns = [
        'variant_id', 'chromosome', 'position', 'variant_type',
        'ref_allele', 'alt_allele', 'cosine_score', 'euclidean_score',
        'manhattan_score', 'predicted_functional', 'true_functional',
        'confidence', 'genomic_region', 'population_frequency',
    ]

    data = []
    for i in range(n_variants):
        # Generate variant properties
        var_type = rng.choice(variant_types)
        chromosome = rng.choice(chromosomes)
        position = rng.randint(1000000, 250000000)

        # Generate effect scores based on variant type
        if var_type.startswith('SNV'):
            base_score = rng.beta(2, 8)  # Most SNVs have low effect
            if 'transition' in var_type:
                base_score *= 0.8  # Transitions generally less disruptive
        else:  # INDELs
            base_score = rng.beta(3, 5)  # INDELs more likely to be functional

        # Add some noise and keep the cosine score inside [0, 1]
        cosine_score = np.clip(base_score + rng.normal(0, 0.05), 0, 1)
        euclidean_score = cosine_score * rng.uniform(10, 100)
        manhattan_score = cosine_score * rng.uniform(50, 500)

        # Functional prediction is a plain threshold on the cosine score
        predicted_functional = bool(cosine_score > threshold)

        # Functional calls get a higher confidence range than neutral ones
        if predicted_functional:
            confidence = rng.uniform(0.6, 0.95)
        else:
            confidence = rng.uniform(0.3, 0.7)

        # Simulate true labels: copy the prediction, then flip 10% of them
        true_functional = predicted_functional
        if rng.random_sample() < 0.1:
            true_functional = not true_functional

        # The alternate allele must differ from the reference allele,
        # otherwise the record would not describe a variant at all
        ref_allele = rng.choice(bases)
        alt_allele = rng.choice([base for base in bases if base != ref_allele])

        data.append({
            'variant_id': f'var_{i+1}',
            'chromosome': chromosome,
            'position': position,
            'variant_type': var_type,
            'ref_allele': ref_allele,
            'alt_allele': alt_allele,
            'cosine_score': cosine_score,
            'euclidean_score': euclidean_score,
            'manhattan_score': manhattan_score,
            'predicted_functional': predicted_functional,
            'true_functional': true_functional,
            'confidence': confidence,
            'genomic_region': rng.choice(genomic_regions),
            'population_frequency': rng.exponential(0.01),  # Most variants are rare
        })

    # Passing the column list guarantees the schema even for zero rows
    return pd.DataFrame(data, columns=columns)

# Build the demonstration dataset used by every later cell
vep_results = create_sample_vep_data(200)

# Headline numbers for the generated dataset
print("📊 Sample VEP Dataset Created:")
print(f"  🧬 Total variants: {len(vep_results)}")
print(f"  ⚡ Predicted functional: {vep_results['predicted_functional'].sum()}")
print(f"  📈 Mean cosine score: {vep_results['cosine_score'].mean():.4f}")
print(f"  🎯 Variant types: {vep_results['variant_type'].value_counts().to_dict()}")

# First five rows, restricted to the most informative columns
preview_columns = ['variant_id', 'variant_type', 'cosine_score', 'predicted_functional']
print("\n🔍 Sample data preview:")
print(vep_results.head()[preview_columns].to_string(index=False))

## 2. Basic Statistical Visualizations

Create fundamental visualizations to understand the distribution and characteristics of variant effect scores.

In [None]:
# Create comprehensive statistical visualizations
# Six-panel Matplotlib overview (2 rows x 3 columns) of the `vep_results` frame.
# This cell also defines `functional_scores` and `neutral_scores`, which the
# publication-figure cell later reads, so it must run before that cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('🧬 VEP Analysis: Statistical Overview', fontsize=16, fontweight='bold')

# 1. Effect Score Distribution
# Histogram of all cosine scores with the mean (red) and the 0.15 cut-off
# (orange) that create_sample_vep_data uses to derive `predicted_functional`.
axes[0, 0].hist(vep_results['cosine_score'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].axvline(vep_results['cosine_score'].mean(), color='red', linestyle='--', 
                   label=f'Mean: {vep_results["cosine_score"].mean():.3f}')
axes[0, 0].axvline(0.15, color='orange', linestyle='--', label='Threshold: 0.15')
axes[0, 0].set_xlabel('Cosine Distance Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('📊 Effect Score Distribution')
axes[0, 0].legend()

# 2. Functional vs Neutral Scores
# Split the scores by the boolean `predicted_functional` column.
functional_scores = vep_results[vep_results['predicted_functional']]['cosine_score']
neutral_scores = vep_results[~vep_results['predicted_functional']]['cosine_score']

# Passing a list of two series draws both histograms over shared bins.
axes[0, 1].hist([functional_scores, neutral_scores], bins=20, alpha=0.7,
                label=['Functional', 'Neutral'], color=['coral', 'lightblue'])
axes[0, 1].set_xlabel('Cosine Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('⚡ Functional vs Neutral Distribution')
axes[0, 1].legend()

# 3. Variant Type Effect Scores
# One score array per variant type, in order of first appearance (`unique()`),
# so the arrays line up with the tick labels passed to boxplot.
variant_type_scores = [vep_results[vep_results['variant_type'] == vt]['cosine_score'].values 
                      for vt in vep_results['variant_type'].unique()]
axes[0, 2].boxplot(variant_type_scores, labels=vep_results['variant_type'].unique())
axes[0, 2].set_xlabel('Variant Type')
axes[0, 2].set_ylabel('Cosine Score')
axes[0, 2].set_title('🔄 Effect Scores by Variant Type')
axes[0, 2].tick_params(axis='x', rotation=45)

# 4. Chromosome Distribution
# Only the ten chromosomes with the most variants are shown.
chrom_counts = vep_results['chromosome'].value_counts().head(10)
axes[1, 0].bar(range(len(chrom_counts)), chrom_counts.values, color='lightgreen')
axes[1, 0].set_xticks(range(len(chrom_counts)))
axes[1, 0].set_xticklabels(chrom_counts.index, rotation=45)
axes[1, 0].set_xlabel('Chromosome')
axes[1, 0].set_ylabel('Variant Count')
axes[1, 0].set_title('🗺️ Variant Distribution by Chromosome')

# 5. Score Correlation Matrix
# Pairwise correlations of the three distance scores and the confidence column.
score_cols = ['cosine_score', 'euclidean_score', 'manhattan_score', 'confidence']
corr_matrix = vep_results[score_cols].corr()
im = axes[1, 1].imshow(corr_matrix, cmap='coolwarm', aspect='auto')
axes[1, 1].set_xticks(range(len(score_cols)))
axes[1, 1].set_yticks(range(len(score_cols)))
axes[1, 1].set_xticklabels([col.replace('_', ' ').title() for col in score_cols])
axes[1, 1].set_yticklabels([col.replace('_', ' ').title() for col in score_cols])
axes[1, 1].set_title('🔗 Score Correlations')

# Add correlation values
# Text switches to white for |r| > 0.5 so it stays readable on saturated cells.
for i in range(len(score_cols)):
    for j in range(len(score_cols)):
        axes[1, 1].text(j, i, f'{corr_matrix.iloc[i, j]:.2f}', 
                        ha='center', va='center', color='white' if abs(corr_matrix.iloc[i, j]) > 0.5 else 'black')

# 6. Population Frequency vs Effect Score
# Points are coloured by the boolean prediction; the x-axis is logarithmic.
scatter = axes[1, 2].scatter(vep_results['population_frequency'], vep_results['cosine_score'],
                            c=vep_results['predicted_functional'], cmap='RdYlBu_r', alpha=0.6)
axes[1, 2].set_xlabel('Population Frequency')
axes[1, 2].set_ylabel('Cosine Score')
axes[1, 2].set_title('📈 Effect Score vs Population Frequency')
axes[1, 2].set_xscale('log')
plt.colorbar(scatter, ax=axes[1, 2], label='Functional Prediction')

plt.tight_layout()
plt.show()

print("✅ Basic statistical visualizations created!")

## 3. Advanced Interactive Visualizations

Create interactive plots using Plotly for better data exploration.

In [None]:
# Build and display three interactive Plotly figures from `vep_results`
print("🚀 Creating interactive visualizations...")

# 1. Cosine vs Euclidean score, coloured by variant type and sized by confidence
scatter_axis_labels = {
    'cosine_score': 'Cosine Distance Score',
    'euclidean_score': 'Euclidean Distance Score',
    'variant_type': 'Variant Type',
}
scatter_hover_columns = ['variant_id', 'chromosome', 'position', 'predicted_functional']
fig_scatter = px.scatter(
    vep_results,
    x='cosine_score',
    y='euclidean_score',
    color='variant_type',
    size='confidence',
    hover_data=scatter_hover_columns,
    title='🧬 Interactive VEP Score Analysis',
    labels=scatter_axis_labels,
)
# Reference lines: mean Euclidean score (horizontal) and the 0.15 cut-off (vertical)
mean_euclidean_score = vep_results['euclidean_score'].mean()
fig_scatter.add_hline(
    y=mean_euclidean_score,
    line_dash="dash",
    annotation_text="Mean Euclidean Score",
)
fig_scatter.add_vline(
    x=0.15,
    line_dash="dash",
    line_color="red",
    annotation_text="Functional Threshold",
)
fig_scatter.show()

# 2. Effect score along each chromosome, one facet per chromosome (6 per row)
genome_axis_labels = {
    'position': 'Genomic Position',
    'cosine_score': 'Effect Score',
    'predicted_functional': 'Functional Prediction',
}
fig_genome = px.scatter(
    vep_results,
    x='position',
    y='cosine_score',
    color='predicted_functional',
    facet_col='chromosome',
    facet_col_wrap=6,
    title='🗺️ Genomic Distribution of Variant Effects',
    labels=genome_axis_labels,
    height=800,
)
fig_genome.show()

# 3. Effect score per genomic region, split by variant type
box_axis_labels = {
    'genomic_region': 'Genomic Region',
    'cosine_score': 'Effect Score',
    'variant_type': 'Variant Type',
}
fig_box = px.box(
    vep_results,
    x='genomic_region',
    y='cosine_score',
    color='variant_type',
    title='📊 Effect Scores by Genomic Region and Variant Type',
    labels=box_axis_labels,
)
fig_box.show()

print("✅ Interactive visualizations created!")

## 4. Performance Analysis and ROC Curves

Analyze prediction performance using ROC curves and classification metrics.

In [None]:
# Analyze prediction performance
# Compares `predicted_functional` / `cosine_score` with the simulated
# `true_functional` labels. The names defined here (fpr, tpr, roc_auc,
# precision, recall, avg_precision, cm, tn/fp/fn/tp, sensitivity, specificity,
# ppv, npv) are read by later cells and functions, so run this cell first.
print("📈 Analyzing prediction performance...")

# Calculate ROC curve
# The continuous cosine score is the ranking signal; auc() integrates the curve.
fpr, tpr, thresholds = roc_curve(vep_results['true_functional'], vep_results['cosine_score'])
roc_auc = auc(fpr, tpr)

# Create performance analysis plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('🎯 VEP Prediction Performance Analysis', fontsize=16, fontweight='bold')

# 1. ROC Curve
# The dashed diagonal marks a classifier that guesses at random.
axes[0, 0].plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
axes[0, 0].plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random Classifier')
axes[0, 0].set_xlabel('False Positive Rate')
axes[0, 0].set_ylabel('True Positive Rate')
axes[0, 0].set_title('📊 ROC Curve Analysis')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Precision-Recall Analysis
from sklearn.metrics import precision_recall_curve, average_precision_score

precision, recall, pr_thresholds = precision_recall_curve(vep_results['true_functional'], vep_results['cosine_score'])
avg_precision = average_precision_score(vep_results['true_functional'], vep_results['cosine_score'])

axes[0, 1].plot(recall, precision, color='red', lw=2, label=f'PR Curve (AP = {avg_precision:.3f})')
axes[0, 1].set_xlabel('Recall')
axes[0, 1].set_ylabel('Precision')
axes[0, 1].set_title('📈 Precision-Recall Curve')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Confusion Matrix
# Built from the thresholded predictions, not from the continuous score.
cm = confusion_matrix(vep_results['true_functional'], vep_results['predicted_functional'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0],
            xticklabels=['Predicted Neutral', 'Predicted Functional'],
            yticklabels=['True Neutral', 'True Functional'])
axes[1, 0].set_title('🎯 Confusion Matrix')
axes[1, 0].set_ylabel('True Label')
axes[1, 0].set_xlabel('Predicted Label')

# 4. Score Distribution by True Label
true_func_scores = vep_results[vep_results['true_functional']]['cosine_score']
true_neut_scores = vep_results[~vep_results['true_functional']]['cosine_score']

axes[1, 1].hist([true_func_scores, true_neut_scores], bins=20, alpha=0.7,
                label=['True Functional', 'True Neutral'], color=['orange', 'cyan'])
axes[1, 1].axvline(0.15, color='red', linestyle='--', label='Classification Threshold')
axes[1, 1].set_xlabel('Cosine Score')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('📊 Score Distribution by True Label')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

# Print classification report
print("\n📋 Classification Report:")
print(classification_report(vep_results['true_functional'], vep_results['predicted_functional'],
                          target_names=['Neutral', 'Functional']))

# Performance metrics summary
# NOTE(review): the four-way unpacking assumes `cm` is 2x2, i.e. both classes
# occur in the labels or predictions; a single-class dataset would fail here.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)

print(f"\n🎯 Performance Metrics Summary:")
print(f"  📊 ROC AUC: {roc_auc:.3f}")
print(f"  📈 Average Precision: {avg_precision:.3f}")
print(f"  🎯 Sensitivity (Recall): {sensitivity:.3f}")
print(f"  🛡️ Specificity: {specificity:.3f}")
print(f"  ✅ Positive Predictive Value: {ppv:.3f}")
print(f"  ❌ Negative Predictive Value: {npv:.3f}")

## 5. Publication-Ready Visualizations

Create high-quality, publication-ready plots with proper styling and annotations.

In [None]:
# Create publication-ready visualizations
# Six-panel figure (A-F) on a 3x4 grid; each panel spans two grid columns.
# This cell reads names defined by earlier cells (functional_scores,
# neutral_scores, fpr, tpr, roc_auc, precision, recall, avg_precision,
# sensitivity, specificity, ppv, npv) and raises NameError if they were not run.
print("📄 Creating publication-ready visualizations...")

# Set publication style
# NOTE: both calls change global Matplotlib state, so they also affect any
# figure drawn after this cell.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.titlesize': 16
})

# Publication Figure 1: Comprehensive Effect Score Analysis
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

# Main distribution plot
# Panel A: histogram of all cosine scores with mean and 0.15 threshold markers.
ax_main = fig.add_subplot(gs[0, :2])
n, bins, patches = ax_main.hist(vep_results['cosine_score'], bins=40, alpha=0.7, 
                               color='steelblue', edgecolor='black', linewidth=0.5)
ax_main.axvline(vep_results['cosine_score'].mean(), color='red', linestyle='--', 
               linewidth=2, label=f'Mean: {vep_results["cosine_score"].mean():.3f}')
ax_main.axvline(0.15, color='orange', linestyle='--', linewidth=2, 
               label='Functional Threshold: 0.15')
ax_main.set_xlabel('Variant Effect Score (Cosine Distance)')
ax_main.set_ylabel('Number of Variants')
ax_main.set_title('A. Distribution of Variant Effect Scores', fontweight='bold')
ax_main.legend()
ax_main.grid(True, alpha=0.3)

# Functional vs Neutral comparison
# Panel B: violin plots of the two score groups; tick labels carry group sizes.
ax_comp = fig.add_subplot(gs[0, 2:])
parts = ax_comp.violinplot([functional_scores, neutral_scores], positions=[1, 2], 
                          showmeans=True, showmedians=True)
ax_comp.set_xticks([1, 2])
ax_comp.set_xticklabels(['Functional\n(n={})'.format(len(functional_scores)), 
                        'Neutral\n(n={})'.format(len(neutral_scores))])
ax_comp.set_ylabel('Effect Score')
ax_comp.set_title('B. Functional vs Neutral Variants', fontweight='bold')
ax_comp.grid(True, alpha=0.3)

# ROC Curve
# Panel C: reuses fpr/tpr/roc_auc from the performance-analysis cell.
ax_roc = fig.add_subplot(gs[1, :2])
ax_roc.plot(fpr, tpr, color='blue', lw=3, label=f'ROC Curve (AUC = {roc_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', alpha=0.7)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('C. Receiver Operating Characteristic', fontweight='bold')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# Precision-Recall Curve
# Panel D: reuses precision/recall/avg_precision from the performance-analysis cell.
ax_pr = fig.add_subplot(gs[1, 2:])
ax_pr.plot(recall, precision, color='red', lw=3, label=f'PR Curve (AP = {avg_precision:.3f})')
ax_pr.set_xlabel('Recall (Sensitivity)')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('D. Precision-Recall Curve', fontweight='bold')
ax_pr.legend()
ax_pr.grid(True, alpha=0.3)

# Variant type analysis
# Panel E: mean cosine score per variant type with the standard deviation as error bars.
ax_type = fig.add_subplot(gs[2, :2])
type_summary = vep_results.groupby('variant_type').agg({
    'cosine_score': ['mean', 'std'],
    'predicted_functional': 'sum'
}).round(3)
# Flatten the two-level column index produced by the multi-function agg
type_summary.columns = ['Mean_Score', 'Std_Score', 'Functional_Count']
type_summary = type_summary.reset_index()

x_pos = np.arange(len(type_summary))
ax_type.bar(x_pos, type_summary['Mean_Score'], yerr=type_summary['Std_Score'],
           capsize=5, color='lightcoral', alpha=0.8, edgecolor='black')
ax_type.set_xticks(x_pos)
ax_type.set_xticklabels(type_summary['variant_type'], rotation=45, ha='right')
ax_type.set_ylabel('Mean Effect Score')
ax_type.set_title('E. Effect Scores by Variant Type', fontweight='bold')
ax_type.grid(True, alpha=0.3)

# Performance metrics summary
# Panel F: bar chart of the scalar metrics computed in the performance-analysis cell.
ax_metrics = fig.add_subplot(gs[2, 2:])
metrics_data = {
    'Metric': ['Sensitivity', 'Specificity', 'PPV', 'NPV', 'ROC AUC'],
    'Value': [sensitivity, specificity, ppv, npv, roc_auc]
}
metrics_df = pd.DataFrame(metrics_data)

bars = ax_metrics.bar(metrics_df['Metric'], metrics_df['Value'], 
                     color=['skyblue', 'lightgreen', 'gold', 'plum', 'salmon'],
                     edgecolor='black', alpha=0.8)
ax_metrics.set_ylabel('Performance Score')
ax_metrics.set_title('F. Performance Metrics Summary', fontweight='bold')
ax_metrics.set_ylim(0, 1)
ax_metrics.grid(True, alpha=0.3)

# Add value labels on bars
# Each label sits 0.01 above its bar, horizontally centred on it.
for bar, value in zip(bars, metrics_df['Value']):
    height = bar.get_height()
    ax_metrics.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

plt.suptitle('Variant Effect Prediction Analysis Results', fontsize=18, fontweight='bold', y=0.98)
plt.show()

print("✅ Publication-ready visualization created!")

## 6. Export Functions and Formats

Implement comprehensive export functionality for different output formats.

In [None]:
# Export functions for different formats
import os
from datetime import datetime

def _compute_performance_metrics(results_df: pd.DataFrame) -> Optional[Dict[str, float]]:
    """
    Compute classification metrics from the labels stored in ``results_df``.

    Args:
        results_df: DataFrame with 'predicted_functional' and 'cosine_score'
            columns and, optionally, a 'true_functional' column

    Returns:
        Dictionary with the confusion counts ('tp', 'fp', 'tn', 'fn') and
        'sensitivity', 'specificity', 'ppv', 'npv', 'roc_auc', or None when
        the frame has no 'true_functional' column. Ratios with a zero
        denominator, and the ROC AUC of single-class data, are NaN.
    """
    if 'true_functional' not in results_df.columns:
        return None

    y_true = results_df['true_functional'].astype(bool)
    y_pred = results_df['predicted_functional'].astype(bool)
    tp = int((y_true & y_pred).sum())
    fp = int((~y_true & y_pred).sum())
    tn = int((~y_true & ~y_pred).sum())
    fn = int((y_true & ~y_pred).sum())

    def ratio(numerator: int, denominator: int) -> float:
        return numerator / denominator if denominator else float('nan')

    # A ROC curve needs both classes among the true labels
    if y_true.nunique() == 2:
        curve_fpr, curve_tpr, _ = roc_curve(y_true, results_df['cosine_score'])
        roc_auc_value = float(auc(curve_fpr, curve_tpr))
    else:
        roc_auc_value = float('nan')

    return {
        'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
        'sensitivity': ratio(tp, tp + fn),
        'specificity': ratio(tn, tn + fp),
        'ppv': ratio(tp, tp + fp),
        'npv': ratio(tn, tn + fn),
        'roc_auc': roc_auc_value,
    }


def _json_default(value):
    """Convert NumPy scalars to built-in Python types for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def export_vep_results(results_df: pd.DataFrame, output_dir: str = "vep_exports") -> Dict[str, str]:
    """
    Export VEP results in multiple formats.

    Writes CSV, Excel (multi-sheet), JSON, BED (functional variants only)
    and a tab-separated VCF-like table into ``output_dir``. Performance
    metrics are computed from ``results_df`` itself; they are omitted from
    the Excel workbook and set to null in the JSON file when the frame has
    no 'true_functional' column.

    Args:
        results_df: DataFrame with VEP analysis results
        output_dir: Directory to save exported files

    Returns:
        Dictionary of exported file paths, keyed by 'csv', 'excel', 'json',
        'bed' and 'vcf_like'
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    metrics = _compute_performance_metrics(results_df)
    functional_variants = results_df[results_df['predicted_functional']]

    exported_files = {}

    # 1. CSV Export (basic)
    csv_file = os.path.join(output_dir, f"vep_results_{timestamp}.csv")
    results_df.to_csv(csv_file, index=False)
    exported_files['csv'] = csv_file

    # 2. Excel Export (multiple sheets)
    excel_file = os.path.join(output_dir, f"vep_analysis_{timestamp}.xlsx")
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        # All results
        results_df.to_excel(writer, sheet_name='All_Variants', index=False)

        # Functional variants only
        functional_variants.to_excel(writer, sheet_name='Functional_Variants', index=False)

        # Summary statistics
        summary_stats = results_df.groupby('variant_type').agg({
            'cosine_score': ['count', 'mean', 'std', 'min', 'max'],
            'predicted_functional': 'sum'
        }).round(4)
        summary_stats.to_excel(writer, sheet_name='Summary_Statistics')

        # Performance metrics (only when true labels are available)
        if metrics is not None:
            perf_metrics = pd.DataFrame({
                'Metric': ['True Positives', 'False Positives', 'True Negatives', 'False Negatives',
                          'Sensitivity', 'Specificity', 'PPV', 'NPV', 'ROC AUC'],
                'Value': [metrics['tp'], metrics['fp'], metrics['tn'], metrics['fn'],
                          metrics['sensitivity'], metrics['specificity'],
                          metrics['ppv'], metrics['npv'], metrics['roc_auc']]
            })
            perf_metrics.to_excel(writer, sheet_name='Performance_Metrics', index=False)

    exported_files['excel'] = excel_file

    # 3. JSON Export (for web applications)
    json_file = os.path.join(output_dir, f"vep_results_{timestamp}.json")
    json_metrics = None
    if metrics is not None:
        json_metrics = {
            'roc_auc': metrics['roc_auc'],
            'sensitivity': metrics['sensitivity'],
            'specificity': metrics['specificity']
        }
    json_data = {
        'analysis_info': {
            'timestamp': timestamp,
            'total_variants': len(results_df),
            # .sum() yields a NumPy integer, which json cannot encode directly
            'functional_variants': int(results_df['predicted_functional'].sum()),
            'mean_effect_score': float(results_df['cosine_score'].mean()),
            'performance_metrics': json_metrics
        },
        'variants': results_df.to_dict('records')
    }

    with open(json_file, 'w', encoding='utf-8') as f:
        # The default hook covers any NumPy scalars left in the records
        json.dump(json_data, f, indent=2, default=_json_default)
    exported_files['json'] = json_file

    # 4. BED format for genomic tools
    # Each column is selected once: selecting 'position' twice produces
    # duplicate column labels and breaks the end-coordinate assignment.
    # NOTE(review): this treats `position` as a 0-based start. If positions
    # are 1-based (VCF-style), the interval should be [position - 1, position).
    bed_file = os.path.join(output_dir, f"functional_variants_{timestamp}.bed")
    bed_data = pd.DataFrame({
        'chromosome': functional_variants['chromosome'],
        'position': functional_variants['position'],
        'end_position': functional_variants['position'] + 1,
        'variant_id': functional_variants['variant_id'],
        'cosine_score': functional_variants['cosine_score'],
    })
    bed_data.to_csv(bed_file, sep='\t', index=False, header=False)
    exported_files['bed'] = bed_file

    # 5. VCF-like format for compatibility
    vcf_file = os.path.join(output_dir, f"vep_annotations_{timestamp}.txt")
    vcf_data = results_df[['chromosome', 'position', 'variant_id', 'ref_allele', 'alt_allele',
                          'cosine_score', 'predicted_functional', 'confidence']].copy()
    vcf_data.to_csv(vcf_file, sep='\t', index=False)
    exported_files['vcf_like'] = vcf_file

    return exported_files

def create_analysis_report(results_df: pd.DataFrame, output_dir: str = "vep_exports") -> str:
    """
    Generate a comprehensive analysis report.

    The performance section is computed from ``results_df`` itself and is
    replaced by a short notice when the frame has no 'true_functional'
    column.

    Args:
        results_df: DataFrame with VEP analysis results
        output_dir: Directory to save report

    Returns:
        Path to generated report
    """
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_file = os.path.join(output_dir, f"vep_analysis_report_{timestamp}.txt")

    metrics = _compute_performance_metrics(results_df)

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("=" * 60 + "\n")
        f.write("VARIANT EFFECT PREDICTION ANALYSIS REPORT\n")
        f.write("=" * 60 + "\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Dataset Summary
        f.write("DATASET SUMMARY\n")
        f.write("-" * 20 + "\n")
        f.write(f"Total variants analyzed: {len(results_df)}\n")
        f.write(f"Predicted functional variants: {results_df['predicted_functional'].sum()}\n")
        f.write(f"Predicted neutral variants: {(~results_df['predicted_functional']).sum()}\n")
        f.write(f"Functional rate: {results_df['predicted_functional'].mean():.1%}\n\n")

        # Effect Score Statistics
        f.write("EFFECT SCORE STATISTICS\n")
        f.write("-" * 25 + "\n")
        f.write(f"Mean cosine score: {results_df['cosine_score'].mean():.4f}\n")
        f.write(f"Median cosine score: {results_df['cosine_score'].median():.4f}\n")
        f.write(f"Standard deviation: {results_df['cosine_score'].std():.4f}\n")
        f.write(f"Min score: {results_df['cosine_score'].min():.4f}\n")
        f.write(f"Max score: {results_df['cosine_score'].max():.4f}\n\n")

        # Variant Type Analysis
        f.write("VARIANT TYPE ANALYSIS\n")
        f.write("-" * 22 + "\n")
        type_stats = results_df.groupby('variant_type').agg({
            'cosine_score': ['count', 'mean', 'std'],
            'predicted_functional': 'sum'
        }).round(4)
        f.write(type_stats.to_string() + "\n\n")

        # Performance Metrics
        f.write("PERFORMANCE METRICS\n")
        f.write("-" * 20 + "\n")
        if metrics is None:
            f.write("Not available (no 'true_functional' column)\n\n")
        else:
            f.write(f"ROC AUC: {metrics['roc_auc']:.3f}\n")
            f.write(f"Sensitivity (Recall): {metrics['sensitivity']:.3f}\n")
            f.write(f"Specificity: {metrics['specificity']:.3f}\n")
            f.write(f"Positive Predictive Value: {metrics['ppv']:.3f}\n")
            f.write(f"Negative Predictive Value: {metrics['npv']:.3f}\n\n")

        # Top Functional Variants
        f.write("TOP 10 FUNCTIONAL VARIANTS\n")
        f.write("-" * 27 + "\n")
        top_functional = results_df[results_df['predicted_functional']].nlargest(10, 'cosine_score')
        for i, (_, variant) in enumerate(top_functional.iterrows(), 1):
            f.write(f"{i:2d}. {variant['variant_id']} ({variant['chromosome']}:{variant['position']}) - ")
            f.write(f"Score: {variant['cosine_score']:.4f}\n")

        f.write("\n" + "=" * 60 + "\n")
        f.write("End of Report\n")

    return report_file

# Write every export format for the sample results and list the files
print("📤 Exporting VEP results in multiple formats...")
exported_files = export_vep_results(vep_results)
print("✅ Export completed!")

print("📁 Exported files:")
for format_type, file_path in exported_files.items():
    label = format_type.upper()
    print(f"  📄 {label}: {file_path}")

# Add the plain-text analysis report to the same listing
report_file = create_analysis_report(vep_results)
print(f"  📋 Report: {report_file}")

print("\n💾 All files exported to: vep_exports/")

## 7. Custom Visualization Functions

Create reusable functions for generating specific types of VEP visualizations.

In [None]:
def _dashboard_performance(results_df: pd.DataFrame) -> Optional[dict]:
    """
    Compute the performance numbers shown on the dashboard from ``results_df``.

    Returns:
        None when the frame has no 'true_functional' column; otherwise a
        dictionary with 'sensitivity', 'specificity', 'ppv', 'npv' (NaN when
        the denominator is zero) and 'roc', which is an (fpr, tpr, auc)
        tuple or None when only one class is present in the true labels.
    """
    if 'true_functional' not in results_df.columns:
        return None

    y_true = results_df['true_functional'].astype(bool)
    y_pred = results_df['predicted_functional'].astype(bool)
    tp = int((y_true & y_pred).sum())
    fp = int((~y_true & y_pred).sum())
    tn = int((~y_true & ~y_pred).sum())
    fn = int((y_true & ~y_pred).sum())

    def ratio(numerator: int, denominator: int) -> float:
        return numerator / denominator if denominator else float('nan')

    roc = None
    if y_true.nunique() == 2:  # a ROC curve needs both classes
        curve_fpr, curve_tpr, _ = roc_curve(y_true, results_df['cosine_score'])
        roc = (curve_fpr, curve_tpr, auc(curve_fpr, curve_tpr))

    return {
        'sensitivity': ratio(tp, tp + fn),
        'specificity': ratio(tn, tn + fp),
        'ppv': ratio(tp, tp + fp),
        'npv': ratio(tn, tn + fn),
        'roc': roc,
    }


def create_vep_dashboard(results_df: pd.DataFrame, save_path: Optional[str] = None) -> None:
    """
    Create a comprehensive VEP analysis dashboard.

    Draws nine panels on a 3x4 grid: score histogram, prediction pie chart,
    variant-type counts, ROC curve, scores by genomic region, per-chromosome
    counts, a performance radar, a score correlation heatmap and a table of
    the ten highest-scoring variants. The ROC and radar panels are computed
    from ``results_df`` and show a notice instead when the frame has no
    usable 'true_functional' labels.

    Args:
        results_df: DataFrame with VEP results
        save_path: Optional path to save the dashboard image
    """
    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(3, 4, hspace=0.4, wspace=0.3)

    # Color palette
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']

    # Metrics come from the frame being plotted, not from notebook globals
    performance = _dashboard_performance(results_df)

    # 1. Score distribution
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.hist(results_df['cosine_score'], bins=30, alpha=0.7, color=colors[0], edgecolor='black')
    ax1.axvline(0.15, color='red', linestyle='--', linewidth=2, label='Threshold')
    ax1.set_title('Effect Score Distribution', fontweight='bold')
    ax1.set_xlabel('Cosine Score')
    ax1.set_ylabel('Frequency')
    ax1.legend()

    # 2. Functional prediction pie chart
    # value_counts() orders by frequency, so labels and colors are derived
    # from the index instead of assuming that 'Neutral' comes first
    ax2 = fig.add_subplot(gs[0, 1])
    func_counts = results_df['predicted_functional'].value_counts()
    pie_labels = ['Functional' if flag else 'Neutral' for flag in func_counts.index]
    pie_colors = [colors[2] if flag else colors[1] for flag in func_counts.index]
    ax2.pie(func_counts.values, labels=pie_labels, colors=pie_colors,
            autopct='%1.1f%%', startangle=90)
    ax2.set_title('Functional Predictions', fontweight='bold')

    # 3. Variant type distribution
    ax3 = fig.add_subplot(gs[0, 2])
    type_counts = results_df['variant_type'].value_counts()
    ax3.bar(range(len(type_counts)), type_counts.values, color=colors[:len(type_counts)])
    ax3.set_xticks(range(len(type_counts)))
    ax3.set_xticklabels(type_counts.index, rotation=45, ha='right')
    ax3.set_title('Variant Types', fontweight='bold')
    ax3.set_ylabel('Count')

    # 4. ROC curve
    ax4 = fig.add_subplot(gs[0, 3])
    if performance is not None and performance['roc'] is not None:
        roc_fpr, roc_tpr, roc_area = performance['roc']
        ax4.plot(roc_fpr, roc_tpr, color=colors[3], lw=3, label=f'AUC = {roc_area:.3f}')
        ax4.plot([0, 1], [0, 1], 'k--', alpha=0.5)
        ax4.legend()
    else:
        ax4.text(0.5, 0.5, 'ROC unavailable\n(no usable true labels)',
                 ha='center', va='center', transform=ax4.transAxes)
    ax4.set_title('ROC Curve', fontweight='bold')
    ax4.set_xlabel('False Positive Rate')
    ax4.set_ylabel('True Positive Rate')

    # 5. Score by genomic region
    ax5 = fig.add_subplot(gs[1, :2])
    regions = results_df['genomic_region'].unique()
    region_data = [results_df[results_df['genomic_region'] == region]['cosine_score'].values
                   for region in regions]
    bp = ax5.boxplot(region_data, patch_artist=True)
    # Tick labels are set separately because boxplot's `labels` keyword is
    # deprecated in recent Matplotlib releases
    ax5.set_xticklabels(regions)
    # Cycle through the palette so every box is filled even when there are
    # more regions than palette entries
    for box_index, patch in enumerate(bp['boxes']):
        patch.set_facecolor(colors[box_index % len(colors)])
        patch.set_alpha(0.7)
    ax5.set_title('Effect Scores by Genomic Region', fontweight='bold')
    ax5.set_ylabel('Cosine Score')
    ax5.tick_params(axis='x', rotation=45)

    # 6. Chromosome distribution
    # Totals and functional counts come from the same grouping, so both
    # series are guaranteed to cover the same chromosomes
    ax6 = fig.add_subplot(gs[1, 2:])
    by_chromosome = results_df.groupby('chromosome')['predicted_functional']
    chrom_func = by_chromosome.sum().head(15)
    chrom_total = by_chromosome.size().reindex(chrom_func.index)

    x_pos = np.arange(len(chrom_func))
    width = 0.35
    ax6.bar(x_pos - width/2, chrom_total.values, width,
           label='Total', color=colors[0], alpha=0.7)
    ax6.bar(x_pos + width/2, chrom_func.values, width,
           label='Functional', color=colors[2], alpha=0.7)
    ax6.set_xticks(x_pos)
    ax6.set_xticklabels(chrom_func.index, rotation=45)
    ax6.set_title('Variants by Chromosome', fontweight='bold')
    ax6.set_ylabel('Count')
    ax6.legend()

    # 7. Performance metrics radar
    ax7 = fig.add_subplot(gs[2, 0], projection='polar')
    if performance is not None:
        metrics = ['Sensitivity', 'Specificity', 'PPV', 'NPV']
        values = [performance['sensitivity'], performance['specificity'],
                  performance['ppv'], performance['npv']]
        angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
        values += values[:1]  # Complete the circle
        angles += angles[:1]

        ax7.plot(angles, values, 'o-', linewidth=2, color=colors[3])
        ax7.fill(angles, values, alpha=0.25, color=colors[3])
        ax7.set_xticks(angles[:-1])
        ax7.set_xticklabels(metrics)
    else:
        ax7.text(0.5, 0.5, 'No true labels', ha='center', va='center',
                 transform=ax7.transAxes)
    ax7.set_ylim(0, 1)
    ax7.set_title('Performance Metrics', fontweight='bold', pad=20)

    # 8. Score correlation heatmap
    ax8 = fig.add_subplot(gs[2, 1:3])
    score_cols = ['cosine_score', 'euclidean_score', 'manhattan_score', 'confidence']
    score_titles = [col.replace('_', ' ').title() for col in score_cols]
    corr_matrix = results_df[score_cols].corr()
    ax8.imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
    ax8.set_xticks(range(len(score_cols)))
    ax8.set_yticks(range(len(score_cols)))
    ax8.set_xticklabels(score_titles, rotation=45)
    ax8.set_yticklabels(score_titles)
    ax8.set_title('Score Correlations', fontweight='bold')

    # Add correlation values; white text keeps strong cells readable
    for i in range(len(score_cols)):
        for j in range(len(score_cols)):
            cell_value = corr_matrix.iloc[i, j]
            ax8.text(j, i, f'{cell_value:.2f}',
                    ha='center', va='center',
                    color='white' if abs(cell_value) > 0.6 else 'black')

    # 9. Top variants table
    ax9 = fig.add_subplot(gs[2, 3])
    ax9.axis('tight')
    ax9.axis('off')
    top_variants = results_df.nlargest(10, 'cosine_score')[['variant_id', 'cosine_score', 'variant_type']]
    # IDs and types are truncated so the cells fit the narrow panel
    table_data = [
        [str(row['variant_id'])[:12], f"{row['cosine_score']:.3f}", str(row['variant_type'])[:8]]
        for _, row in top_variants.iterrows()
    ]

    table = ax9.table(cellText=table_data,
                     colLabels=['Variant ID', 'Score', 'Type'],
                     cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.5)
    ax9.set_title('Top 10 Variants', fontweight='bold')

    fig.suptitle('VEP Analysis Dashboard', fontsize=20, fontweight='bold', y=0.98)

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"📊 Dashboard saved to: {save_path}")

    plt.show()

def _chromosome_sort_key(chrom) -> tuple:
    """Return a sort key that orders chromosome labels 1..N, then X, Y, M/MT, then others."""
    label = str(chrom)
    suffix = label[3:] if label.lower().startswith('chr') else label
    if suffix.isdigit():
        # Numeric chromosomes compare by value, so 'chr10' sorts after 'chr9'.
        return (0, int(suffix), '')
    special_rank = {'X': 1, 'Y': 2, 'M': 3, 'MT': 3}
    return (1, special_rank.get(suffix.upper(), 4), suffix)


def create_variant_manhattan_plot(
    results_df: pd.DataFrame,
    save_path: Optional[str] = None,
    *,
    threshold: float = 0.15,
    chrom_spacing: int = 1000,
) -> None:
    """
    Create a Manhattan plot showing variant effects across chromosomes.

    Variants are grouped by chromosome, ordered by position within each
    chromosome, and laid out left to right on an index-based x-axis (one slot
    per variant, not the genomic coordinate). The y-axis is ``cosine_score``
    and the point colour is ``predicted_functional``.

    Args:
        results_df: DataFrame with VEP results. Must contain the columns
            'chromosome', 'position', 'cosine_score' and 'predicted_functional'.
            NOTE(review): 'predicted_functional' is presumably a 0/1 or boolean
            flag; it is converted to float for colour mapping.
        save_path: Optional path to save the plot. Missing parent directories
            are created.
        threshold: y-value of the dashed "Functional Threshold" line.
        chrom_spacing: Number of empty x-axis slots inserted between
            consecutive chromosomes.
    """
    fig, ax = plt.subplots(figsize=(16, 8))

    # Karyotype order: a plain (len, text) key would place chrX/chrY before chr10.
    chromosomes = sorted(results_df['chromosome'].unique(), key=_chromosome_sort_key)

    x_offset = 0
    x_ticks = []
    x_labels = []
    x_parts, score_parts, color_parts = [], [], []

    for chrom in chromosomes:
        chrom_data = results_df[results_df['chromosome'] == chrom].sort_values('position')
        n_variants = len(chrom_data)
        if n_variants == 0:
            continue

        # Rank-based x positions: variants are evenly spaced within a chromosome.
        x_parts.append(x_offset + np.arange(n_variants))
        score_parts.append(chrom_data['cosine_score'].to_numpy())
        color_parts.append(chrom_data['predicted_functional'].to_numpy(dtype=float))

        # Centre the tick on the points, which occupy x_offset .. x_offset + n - 1.
        x_ticks.append(x_offset + (n_variants - 1) / 2)
        x_labels.append(str(chrom).replace('chr', ''))

        x_offset += n_variants + chrom_spacing  # Add spacing between chromosomes

    # Draw every point in a single scatter call so that all chromosomes share
    # one colour normalisation and the colorbar describes the whole plot.
    scatter = None
    if x_parts:
        scatter = ax.scatter(
            np.concatenate(x_parts),
            np.concatenate(score_parts),
            c=np.concatenate(color_parts),
            cmap='RdYlBu_r', alpha=0.7, s=30,
        )

    # Customize plot
    ax.axhline(y=threshold, color='red', linestyle='--', alpha=0.8, label='Functional Threshold')
    ax.set_xlabel('Chromosome', fontsize=12)
    ax.set_ylabel('Variant Effect Score', fontsize=12)
    ax.set_title('Manhattan Plot: Variant Effects Across Genome', fontsize=14, fontweight='bold')
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Add colorbar (skipped when there are no variants to colour)
    if scatter is not None:
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label('Functional Prediction', rotation=270, labelpad=15)

    if save_path:
        # Make sure the target directory exists before writing the image.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"🗺️ Manhattan plot saved to: {save_path}")

    plt.show()

# Render each custom figure in turn: announce it, then build it from the
# shared VEP results and write it to its target file under vep_exports/.
for _banner, _render, _target in (
    ("🎨 Creating comprehensive VEP dashboard...", create_vep_dashboard,
     "vep_exports/vep_dashboard.png"),
    ("🗺️ Creating Manhattan plot...", create_variant_manhattan_plot,
     "vep_exports/manhattan_plot.png"),
):
    print(_banner)
    _render(vep_results, _target)

    # Blank line between the two figures, matching the console layout.
    if _render is create_vep_dashboard:
        print()

print()
print("✅ Custom visualizations completed!")

## 8. Summary and Best Practices

### 🎯 Key Visualization Principles for VEP Analysis

1. **📊 Comprehensive Overview**
   - Always start with distribution plots to understand your data
   - Include both statistical and biological perspectives
   - Show both raw scores and interpreted predictions

2. **🔍 Interactive Exploration**
   - Use interactive plots for detailed data exploration
   - Enable filtering and zooming for large datasets
   - Provide hover information for context

3. **📈 Performance Visualization**
   - ROC and Precision-Recall (PR) curves are essential for binary classification
   - Confusion matrices provide a clear overview of performance
   - Include confidence intervals when possible

4. **🧬 Biological Context**
   - Manhattan plots show genomic distribution
   - Variant type analysis reveals biological patterns
   - Genomic region analysis provides functional insights

5. **📤 Export Flexibility**
   - Multiple formats serve different downstream needs
   - Include metadata and analysis parameters
   - Generate human-readable reports alongside raw data

### 💡 Best Practices Summary

✅ **Do:**
- Create multiple complementary visualizations
- Include statistical significance testing
- Provide interactive versions for exploration
- Export in multiple formats for different users
- Include biological interpretation in reports

❌ **Avoid:**
- Showing only one type of visualization
- Ignoring statistical assumptions
- Creating static-only visualizations for large datasets
- Limiting export to a single format
- Providing results without biological context

### 🚀 Next Steps

1. **🔬 Validation**: Compare predictions with experimental data
2. **📊 Scale**: Apply to larger genomic datasets
3. **🧬 Integration**: Combine with other genomic features
4. **🏥 Clinical**: Implement in clinical variant interpretation pipelines
5. **📱 Deployment**: Create web-based interactive dashboards

In [None]:
# Final summary: print a recap of everything the tutorial produced.
# Only literals with a real placeholder keep the f-string prefix.
print("🎉 VEP Visualization and Export Tutorial Complete!")
print("=" * 60)

# Figures created in the earlier sections
print("\n📊 Generated Visualizations:")
print("  📈 Statistical overview plots")
print("  🔍 Interactive Plotly visualizations")
print("  📊 ROC and Precision-Recall curves")
print("  📄 Publication-ready figures")
print("  🎨 Comprehensive dashboard")
print("  🗺️ Manhattan plot")

# One line per export format recorded in `exported_files`
# (populated by the export section earlier in the notebook).
print("\n📤 Export Formats Created:")
for format_type in exported_files.keys():
    print(f"  📄 {format_type.upper()} format")

print("\n💾 Output Directory: vep_exports/")
print("  📁 Contains all plots, data exports, and reports")

print("\n🎓 Key Learning Outcomes:")
print("  ✅ Comprehensive VEP result visualization")
print("  ✅ Statistical analysis and performance evaluation")
print("  ✅ Multiple export formats for different needs")
print("  ✅ Publication-ready plot generation")
print("  ✅ Interactive data exploration techniques")

print("\n🚀 Ready for:")
print("  🔬 Experimental validation")
print("  📊 Large-scale genomic analysis")
print("  🏥 Clinical variant interpretation")
print("  📱 Dashboard deployment")

print("\n✨ Tutorial completed successfully! ✨")