# Stage 4: Final Correlation Analysis

This notebook combines morphogen-regulon networks and calculates final correlations for publication.

- **Input**: Morphogen-regulon networks from Stage 3
- **Output**: Final correlation matrices, TF-target-morphogen relationships
- **Method**: Correlation analysis, statistical testing, visualization

In [None]:
# Third-party and standard-library imports used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, including pandas/numpy warnings that may point at real data issues.
warnings.filterwarnings('ignore')

# Import correlation analysis utilities
import sys
# The helper module is resolved relative to the notebook's working directory,
# so the notebook has to be run from its own folder for this import to work.
sys.path.append('../src')
from correlation_analysis import (
    calculate_correlations,
    create_correlation_matrix,
    plot_correlation_heatmap,
    plot_correlation_distribution,
    plot_network_graph,
    calculate_summary_statistics,
    save_correlation_results
)

# Set up plotting
# Reset matplotlib to its default style, then use seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Banner so the notebook output shows which pipeline stage is running.
print("📈 Stage 4: Final Correlation Analysis")
print("Combining morphogen-regulon networks for publication")

## 4.1 Load Network Data

Load the morphogen-regulon networks from Stage 3.

In [None]:
# Read the combined morphogen-regulon network table written by Stage 3.
network_file = "../03_morphogen_networks/networks/morphogen_regulon_networks_combined.csv"

if not os.path.exists(network_file):
    # Missing input: report it and leave `networks` as None so that the
    # cells below, which test for None, skip their work.
    print(f"❌ Network file not found: {network_file}")
    print("Please run Stage 3 first")
    networks = None
else:
    networks = pd.read_csv(network_file)
    # Short overview of what was loaded: row count, column names, first rows.
    print(f"Loaded combined networks: {len(networks)} interactions")
    print(f"Columns: {networks.columns.tolist()}")
    print("\nSample data:")
    print(networks.head())

In [None]:
# Collect the per-cell-line network tables (one CSV per cell line) so they
# can be analysed side by side; cell lines without a file are skipped.
individual_networks = {}

for cell_line in ('H1', 'WTC', 'H9', 'WIBJ2'):
    network_file = f"../03_morphogen_networks/networks/morphogen_regulon_network_{cell_line}.csv"

    if not os.path.exists(network_file):
        print(f"❌ {cell_line} network not found")
        continue

    df = pd.read_csv(network_file)
    individual_networks[cell_line] = df
    print(f"Loaded {cell_line}: {len(df)} interactions")

print(f"\nLoaded networks for {len(individual_networks)} cell lines")

## 4.2 Calculate Correlations

Calculate correlations between morphogens and regulon activities using the correlation analysis utilities.

In [None]:
# Calculate correlations for combined data.
# `correlations_combined` is set to None when the combined network was not
# loaded, so the later cells that test for None skip themselves.
if networks is not None:
    print("Calculating correlations from combined networks...")
    correlations_combined = calculate_correlations(networks)
    print(f"Significant correlations: {len(correlations_combined)}")

    # Show top correlations. nlargest ranks by the signed value, so strongly
    # negative correlations do not appear in this list.
    top_corr = correlations_combined.nlargest(10, 'correlation')
    # The heading uses an explicit "\n" escape; a raw line break inside the
    # quotes is a SyntaxError and prevents the whole cell from running.
    print("\nTop 10 morphogen-regulon correlations:")
    for _, row in top_corr.iterrows():
        print(f"  {row['morphogen']} -> {row['regulon']}: r={row['correlation']:.3f} ({row['cell_line']})")
else:
    correlations_combined = None

In [None]:
# Calculate correlations for individual cell lines.
# Result: dict mapping cell line name -> correlation table tagged with a
# `cell_line` column.
correlations_individual = {}

for cell_line, network_df in individual_networks.items():
    # "\n" (not the escaped "\\n", which prints a literal backslash-n) so a
    # blank line separates the output of consecutive cell lines.
    print(f"\nCalculating correlations for {cell_line}...")
    # assign() returns a new frame, so tagging the rows with the cell line
    # never writes into the object handed back by calculate_correlations.
    corr_df = calculate_correlations(network_df).assign(cell_line=cell_line)
    correlations_individual[cell_line] = corr_df

    print(f"  Significant correlations: {len(corr_df)}")

    # Show top correlations (largest signed values) when there are any rows.
    if len(corr_df) > 0:
        top_corr = corr_df.nlargest(5, 'correlation')
        print("  Top correlations:")
        for _, row in top_corr.iterrows():
            print(f"    {row['morphogen']} -> {row['regulon']}: r={row['correlation']:.3f}")

## 4.3 Create Correlation Matrix

Create a comprehensive correlation matrix for visualization and analysis using the utility functions.

In [None]:
# Build the correlation matrix from the combined correlations, or leave it
# as None when no combined correlations are available.
if correlations_combined is None:
    corr_matrix = None
else:
    print("Creating correlation matrix...")
    corr_matrix = create_correlation_matrix(correlations_combined)

    # Rows are reported as morphogens and columns as regulons.
    n_rows, n_cols = corr_matrix.shape[0], corr_matrix.shape[1]
    print(f"Matrix shape: {corr_matrix.shape}")
    print(f"Morphogens: {n_rows}")
    print(f"Regulons: {n_cols}")

    # Fill level of the matrix: entries that compare unequal to zero.
    # NOTE(review): NaN also compares unequal to zero, so if the matrix uses
    # NaN for missing pairs they are counted here - confirm the fill value.
    non_zero = (corr_matrix != 0).sum().sum()
    total = n_rows * n_cols
    print(f"Non-zero correlations: {non_zero}/{total} ({100*non_zero/total:.1f}%)")

## 4.4 Visualizations

Create publication-quality visualizations of the results.

In [None]:
# Make sure the figure directory exists before any plot is written to it.
os.makedirs("plots", exist_ok=True)

# Plot 1: heatmap of the correlation matrix (skipped when no matrix was built).
if corr_matrix is not None:
    heatmap_options = {
        "top_n": 20,
        "figsize": (12, 8),
        "output_path": 'plots/correlation_heatmap.png',
    }
    fig = plot_correlation_heatmap(corr_matrix, **heatmap_options)
    plt.show()
    print("✅ Saved correlation heatmap")

In [None]:
# Plot 2: distribution of correlations by cell line
# (skipped when the combined correlations are unavailable).
if correlations_combined is not None:
    distribution_options = {
        "figsize": (10, 6),
        "output_path": 'plots/correlation_distribution.png',
    }
    fig = plot_correlation_distribution(correlations_combined, **distribution_options)
    plt.show()
    print("✅ Saved correlation distribution plot")

In [None]:
# Plot 3: network graph of the top interactions
# (skipped when the combined correlations are unavailable).
if correlations_combined is not None:
    graph_options = {
        "top_n": 50,
        "top_edges": 20,
        "figsize": (12, 10),
        "output_path": 'plots/network_graph.png',
    }
    fig = plot_network_graph(correlations_combined, **graph_options)
    plt.show()
    print("✅ Saved network graph")

## 4.5 Save Final Results

Save all results for publication and further analysis.

In [None]:
# Calculate and save summary statistics from the per-cell-line correlations.
summary_stats = calculate_summary_statistics(correlations_individual)

# Save all results using the utility function; output goes to the
# notebook's working directory.
# NOTE(review): correlations_combined and corr_matrix are None when the
# Stage 3 combined network was missing - confirm the helper accepts None.
save_correlation_results(
    correlations_combined=correlations_combined,
    correlations_individual=correlations_individual,
    corr_matrix=corr_matrix,
    summary_stats=summary_stats,
    output_dir="."
)

# Display summary.
# Headings use explicit "\n" escapes; raw line breaks inside the quotes are
# a SyntaxError and prevented this cell from running at all.
print("\n📊 Final Summary:")
print(summary_stats.to_string(index=False))

print("\n🎉 Stage 4 Complete!")
print("\n📁 Output files:")
# File names as reported to the reader; the files themselves are written by
# save_correlation_results and the plotting cells.
for output_name in (
    "final_correlations_combined.csv",
    "correlation_matrix.csv",
    "final_correlations_[cellline].csv",
    "summary_statistics.csv",
    "plots/correlation_heatmap.png",
    "plots/correlation_distribution.png",
    "plots/network_graph.png",
):
    print(f"  - {output_name}")

print("\n🚀 Pipeline complete! Ready for publication.")