# pySCA 7.0 - Example Analysis Template

This notebook demonstrates the complete workflow for Statistical Coupling Analysis (SCA) using pySCA 7.0.

**Workflow Steps:**
1. Process multiple sequence alignment (MSA)
2. Load and inspect the processed database
3. Run SCA core calculations and identify independent components (ICs)
4. Visualize results

**Author:** pySCA Team  
**Date:** 2026


## Setup and Imports


In [3]:
import os
import numpy as np
from pathlib import Path

# Import pySCA utilities
from pysca import notebook_utils as nb
from pysca import scaTools as sca

# Import Plotly for interactive visualizations
import plotly.graph_objects as go
import plotly.express as px

# All results are written under ./Outputs; make sure the folder is present
# (an already-existing folder is left untouched)
output_dir = Path("Outputs")
os.makedirs(output_dir, exist_ok=True)

print("Setup complete!")


Setup complete!


## Step 1: Process Multiple Sequence Alignment

Process the input alignment, filter sequences and positions, and create the ATS (Alignment-To-Structure) mapping.


In [2]:
# Define input files
alignment_file = "Inputs/alignment.fasta"  # Update with your alignment file
pdb_file = "Inputs/structure.pdb"  # Update with your PDB file (optional)
chain_id = "A"  # Chain ID in PDB
species = "Homo sapiens"  # Species name for reference sequence search (optional)

# Fail fast with an actionable message: the path above is a template
# placeholder, and running the cell unchanged otherwise surfaces only a bare
# "FileNotFoundError: Inputs/alignment.fasta" from inside the processing step.
if not Path(alignment_file).is_file():
    raise FileNotFoundError(
        f"Alignment file not found: {alignment_file}. "
        "Set `alignment_file` at the top of this cell to the path of your "
        "own alignment before running Step 1."
    )

# Process the MSA
# For large alignments (>50k sequences), preclustering is automatically enabled
# The returned database path is reused by the later steps of this notebook.
db_path = nb.process_msa(
    alignment=alignment_file,
    output_dir=output_dir,
    pdb=pdb_file,
    chain=chain_id,
    species=species,
    parameters=[0.25, 0.2, 0.15, 0.85],  # [pos_gap, seq_gap, min_seqid, max_seqid]
    precluster=None,  # None = auto-enable for >50k sequences
    cluster_id=0.85,  # MMseqs2 identity threshold
    matlab=True,  # Also write MATLAB workspace
    verbose=True
)

print(f"\nProcessed MSA database saved to: {db_path}")


FileNotFoundError: Inputs/alignment.fasta

## Step 2: Load and Inspect Processed Database


In [None]:
# Open the processed-MSA database written in Step 1 and show its overview
db = nb.load_database(db_path)
nb.print_summary(db)

# Report the headline alignment statistics stored under the "sequence" entry.
# Each row is (label, key in seq_data, format spec); an empty spec formats
# the value exactly like a plain f-string field.
seq_data = db["sequence"]
report_fields = (
    ("\nNumber of sequences: ", "M", ""),
    ("Number of effective sequences: ", "M_eff", ".2f"),
    ("Number of positions: ", "L", ""),
    ("Reference sequence index: ", "i_ref", ""),
)
for label, key, spec in report_fields:
    print(f"{label}{seq_data[key]:{spec}}")


## Step 3: Run SCA Core Calculations

Compute the SCA matrix and positional weights, identify independent components (`do_sector_id=True`), and optionally compute sequence correlations (`do_seqcorr=True`).


In [None]:
# Settings for the SCA core run (independent component identification included).
# Collected in one place so they are easy to tweak before launching the run.
sca_core_options = dict(
    norm="frob",  # Frobenius norm for matrix reduction
    Ntrials=10,  # Number of randomization trials
    lbda=0.01,  # Regularization parameter
    do_seqcorr=False,  # Set to True to compute sequence correlations (memory-intensive)
    do_sector_id=True,  # Perform independent component identification
    kpos=None,  # None = auto-select number of eigenmodes
    kica=None,  # None = use kpos for ICA
    sector_cutoff=0.95,  # T-distribution cutoff percentile
    kmax_cap=10,  # Maximum kpos when auto-selecting
    matlab=True,  # Also write MATLAB workspace
    verbose=True,
)

# Run on the Step 1 database; the returned path is what Step 4 reloads
output_db_path = nb.run_sca_core(database=db_path, **sca_core_options)

print(f"\nSCA core database saved to: {output_db_path}")


## Step 4: Visualize Results


In [None]:
# Reload database with SCA and sector data
# NOTE: this rebinds `db`. From here on it refers to the database at
# `output_db_path` (the SCA core output), no longer the one loaded from
# `db_path` in Step 2; the plotting cells below all read this `db`.
db = nb.load_database(output_db_path)

# Print updated summary
nb.print_summary(db)


### 4.1 Plot Eigenvalue Spectrum


In [None]:
# Interactive Plotly view of the eigenvalue spectrum (20 modes),
# with the randomized comparison switched on
fig = nb.plot_eigenvalues_plotly(
    db,
    n_modes=20,
    show_randomized=True,
)
fig.show()


### 4.2 Plot Independent Components


In [None]:
# Interactive Plotly heatmap of all independent components
# (ic_index=None, i.e. no single IC is singled out)
fig = nb.plot_ic_heatmap_plotly(
    db,
    ic_index=None,
    height=600,
    width=1000,
)
fig.show()


### 4.3 Plot Individual Independent Component


In [None]:
# Choose which independent component to inspect (0-based, so 0 is the first IC)
ic_to_plot = 0

# Interactive Plotly plot of that IC with its significant positions highlighted
fig = nb.plot_ic_positions_plotly(
    db,
    ic_index=ic_to_plot,
    top_n=20,
)
fig.show()


### 4.4 Plot IC Correlation Matrix


In [None]:
# Plot Spearman correlation matrix between ICs (Plotly - interactive)
# Reads the `db` reloaded from `output_db_path` at the start of Step 4.
fig = nb.plot_ic_correlation_plotly(db)
fig.show()


### 4.5 Interactive Sequence Similarity Matrix (Optional)


In [None]:
# Optional: uncomment the block below to visualize the sequence similarity
# matrix (requires --do-seqcorr).
# NOTE(review): in this notebook that presumably corresponds to passing
# do_seqcorr=True to nb.run_sca_core in Step 3 (it is False there) - confirm.
# if "simMat" in db["sca"]:
#     fig = nb.plot_simmat_plotly(db, height=800, width=800)
#     fig.show()
# else:
#     print("simMat not available. Run sca-core with --do-seqcorr to compute sequence correlations.")


### 4.6 Access IC Data Programmatically


In [None]:
# Pull the sector / independent-component results out of the database
sector_data = db["sector"]

# Headline numbers; the kpos entries fall back to 'N/A' and the IC list to an
# empty list when the corresponding key is absent
kpos_used = sector_data.get('kpos', 'N/A')
print(f"Number of eigenmodes (kpos): {kpos_used}")
kpos_estimated = sector_data.get('kpos_auto', 'N/A')
print(f"Auto-estimated kpos: {kpos_estimated}")
ic_count = len(sector_data.get('ic_list', []))
print(f"Number of independent components: {ic_count}")

# Per-IC listing of significant positions (ATS numbering), when available
if "sector_ats" in sector_data:
    print("\nSignificant positions (ATS numbering) for each IC:")
    # ICs are reported 1-based in the printed labels
    for ic_number, ic_ats in enumerate(sector_data["sector_ats"], start=1):
        n_positions = len(ic_ats)
        print(f"  IC {ic_number}: {n_positions} positions")
        if n_positions > 0:
            # Echo at most the first 20 positions to keep the output short
            print(f"    {ic_ats[:20]}")
