In [None]:
# Notebook setup: imports plus global plotting configuration.
# Always import phasic first to set jax backend correctly
import phasic
import numpy as np
# Fix NumPy's global RNG seed so any random draws are reproducible on re-run
np.random.seed(42)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Request the 'retina' and 'png' formats for inline figure output
set_matplotlib_formats('retina', 'png')
import matplotlib
# Default figure size (width, height) for figures that do not pass figsize
matplotlib.rcParams['figure.figsize'] = (5, 3.7)
sns.set_context('paper', font_scale=0.9)
# Disabled: uncomment the two lines below to silence seaborn warnings
# import warnings
# warnings.filterwarnings(action='ignore', category=Warning, module='seaborn')
# Apply phasic's 'dark' theme last (NOTE(review): presumably so it wins over
# the matplotlib/seaborn settings above - confirm against phasic docs)
phasic.set_theme('dark')

# Standard Coalescent Model

This notebook demonstrates the standard coalescent model using phasic.

The coalescent models the genealogy of a sample backwards in time:
- With $k$ lineages, each pair coalesces (merges) at rate 1, giving a total coalescence rate of $\binom{k}{2}$
- The process continues until all lineages have coalesced into a single common ancestor

**Related notebooks:**
- `SFS.ipynb`: Site Frequency Spectrum computation
- `showcase.ipynb`: Comprehensive coalescent examples with rewards and moments

## Graph Construction

Build the coalescent graph by iteratively creating vertices for each possible state.

In [None]:
def build_coalescent(n):
    """
    Construct the state-space graph of the standard coalescent.

    Every vertex carries a count vector of length n in which entry i is the
    number of lineages that currently have i+1 descendants. Each pair of
    lineages merges at rate 1, so two lineages of the same class i merge at
    rate state[i] * (state[i] - 1) / 2 and lineages of two different classes
    i and j merge at rate state[i] * state[j].

    Parameters
    ----------
    n : int
        Sample size

    Returns
    -------
    phasic.Graph
        Graph containing every configuration reachable from the initial
        state of n single-descendant lineages
    """
    graph = phasic.Graph(n)

    # All n sampled lineages begin with exactly one descendant (class 0).
    start_counts = np.zeros(n, dtype=int)
    start_counts[0] = n
    entry_vertex = graph.find_or_create_vertex(start_counts)
    graph.starting_vertex().add_edge(entry_vertex, 1.0)

    # Walk vertices in creation order from index 1 onwards (index 0 is
    # presumably the graph's own starting vertex). The graph grows while we
    # walk it, so the length is re-read on every pass.
    cursor = 1
    while cursor < graph.vertices_length():
        parent = graph.vertex_at(cursor)
        counts = parent.state()
        cursor += 1

        for a in range(n):
            for b in range(a, n):
                same_class = a == b

                # Skip class pairs that cannot supply two lineages.
                if same_class and counts[a] < 2:
                    continue
                if not same_class and (counts[a] < 1 or counts[b] < 1):
                    continue

                # Merging (a+1) and (b+1) descendants lands in class a+b+1.
                merged = a + b + 1
                if merged >= n:
                    continue

                # Number of distinct lineage pairs that realise this merger.
                if same_class:
                    rate = counts[a] * (counts[a] - 1) / 2
                else:
                    rate = counts[a] * counts[b]

                child_counts = counts.copy()
                child_counts[a] -= 1
                child_counts[b] -= 1
                child_counts[merged] += 1

                child = graph.find_or_create_vertex(child_counts)
                parent.add_edge(child, rate)

    return graph

## Example: n=4

In [None]:
# Worked example: `n` and `graph` defined here are reused by the cells below.
n = 4
graph = build_coalescent(n)

num_vertices = graph.vertices_length()
print(f"Graph has {num_vertices} vertices")
print(f"Sample size: {n}")

In [None]:
# Visualize the coalescent state graph held in `graph` (the n=4 example)
graph.plot()

## Moments

Compute moments of the time to most recent common ancestor (TMRCA).

In [None]:
# First moment: the expected TMRCA
expectation = graph.phase_type_moment(1)
print(f"E[TMRCA] = {expectation:.4f}")

# Var[T] = E[T^2] - E[T]^2
second_moment = graph.phase_type_moment(2)
variance = second_moment - expectation**2
print(f"Var[TMRCA] = {variance:.4f}")

# Moments of order 1 through 4, formatted to four decimals for display
moments = list(map(graph.phase_type_moment, range(1, 5)))
formatted_moments = [f'{value:.4f}' for value in moments]
print(f"\nMoments 1-4: {formatted_moments}")

## TMRCA Distribution

In [None]:
# Evaluate the density on 100 evenly spaced times from 0 to three times the mean
times = np.linspace(0, 3 * expectation, 100)
pdf = graph.pdf(times)

# Density curve with the expectation marked as a dashed vertical line
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(times, pdf, linewidth=2, label='PDF')
ax.axvline(expectation, color='r', linestyle='--', label=f'E[T] = {expectation:.2f}')
ax.set_xlabel('Time')
ax.set_ylabel('Probability density')
ax.set_title(f'TMRCA Distribution (n={n})')
ax.legend()
fig.tight_layout()
sns.despine(ax=ax)
plt.show()

## Site Frequency Spectrum (SFS)

Compute expected branch lengths for each "ton" class using reward transformation.

**For detailed SFS analysis, see `SFS.ipynb`**

In [None]:
# Expected total branch length for each "ton" class via reward transformation.

# Vertex states do not depend on the class being rewarded, so read them from
# the graph once instead of once per class.
vertex_states = [
    graph.vertex_at(v_idx).state() for v_idx in range(graph.vertices_length())
]

sfs = []
for i in range(n):
    # Reward of a vertex = number of lineages with (i+1) descendants in its
    # state (a count, not a 0/1 indicator)
    rewards = np.array([state[i] for state in vertex_states], dtype=float)

    # First moment of the reward-transformed graph = expected branch length
    reward_graph = graph.reward_transform(rewards)
    sfs.append(reward_graph.phase_type_moment(1))

print("Site Frequency Spectrum:")
for i, val in enumerate(sfs[:-1]):  # Exclude last class (n-ton = MRCA)
    print(f"  {i+1}-ton: {val:.4f}")

In [None]:
# Bar chart of expected branch length for the 1-ton ... (n-1)-ton classes.
# The x positions are derived from `sfs` itself rather than from the global
# `n`, so bars and ticks stay aligned even if `n` has been rebound elsewhere.
branch_lengths = sfs[:-1]  # drop the last (n-ton) class
ton_classes = range(1, len(sfs))

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(ton_classes, branch_lengths, color='C0', alpha=0.7)
ax.set_xlabel('Number of descendants (i)')
ax.set_ylabel('Expected branch length')
ax.set_title('Site Frequency Spectrum')
ax.set_xticks(ton_classes)
fig.tight_layout()
sns.despine(ax=ax)
plt.show()

## Scaling with Sample Size

In [None]:
# Compare TMRCA for different sample sizes
sample_sizes = [2, 3, 4, 5, 6, 8, 10]
results = []

# Deliberately not `for n in ...`: that would rebind the notebook-level `n`
# that belongs to `graph` from the n=4 example, so re-running an earlier cell
# afterwards would pair the n=4 graph with n=10.
for sample_size in sample_sizes:
    g = build_coalescent(sample_size)
    exp_t = g.phase_type_moment(1)
    results.append({
        'n': sample_size,
        'vertices': g.vertices_length(),
        'E[TMRCA]': exp_t
    })

# One row per sample size: graph size and expected TMRCA
df = pd.DataFrame(results)
print("\nCoalescent scaling:")
print(df.to_string(index=False))

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
size_ax, tmrca_ax = axes

# Left panel: number of vertices in the graph for each sample size
size_ax.plot(df['n'], df['vertices'], marker='o', linewidth=2)
size_ax.set(
    xlabel='Sample size (n)',
    ylabel='Number of vertices',
    title='Graph Complexity vs Sample Size',
)

# Right panel: expected TMRCA for each sample size
tmrca_ax.plot(df['n'], df['E[TMRCA]'], marker='o', linewidth=2, color='C1')
tmrca_ax.set(
    xlabel='Sample size (n)',
    ylabel='E[TMRCA]',
    title='Expected TMRCA vs Sample Size',
)

fig.tight_layout()
sns.despine(fig=fig)
plt.show()

## Summary

This notebook demonstrates:
- Standard coalescent graph construction
- TMRCA moments and distribution
- Site Frequency Spectrum computation
- Scaling behavior with sample size

**Key insights:**
- $E[\mathrm{TMRCA}] = 2(1 - 1/n)$ for sample size $n$, approaching 2 as $n$ grows
- SFS is dominated by singletons (1-ton)
- Graph complexity grows with sample size

**For more advanced topics, see:**
- `SFS.ipynb`: Detailed SFS analysis including discrete mutation models
- `showcase.ipynb`: Covariance matrices, time-inhomogeneous models
- `im_model.ipynb`: Structured populations with migration
- `two-locus-arg.ipynb`: Ancestral recombination graphs