In [None]:
# Always import phasic first to set jax backend correctly
import phasic
import numpy as np
# Seed NumPy's legacy global RNG so any random draws are reproducible across runs
np.random.seed(42)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Request 'retina' and 'png' output formats for inline figures
set_matplotlib_formats('retina', 'png')
import matplotlib
# Default figure size (width, height) for all subsequent plots
matplotlib.rcParams['figure.figsize'] = (8, 5.5)
# seaborn 'paper' context with slightly reduced font scaling
sns.set_context('paper', font_scale=0.9)
# Select phasic's 'dark' theme (presumably affects plot styling -- confirm in the phasic docs)
phasic.set_theme('dark')

# Distribution of Branch Lengths Conditioned on Selection on a Derived Variant

This notebook explores coalescent models with natural selection affecting a derived genetic variant.

**Key concepts:**
- Coalescent process with selection coefficient s
- Conditioning on allele frequency bins
- Branch length distributions for ancestral vs derived lineages
- Sanity checks on expected coalescent times

**Note:** This notebook demonstrates theoretical calculations for coalescent models with selection. The Python API does not currently expose the C++ `construct_coalescent_selection_graph` function, so we focus on manual graph construction and validation of basic coalescent properties.

## Utility Functions

In [None]:
def state_legend(graph, sample_size):
    """
    Print a legend showing the state vectors and their interpretation.

    One row is printed per vertex: the comma-joined state vector, one
    ``anc/der`` count column for each ton class ``1..sample_size-1``, and
    the last entry of the state (the "freq bin" column; 0 is printed when
    the state is empty).

    Parameters
    ----------
    graph : phasic.Graph
        The coalescent graph. Only ``vertices_length()``, ``vertex_at(i)``
        and ``vertex.state()`` are used.
    sample_size : int
        Sample size

    Notes
    -----
    The per-ton counts are placeholders: decoding them from the state
    vector depends on the specific state encoding and is not implemented
    here, so every count column currently prints ``0/0``.
    """
    # Never ask numpy for a negative-length array when sample_size < 1.
    n_tons = max(sample_size - 1, 0)

    print('state\t\t\ttons (anc/der)\t\tfreq bin')
    print('\t\t', '\t'.join(str(i) for i in range(1, sample_size)), sep='\t')

    for v_idx in range(graph.vertices_length()):
        state = graph.vertex_at(v_idx).state()

        nr_anc_tons = np.zeros(n_tons, dtype=int)
        nr_der_tons = np.zeros(n_tons, dtype=int)

        # TODO: parse the state vector to count ancestral and derived tons
        # (implementation depends on the specific state encoding).

        print(','.join(str(s) for s in state), end='\t')
        for anc, der in zip(nr_anc_tons, nr_der_tons):
            # Ancestral count first, so the cells match the "anc/der" header.
            print(f"{anc}/{der}", end='\t')
        # len() rather than truthiness: state may be a numpy array.
        print(state[-1] if len(state) > 0 else 0)

## Standard Coalescent

Before exploring selection models, let's verify basic coalescent properties.

### Construction from Matrix

If you already have the sub-intensity matrix (SIM) and initial probability vector (IPV):

In [None]:
# Coalescent for n=4 written as a sub-intensity matrix over four transient states.
# Row 1: 4 lineages, exit rate 6 = C(4,2).
# Row 2: 3 lineages, exit rate 3 = C(3,2), split 1:2 between the two states below.
# Rows 3-4: exit rate 1 = C(2,2) each, i.e. presumably two distinct two-lineage
#           configurations (the matrix itself does not say what distinguishes them).
# The absorbing state (1 lineage, the MRCA) is not a row of the matrix; rows whose
# sum is negative leak that rate to absorption.
sim = np.array([
    [-6,  6,  0,  0],
    [ 0, -3,  1,  2],
    [ 0,  0, -1,  0],
    [ 0,  0,  0, -1]
], dtype=float)

# Initial probability vector: all mass on the first (four-lineage) state.
ipv = np.array([1, 0, 0, 0], dtype=float)

row_sums = sim.sum(axis=1)
# Enforce the property the printout only used to describe.
assert np.all(row_sums <= 1e-12), "sub-intensity matrix rows must sum to <= 0"

print("Sub-intensity matrix:")
print(sim)
print(f"\nRow sums (should be ≤ 0): {row_sums}")

## Sanity Checks

Verify theoretical expectations for specific coalescent scenarios.

### Single Ancestral, Single Derived

With $n=2$ and $d=1$, the expectation must be 2: 1 (the coalescence time of all derived sequences, which make up half the population, so 2/2) + 1 (the coalescence time of two ancestral sequences in an all-ancestral population):

$$ N + \frac{1}{\mu} \;\rightarrow\; 1 + \frac{1}{N\mu} = 1 + 1 = 2 $$

The two rates are $\frac{1}{N}$ and $\mu$. Scaled by $N$ they are 1 and $N\mu$, so the total expectation is $1 + \frac{1}{N\mu}$. Since we have sampled one derived and one ancestral sequence, the mean of the prior frequency is 0.5. This implies that the prior on $N\mu$ is 1: our single observation produces the expectation that, for two randomly sampled individuals, a mutation happens on one of the two branches with probability one, producing a pairwise diversity of $2N\mu = 2$, so $N\mu = 1$.

In [None]:
# n=2, d=1: the expected TMRCA should come out at roughly 2.
# Computing it (rather than just stating it) needs the coalescent-selection
# graph constructor, which the Python API does not currently expose.

# Theoretical expectation, reported as plain text
for message in (
    "Expected TMRCA for n=2, d=1: ~2.0",
    "(Exact value depends on N and selection coefficient)",
):
    print(message)

### Many Ancestral, Single Derived

With very large $n$ and $d=1$, the expectation converges to the total tree height ($2N$ for haploid, i.e. 2 in scaled time) plus a small extra waiting time, because the mean of the prior on $N\mu$ is given by:

$$2N\mu = \frac{n}{\binom{n}{2}} = \frac{2n}{n(n-1)} = \frac{2}{n-1} \rightarrow N\mu = \frac{1}{n-1} $$

So the expectation should be $2 + \frac{1}{n-1}$, which for $n=20$ and $d=1$ would be $\approx 2.05263$:

In [None]:
# Closed-form expectation for a sample of 20 with a single derived copy
n = 20
# Extra waiting time on top of the tree height of 2: 1/(n-1)
prior_mean_n_mu = 1 / (n - 1)
expected_tmrca = 2 + prior_mean_n_mu
print(f"Theoretical E[TMRCA] for n={n}, d=1: {expected_tmrca:.5f}")

### Many Ancestral vs Many Derived

The following two scenarios should have the same expectation:
- $n=3$, $d=2$ (2 derived, 1 ancestral)
- $n=3$, $d=1$ (1 derived, 2 ancestral)

This is due to symmetry in the coalescent process.

In [None]:
# Build a standard coalescent graph for an arbitrary sample size n
def build_simple_coalescent(n):
    """Build standard coalescent graph for n lineages.

    Each vertex state is a length-1 vector holding the number of lineages
    remaining. A vertex with k > 1 lineages gets a single edge to the
    vertex with k - 1 lineages, weighted k*(k-1)/2 ("k choose 2"). The
    vertex with one lineage gets no outgoing edge.

    Parameters
    ----------
    n : int
        Number of sampled lineages; must be a positive whole number.

    Returns
    -------
    phasic.Graph
        Graph whose starting vertex has one edge (weight 1.0) into the
        n-lineage state.

    Raises
    ------
    ValueError
        If ``n`` is not a whole number or is smaller than 1. Previously
        such values were silently truncated by ``dtype=int`` or produced
        a graph with a meaningless "0 lineages" state.
    """
    if n != int(n) or n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    n = int(n)

    graph = phasic.Graph(1)

    # State = number of lineages remaining
    first_vertex = graph.find_or_create_vertex(np.array([n], dtype=int))
    graph.starting_vertex().add_edge(first_vertex, 1.0)

    # Worklist scan: vertices created while scanning are appended and
    # visited in turn. Index 0 is the starting vertex, so begin at 1.
    index = 1
    while index < graph.vertices_length():
        vertex = graph.vertex_at(index)
        # Plain Python int keeps the rate arithmetic independent of the
        # integer type returned by state().
        k = int(vertex.state()[0])  # Number of lineages

        if k > 1:
            # Coalescence rate: k choose 2
            rate = k * (k - 1) / 2.0
            child_vertex = graph.find_or_create_vertex(np.array([k - 1], dtype=int))
            vertex.add_edge(child_vertex, rate)

        index += 1

    return graph

# Build and compute expectation
# Three-lineage standard coalescent; phase_type_moment(1) is presumably the
# first moment of the absorption time, which the label below reports as E[TMRCA].
graph_3 = build_simple_coalescent(3)
exp_tmrca_3 = graph_3.phase_type_moment(1)
print(f"E[TMRCA] for n=3: {exp_tmrca_3:.6f}")
# Closed form 2(1 - 1/n) evaluated at n=3, for side-by-side comparison
print(f"Theoretical: {2*(1 - 1/3):.6f}")

### Two Derived and Two Ancestral

For $n=4$ with $d=2$ (2 derived, 2 ancestral), the expected TMRCA involves:
- Initial coalescence within derived or ancestral groups
- Eventual coalescence between the two groups

The calculation considers multiple paths through the state space.

In [None]:
# Manual calculation of expected TMRCA
# This matches the R calculation in the original notebook
# The expression is a hand-derived sum: a leading 1/4 term plus two branches,
# each weighted 1/2; the second branch nests further 6/15 and 9/15 weights.
# NOTE(review): the meaning of each weight is not derivable from this cell --
# confirm against the original R notebook before editing any term.
exp_manual = (1/4 + 1/2*(2/3 + 1 + 1) + 
              1/2*(6/15 + 6/15*(1+1) + 9/15*(1/3 + 1/3*(1) + 2/3*(1/3))))
print(f"Manual calculation: {exp_manual:.6f}")

# Build standard coalescent for n=4
# (no selection; printed next to the manual value, which is for the d=2 case)
graph_4 = build_simple_coalescent(4)
exp_tmrca_4 = graph_4.phase_type_moment(1)
print(f"Standard coalescent E[TMRCA] for n=4: {exp_tmrca_4:.6f}")
# Closed form 2(1 - 1/n) evaluated at n=4
print(f"Theoretical: {2*(1 - 1/4):.6f}")

### Coalescence Rate Calculations

Understanding the correct coalescence rates is critical for selection models.

In [None]:
# Each scenario averages two three-term sums (hence the trailing "/ 2").
# The second sum is identical in scenarios 1 and 2; only the first sum
# changes between them.

# First scenario: incorrect rates (1/2, not 1/6)
scenario_1 = ((1/2 + 1/3 + 1) + (1/2 + 1/2 + 1)) / 2
print(f"Scenario 1 (1/2 rates): {scenario_1:.6f}")
# Same quantity written as 1 + 11/12, printed as a cross-check of scenario 1
print(f"Alternative: 1 + 11/12 = {1 + 11/12:.6f}")

# Second scenario: with 1/6 rates
scenario_2 = ((1/2 + 1/6 + 1/2) + (1/2 + 1/2 + 1)) / 2
print(f"Scenario 2 (1/6 rates): {scenario_2:.6f}")

# Third scenario: accounting for derived frequency 0.5
# (relative to scenario 1, every 1/2 term is replaced by 1/4)
scenario_3 = ((1/4 + 1/3 + 1) + (1/4 + 1/4 + 1)) / 2
print(f"Scenario 3 (freq 0.5): {scenario_3:.6f}")

## Notes on Selection Models

**Key insight:** The derived frequency must be properly accounted for in the coalescence rates. When the derived frequency is forced to zero, coalescence between ancestral and derived lineages occurs at the standard rate (1).

**Implementation considerations:**
- The problem is that the derived frequency is not forced to zero, which makes the derived frequencies larger than they should be
- This is solved by only allowing coalescence between ancestral and derived lineages after the derived frequency has been fixed at zero (where the coalescence rate is 1)
- Selection coefficient $s$ modifies the coalescence rates based on allele frequency
- Frequency bins discretize the continuous frequency trajectory

## Graph Visualization

Visualize the structure of a simple coalescent graph.

In [None]:
# Build and visualize n=4 coalescent
graph = build_simple_coalescent(4)

print(f"Graph has {graph.vertices_length()} vertices")
print("\nVertex states (number of lineages):")
# The listing starts at index 0, which is presumably the starting vertex
# (build_simple_coalescent begins its own scan at index 1), so the first row
# is not a lineage-count state in the same sense as the others.
for i in range(graph.vertices_length()):
    vertex = graph.vertex_at(i)
    state = vertex.state()
    # States are length-1 vectors here; entry 0 is the lineage count
    print(f"  Vertex {i}: {state[0]} lineages")

# Plot the graph
# Kept as the last expression so the notebook displays whatever plot() returns
graph.plot()

## Moment Analysis

Compare expected TMRCA across different sample sizes.

In [None]:
# Tabulate computed vs closed-form E[TMRCA] over a range of sample sizes
sample_sizes = [2, 3, 4, 5, 6, 8, 10, 15, 20]
results = []

for n in sample_sizes:
    graph = build_simple_coalescent(n)
    exp_t = graph.phase_type_moment(1)
    theoretical = 2 * (1 - 1/n)

    # One record per sample size; keys become the table's column names
    row = {'n': n, 'vertices': graph.vertices_length()}
    row.update({'E[TMRCA]': exp_t, 'Theoretical': theoretical})
    row['Error'] = abs(exp_t - theoretical)
    results.append(row)

# Build the frame once from the collected records
df = pd.DataFrame(results)
print("\nCoalescent TMRCA expectations:")
print(df.to_string(index=False))

In [None]:
# Two side-by-side panels: expectation vs sample size (left), numerical error (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_mean, ax_err = axes
sizes = df['n']

# Left panel: computed values against the closed form
ax_mean.plot(sizes, df['E[TMRCA]'], 'o-', linewidth=2, markersize=8, label='Computed')
ax_mean.plot(sizes, df['Theoretical'], '--', linewidth=2, label='Theoretical: 2(1-1/n)')
ax_mean.set_xlabel('Sample size (n)', fontsize=12)
ax_mean.set_ylabel('E[TMRCA]', fontsize=12)
ax_mean.set_title('Expected TMRCA vs Sample Size', fontsize=13)
ax_mean.legend()
ax_mean.grid(alpha=0.3)

# Right panel: absolute error on a logarithmic y-axis
ax_err.semilogy(sizes, df['Error'], 'o-', linewidth=2, markersize=8, color='C3')
ax_err.set_xlabel('Sample size (n)', fontsize=12)
ax_err.set_ylabel('Absolute error', fontsize=12)
ax_err.set_title('Numerical Error', fontsize=13)
ax_err.grid(alpha=0.3)

plt.tight_layout()
sns.despine()
plt.show()

## PDF Analysis

Examine the probability density function of TMRCA.

In [None]:
# 2x2 grid of TMRCA densities, one panel per sample size
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (ax, n) in enumerate(zip(axes, [3, 5, 10, 20])):
    graph = build_simple_coalescent(n)
    exp_t = graph.phase_type_moment(1)

    # Evaluate the density on 200 points from 0 to four times the mean
    times = np.linspace(0, exp_t * 4, 200)
    pdf = graph.pdf(times)

    # Panel colour follows the panel index
    color = f'C{idx}'
    ax.plot(times, pdf, linewidth=2.5, color=color)
    ax.axvline(exp_t, color='red', linestyle='--', linewidth=2,
               label=f'E[T] = {exp_t:.3f}')
    ax.fill_between(times, pdf, alpha=0.3, color=color)
    ax.set_xlabel('Time', fontsize=11)
    ax.set_ylabel('Probability density', fontsize=11)
    ax.set_title(f'TMRCA Distribution (n={n})', fontsize=12)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
sns.despine()
plt.show()

## Summary

This notebook demonstrates:

1. **Theoretical expectations** for coalescent models with selection
2. **Sanity checks** validating coalescence time calculations
3. **Rate calculations** for different frequency configurations
4. **Standard coalescent** as a baseline for comparison

**Key insights:**
- E[TMRCA] = 2(1 - 1/n) for standard coalescent with sample size n
- Selection modifies coalescence rates based on derived allele frequency
- Proper handling of frequency bins is critical for accurate selection models
- Symmetry: (n, d) and (n, n-d) scenarios have identical expectations

**Related notebooks:**
- `coalescent.ipynb`: Standard coalescent without selection
- `SFS.ipynb`: Site frequency spectrum computation
- `showcase.ipynb`: Advanced coalescent examples with rewards