# Minimum variance theta

In [None]:
# Always import phasic first to set jax backend correctly
import phasic
import numpy as np
# Seed NumPy's global random generator for reproducible runs
np.random.seed(42)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Request both 'retina' and 'png' output formats for inline figures
set_matplotlib_formats('retina', 'png')
import matplotlib
# Default figure size (width, height) in inches
matplotlib.rcParams['figure.figsize'] = (10, 7)
# Seaborn 'paper' context with fonts scaled by 1.4
sns.set_context('paper', font_scale=1.4)
# Select phasic's 'dark' theme (presumably affects plot styling; confirm in phasic docs)
phasic.set_theme('dark')

## Generate standard coalescent state space

The function below builds the state space of the standard coalescent for a sample of size `n`:

In [None]:
def standard_coalescent(n):
    """
    Build standard coalescent graph for sample size n.

    Each state vector has n + 1 positions, where state[i] = number of
    lineages with (i+1) descendants. The last position is never populated
    (no lineage can have n + 1 descendants); it is kept so that every state
    has the same length as the state length the graph is created with.

    Parameters
    ----------
    n : int
        Sample size (must be at least 1)

    Returns
    -------
    phasic.Graph
        Coalescent graph

    Raises
    ------
    ValueError
        If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    state_vector_length = n + 1
    graph = phasic.Graph(state_vector_length)

    # All n sampled lineages start as singletons (one descendant each).
    # The vector must have the same length as the graph's state length.
    initial_state = np.zeros(state_vector_length, dtype=int)
    initial_state[0] = n

    graph.starting_vertex().add_edge(
        graph.find_or_create_vertex(initial_state),
        1
    )

    # Index 0 is the starting vertex. The loop bound is re-read on every
    # pass, so vertices created while processing earlier ones are visited too.
    index = 1
    while index < graph.vertices_length():
        vertex = graph.vertex_at(index)
        # The state of this vertex does not change while its edges are
        # added, so fetch it once instead of once per (i, j) pair.
        state = vertex.state()

        # loop over all unordered pairs of lineage classes
        for i in range(n):
            for j in range(i, n):
                if i == j:
                    # same class: need at least two lineages to coalesce
                    if state[i] < 2:
                        continue
                    # number of distinct pairs within the class
                    rate = state[i] * (state[i] - 1) / 2
                else:
                    # different classes: need at least one lineage in each
                    if state[i] < 1 or state[j] < 1:
                        continue
                    # number of cross-class pairs
                    rate = state[i] * state[j]

                child_state = state.copy()
                child_state[i] = child_state[i] - 1
                child_state[j] = child_state[j] - 1
                # Merging a lineage with (i+1) descendants and one with (j+1)
                # descendants yields one with (i+j+2) descendants, which is
                # stored at 0-based index i+j+1.
                child_state[i + j + 1] = child_state[i + j + 1] + 1

                vertex.add_edge(
                    graph.find_or_create_vertex(child_state),
                    rate
                )

        index = index + 1

    return graph

## Minimum variance theta

This notebook demonstrates minimum variance estimation of the population mutation rate parameter θ (theta) from the site frequency spectrum (SFS).

The standard Watterson estimator uses equal weights for all frequency classes. However, because the SFS components have different variances and covariances, we can construct optimal weights that minimize the variance of the theta estimator.

The minimum variance unbiased linear estimator (MVUE) is given by:

$$\hat{\theta}_{MVUE} = \sum_{i=1}^{n-1} w_i \xi_i$$

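where $\xi_i$ is the observed SFS entry for the $i$-ton class (the number of mutations carried by exactly $i$ of the $n$ samples, which arise on branches with $i$ descendants), and the weights $w$ are chosen to minimize variance subject to the constraint that the estimator is unbiased.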

In [None]:
# Example 1: With specific theta value
theta = 0.00128
n = 4
graph = standard_coalescent(n)

# Collect the state of every vertex, then transpose so that each row holds
# one state-vector position across all vertices (matches the R layout)
vertex_states = [graph.vertex_at(k).state() for k in range(graph.vertices_length())]
state_matrix = np.array(vertex_states).T
rewards = state_matrix

# Expected values under standard coalescent: theta * (1/1, 1/2, ..., 1/(n-1))
v = theta * (1 / np.arange(1, n))

# Pairwise covariances between the first (n-1) reward rows
cov_mat = np.zeros((n-1, n-1))
for i, j in np.ndindex(cov_mat.shape):
    cov_mat[i, j] = graph.covariance(rewards[i, :], rewards[j, :])

# Compute optimal weights
# A = (theta^2/4) * Cov^{-1} + diag(v)
# NOTE(review): only Cov is inverted here; a generalized-least-squares
# estimator would invert the full matrix (theta^2/4) * Cov + diag(v).
# Confirm which form is intended.
inv_cov = np.linalg.inv(cov_mat)
A = (theta**2 / 4) * inv_cov + np.diag(v)

# w = A * v / (v^T * A * v), so that v^T * w == 1
numerator = A @ v
denominator = v.T @ A @ v
w = numerator / denominator

print("Optimal weights (with theta scaling):")
print(w)

In [None]:
# Verify that weights are normalized correctly:
# w was divided by v^T * A * v, so the inner product of v and w should
# print as 1 (up to floating-point rounding)
print(f"v^T * w = {v @ w}")

In [None]:
# Example 2: Scale-free formulation (without specific theta)
n = 4
graph = standard_coalescent(n)

# Rows are state-vector positions, columns are vertices
vertex_states = [graph.vertex_at(k).state() for k in range(graph.vertices_length())]
state_matrix = np.array(vertex_states).T
rewards = state_matrix

# Expected values (without theta scaling): 1/1, 1/2, ..., 1/(n-1)
v = 1 / np.arange(1, n)

# Pairwise covariances between the first (n-1) reward rows
cov_mat = np.zeros((n-1, n-1))
for i, j in np.ndindex(cov_mat.shape):
    cov_mat[i, j] = graph.covariance(rewards[i, :], rewards[j, :])

# Compute optimal weights (scale-free); same formula as the theta-scaled
# example but without the theta^2/4 factor on the inverse covariance
inv_cov = np.linalg.inv(cov_mat)
A = inv_cov + np.diag(v)
numerator = A @ v
denominator = v.T @ A @ v
w = numerator / denominator

print("Optimal weights (scale-free):")
print(w)

### Interpretation

Notice that the optimal weights can be negative! This is because:

1. The SFS components are correlated (positive covariances)
2. Some frequency classes have higher variance than others
3. The minimum variance estimator exploits these correlations

The weights tell us how much to weight each frequency class when estimating theta from observed SFS data.

In [None]:
# Example observed SFS (using expected values)
# NOTE: this binds some_obs_sfs to the same array object as v (no copy is made),
# so in-place changes to one are visible through the other
some_obs_sfs = v
print(f"Sum of observed SFS: {np.sum(some_obs_sfs)}")

In [None]:
# Theta estimate using optimal weights: the weighted sum of the SFS entries
weighted_sfs = some_obs_sfs * w
theta_hat = weighted_sfs.sum()
print(f"Theta estimate (optimal weights): {theta_hat}")

In [None]:
# Ratio of observed to weights (for diagnostic purposes)
# Element-wise division; an entry of w equal to zero would give inf or nan here
print("Observed SFS / weights:")
print(some_obs_sfs / w)

In [None]:
# Compare to naive sum of weights: the product of the two separate sums,
# as opposed to the element-wise weighted sum sum(obs_sfs * w)
print(f"Sum(obs_sfs) * Sum(weights) = {np.sum(some_obs_sfs) * np.sum(w)}")

In [None]:
# Alternative: weighted sum using expected values as weights
# (when some_obs_sfs is still bound to v, this is the sum of squares of v)
print(f"Sum(obs_sfs * v) = {np.sum(some_obs_sfs * v)}")

## Visualizing the covariance structure

In [None]:
# Heatmap of the covariance matrix, annotated with the cell values
fig, ax = plt.subplots(figsize=(8, 6))

# Frequency classes 1 .. n-1 label both axes
ticks = list(range(1, n))
sns.heatmap(
    cov_mat,
    ax=ax,
    xticklabels=ticks,
    yticklabels=ticks,
    annot=True,
    fmt=".4f",
    cmap="PiYG",
    center=0,  # centre the diverging colormap on zero covariance
)
ax.set(
    xlabel='Frequency class',
    ylabel='Frequency class',
    title='Covariance matrix of SFS branch lengths',
)
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart comparing three weighting schemes per frequency class
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(1, n)
width = 0.25

# Equal weights (normalized to sum to one)
equal_weights = np.ones(n-1) / (n-1)

# Weights proportional to the expected values v (normalized to sum to one)
# NOTE(review): Watterson's estimator is usually described as weighting all
# classes equally; confirm that the 'Watterson (1/i)' label is intended.
watterson_weights = v / np.sum(v)

# (horizontal offset, weights, legend label) for each group of bars
schemes = [
    (-width, w, 'Optimal (MVUE)'),
    (0, equal_weights, 'Equal weights'),
    (width, watterson_weights, 'Watterson (1/i)'),
]
for offset, weights, label in schemes:
    ax.bar(x + offset, weights, width, label=label, alpha=0.8)

ax.set(
    xlabel='Frequency class',
    ylabel='Weight',
    title='Comparison of weighting schemes for theta estimation',
)
ax.set_xticks(x)
ax.legend()
# Zero baseline makes negative weights easy to spot
ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
sns.despine()
plt.tight_layout()
plt.show()

## Summary

This notebook demonstrates:

1. **Construction of standard coalescent graph** for computing SFS moments
2. **Covariance matrix computation** between different frequency classes
3. **Minimum variance unbiased linear estimator (MVUE)** for theta
4. **Optimal weights** that can be negative due to correlations in the SFS

**Key insights:**
- The SFS components are positively correlated
- Different frequency classes have different variances
- Optimal weights exploit these correlations to minimize estimation variance
- The MVUE can have negative weights for some frequency classes

**Related concepts:**
- Watterson estimator: weights all frequency classes equally ($\hat{\theta}_W = \sum_i \xi_i / a_n$ with $a_n = \sum_{i=1}^{n-1} 1/i$)
- Tajima's D: tests deviation from neutral SFS using variance structure
- Fu and Li's tests: also exploit correlations in the SFS

**For more details, see:**
- [Fu (1995)](https://doi.org/10.1093/genetics/140.1.435) - Statistical properties of SFS
- [Achaz (2009)](https://doi.org/10.1534/genetics.108.092908) - Frequency spectrum neutrality tests
- `SFS.ipynb`: Detailed SFS computation examples
- `showcase.ipynb`: Covariance computation examples