# pc_relate_2

In [None]:
# %env HAIL_QUERY_BACKEND=local

In [None]:
import hail as hl
# hl.utils.get_1kg('tmp/')
# Load the matrix table stored under the local tmp/ directory.
mt = hl.read_matrix_table('tmp/1kg.mt')
# Trailing expression so the notebook displays the result of count().
mt.count()

Compute results from current implementation located at `hail/hail/python/hail/methods/relatedness/pc_relate.py`:

In [None]:
%%time
# Baseline result from the existing hl.pc_relate; the cell magic above times it.
pc_rel = hl.pc_relate(mt.GT, min_individual_maf=0.01, k=10)

Steps from current implementation that we can just reuse:

In [None]:
# Get PC scores
from hail.methods.pca import hwe_normalized_pca

# Only the middle element of the returned triple (the scores) is used;
# loadings are not computed.
_, scores, _ = hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
scores_expr = scores[mt.col_key].scores

# Attach the scores to the columns, drop the column key, and keep a table
# holding just the `__scores` field.
scores_table = (
    mt.select_cols(__scores=scores_expr)
    .key_cols_by()
    .select_cols('__scores')
    .cols()
)

In [None]:
# Check for missing scores, create entries for g matrix
import hail.expr.aggregators as agg

# Count the columns whose scores array is missing; refuse to continue if any exist.
score_is_missing = hl.is_missing(scores_table.__scores)
n_missing = scores_table.aggregate(agg.count_where(score_is_missing))
if n_missing > 0:
    raise ValueError(f'Found {n_missing} columns with missing scores array.')

# Keep only the alternate-allele count per entry, then unfilter entries.
alt_allele_count = mt.GT.n_alt_alleles()
mt = mt.select_entries(__gt=alt_allele_count).unfilter_entries()

# Per-row mean of __gt, used as the fallback value wherever __gt is missing.
mt = mt.annotate_rows(__mean_gt=agg.mean(mt.__gt))
mean_imputed_gt = hl.or_else(hl.float64(mt.__gt), mt.__mean_gt)

In [None]:
# Get PCs and g matrix
from hail.linalg import BlockMatrix

block_size = BlockMatrix.default_block_size()

# Build the block matrix from the mean-imputed genotype expression and persist it.
g_bm = BlockMatrix.from_entry_expr(mean_imputed_gt, block_size=block_size)
g_bm = g_bm.persist()

# Collect the score rows with _localize=False (so the result can still be
# mapped over) and keep only the `__scores` field of each row.
collected_scores = scores_table.collect(_localize=False)
pcs = collected_scores.map(lambda row: row.__scores)

At this point, the current implementation calls:

```
ht = Table(ir.BlockMatrixToTableApply(g._bmir, pcs._ir, {
    'name': 'PCRelate',
    'maf': min_individual_maf,
    'blockSize': block_size,
    'minKinship': min_kinship,
    'statistics': {'kin': 0, 'kin2': 1, 'kin20': 2, 'all': 3}[statistics]
}))
```

So we want to replace the Scala code at `hail/hail/src/main/scala/is/hail/methods/PCRelate.scala` with Python that can run on the query backend.

Below we work out a new implementation, which we will refer to as `pc_relate_2`.

In [None]:
# Concat array of ones (intercept) with PCs, do QR
pcs_nd = hl.nd.array(pcs)

# Column of ones with as many rows as pcs_nd, prepended as the intercept.
n_score_rows = pcs_nd.shape[0]
intercept_nd = hl.nd.ones((n_score_rows, 1))
v_nd = hl.nd.concatenate([intercept_nd, pcs_nd], axis=1)

# Reduced QR of v, then inv(r) @ q.T for use in the later beta computation.
q_nd, r_nd = hl.nd.qr(v_nd, mode='reduced')
r_inverse_nd = hl.nd.inv(r_nd)
rinv_qt_nd = r_inverse_nd @ q_nd.T

In [None]:
# Check dims
# Evaluate the shape of each ndarray expression, keyed by a short label.
labelled_ndarrays = [
    ('v', v_nd),
    ('q', q_nd),
    ('r', r_nd),
    ('rinv @ qT', rinv_qt_nd),
]
nd_shapes = {label: hl.eval(nd_expr.shape) for label, nd_expr in labelled_ndarrays}
print(nd_shapes)

In [None]:
# Convert inv(r) @ q.T to bm for computing beta
rinv_qt_np = hl.eval(rinv_qt_nd)
rinv_qt_bm = BlockMatrix.from_numpy(rinv_qt_np)
beta_bm = rinv_qt_bm @ g_bm.T
beta_bm = beta_bm.persist()

# Convert v to bm for computing mu
v_np = hl.eval(v_nd)
v_bm = BlockMatrix.from_numpy(v_np)
fitted_bm = v_bm @ beta_bm
# mu is half of the transposed fitted values, persisted for reuse.
mu_bm = (0.5 * fitted_bm.T).persist()

In [None]:
# Check dims again
# BlockMatrix.shape is already a plain (n_rows, n_cols) Python tuple, so it is
# read directly instead of being sent through hl.eval (which would trigger a
# needless backend evaluation for each entry).
bm_shapes = {
    'rinv @ qT': rinv_qt_bm.shape,
    'beta': beta_bm.shape,
    'v': v_bm.shape,
    'mu': mu_bm.shape,
    'g': g_bm.shape,
}
print(bm_shapes)

Define a few helper functions, used below to check the entries of the mu and g matrices and to compute a Gram matrix.

In [None]:
# Threshold handed to _bad_mu when screening the mu entries; same value as the
# min_individual_maf argument given to hl.pc_relate earlier in this notebook.
min_individual_maf = 0.01

def _bad_mu(mu, maf):
    return (mu <= maf) | (mu >= (1.0 - maf)) | (mu <= 0.0) | (mu >= 1.0)

def _bad_gt(gt):
    """Flag a genotype value that is not exactly 0, 1 or 2 (as float64)."""
    not_zero = gt != hl.float64(0)
    not_one = gt != hl.float64(1)
    not_two = gt != hl.float64(2)
    return not_zero & not_one & not_two

def _gram(M):
    return M.T @ M

This is a workaround that avoids using the Block Matrix `map` function.

Convert Block Matrices for g, mu to matrix tables to check bad values and get centered AFs. 

Annotate entries for mu^2, (1-mu)^2, variance, std_dev, centered_af dealing with bad values. 

Convert needed matrix table entries back to Block Matrices and compute estimate for phi.

In [None]:
# Define NaN to use instead of missing values, otherwise cannot go back to block matrix
nan = hl.literal(0) / 0

# g: any entry that is not exactly 0, 1 or 2 becomes NaN; the raw `element`
# field is dropped afterwards.
g_mt = g_bm.to_matrix_table_row_major()
g_or_nan = hl.if_else(_bad_gt(g_mt.element), nan, g_mt.element)
g_mt = g_mt.annotate_entries(g=g_or_nan).drop("element")

# pre_mu: any entry of the mu matrix rejected by _bad_mu becomes NaN.
pre_mu_mt = mu_bm.to_matrix_table_row_major()
pre_mu_is_bad = _bad_mu(pre_mu_mt.element, min_individual_maf)
pre_mu_or_nan = hl.if_else(pre_mu_is_bad, nan, pre_mu_mt.element)
pre_mu_mt = pre_mu_mt.annotate_entries(pre_mu=pre_mu_or_nan).drop("element")


# Use bm_mt to store entries for g, pre_mu, mu, var, and centered_af
pre_mu_entry = pre_mu_mt[g_mt.row_idx, g_mt.col_idx]
bm_mt = g_mt.annotate_entries(pre_mu=pre_mu_entry.pre_mu)

# mu is NaN wherever either g or pre_mu is NaN; otherwise it is pre_mu.
g_or_pre_mu_is_nan = hl.is_nan(bm_mt.g) | hl.is_nan(bm_mt.pre_mu)
bm_mt = bm_mt.annotate_entries(mu=hl.if_else(g_or_pre_mu_is_nan, nan, bm_mt.pre_mu))

# Derived per-entry quantities; each one is set to 0.0 where mu is NaN.
mu_is_nan = hl.is_nan(bm_mt.mu)
mu_expr = bm_mt.mu
bm_mt = bm_mt.annotate_entries(
    mu2=hl.if_else(mu_is_nan, 0.0, mu_expr ** 2),
    one_minus_mu2=hl.if_else(mu_is_nan, 0.0, (1.0 - mu_expr) ** 2),
    variance=hl.if_else(mu_is_nan, 0.0, (mu_expr * (1.0 - mu_expr))),
    centered_af=hl.if_else(mu_is_nan, 0.0, (bm_mt.g / 2) - mu_expr),
)
bm_mt = bm_mt.annotate_entries(std_dev=hl.sqrt(bm_mt.variance))
bm_mt.show(n_cols=1)

Now we can compute our estimate of phi, and compare to the existing pc_relate implementation in Hail:

In [None]:
# Move the centered_af and std_dev entries back into block matrices.
centered_af_bm = BlockMatrix.from_entry_expr(bm_mt.centered_af)
std_dev_bm = BlockMatrix.from_entry_expr(bm_mt.std_dev)

# phi is the element-wise ratio of the two Gram matrices.
phi_numerator_bm = _gram(centered_af_bm)
phi_denominator_bm = _gram(std_dev_bm)
phi_bm = (phi_numerator_bm / phi_denominator_bm).persist()
phi_bm.shape

Compute values needed to estimate $\widehat{k_{ij}^{(0)}}$, $\widehat{k_{ij}^{(1)}}$, and $\widehat{k_{ij}^{(2)}}$:

In [None]:
# Create table w/ entries from phi_bm, will use to store k0, k1, k2 estimates
results_ht = phi_bm.entries().rename({"entry": "kin"})

# k0, k1 and k2 start out as missing float64 placeholders.
placeholder_fields = {
    field_name: hl.missing(hl.tfloat64) for field_name in ("k0", "k1", "k2")
}
results_ht = results_ht.annotate(**placeholder_fields)

In [None]:
# Create table w/ self-kinship (phi_ii) values
phi_diagonal_ht = phi_bm.diagonal().entries()
phi_ii_ht = (
    phi_diagonal_ht
    .key_by("j")
    .drop("i")
    .rename({"j": "idx", "entry": "phi_ii"})
)

# Annotate cols of bm_mt w/ self-kinship (phi_ii) and inbreeding coef (f_i)
self_kinship = phi_ii_ht[bm_mt.col_idx].phi_ii
bm_mt = bm_mt.annotate_cols(
    phi_ii=self_kinship,
    f_i=(2.0 * self_kinship) - 1.0,
)

In [None]:
# Create entries for dominance encoding of genotype matrix (gd and normalized_gd)
# gd is 0.0 where mu is NaN; otherwise it depends on g: 0 -> mu, 1 -> 0.0,
# 2 -> 1 - mu, and anything else -> NaN.
dominance_gd = (
    hl.case()
    .when(hl.is_nan(bm_mt.mu), 0.0)
    .when(bm_mt.g == 0.0, bm_mt.mu)
    .when(bm_mt.g == 1.0, 0.0)
    .when(bm_mt.g == 2.0, 1 - bm_mt.mu)
    .default(nan)
)
bm_mt = bm_mt.annotate_entries(gd=dominance_gd)

# Subtract variance * (1 + f_i) from gd.
gd_offset = bm_mt.variance * (1 + bm_mt.f_i)
bm_mt = bm_mt.annotate_entries(normalized_gd=bm_mt.gd - gd_offset)

Now compute $\widehat{k_{ij}^{(2)}}$:

In [None]:
# k2 is the element-wise ratio of the Gram matrices of normalized_gd and variance.
normalized_gd_bm = BlockMatrix.from_entry_expr(bm_mt.normalized_gd)
variance_bm = BlockMatrix.from_entry_expr(bm_mt.variance)
k2_numerator_bm = _gram(normalized_gd_bm)
k2_denominator_bm = _gram(variance_bm)
k2_bm = k2_numerator_bm / k2_denominator_bm

In [None]:
# Overwrite the k2 field by looking up each (i, j) pair in the entries table of k2_bm.
results_ht = results_ht.annotate(k2 = k2_bm.entries()[results_ht.i, results_ht.j].entry)
results_ht.show()

Finally we can compute $\widehat{k_{ij}^{(0)}}$, and then $\widehat{k_{ij}^{(1)}}$:

In [None]:
def _AtB_plus_BtA(A, B):
    temp = (A.T @ B).persist()
    return temp + temp.T

In [None]:
# Indicator entries: 1.0 where mu is not NaN and g is exactly 2.0 (hom_alt)
# or exactly 0.0 (hom_ref); 0.0 everywhere else.
mu_is_nan = hl.is_nan(bm_mt.mu)
bm_mt = bm_mt.annotate_entries(
    hom_alt=hl.if_else((mu_is_nan | (bm_mt.g != 2.0)), 0.0, 1.0),
    hom_ref=hl.if_else((mu_is_nan | (bm_mt.g != 0.0)), 0.0, 1.0),
)
hom_alt_bm = BlockMatrix.from_entry_expr(bm_mt.hom_alt)
hom_ref_bm = BlockMatrix.from_entry_expr(bm_mt.hom_ref)
ibs0_bm = _AtB_plus_BtA(hom_alt_bm, hom_ref_bm)

# k0 is ibs0 divided element-wise by the matching combination of mu^2 and (1 - mu)^2.
mu2_bm = BlockMatrix.from_entry_expr(bm_mt.mu2)
one_minus_mu2_bm = BlockMatrix.from_entry_expr(bm_mt.one_minus_mu2)
k0_denominator_bm = _AtB_plus_BtA(mu2_bm, one_minus_mu2_bm)
k0_bm = (ibs0_bm / k0_denominator_bm).persist()

# Overwrite the k0 field by looking up each (i, j) pair in the entries of k0_bm.
k0_entries_ht = k0_bm.entries()
results_ht = results_ht.annotate(k0=k0_entries_ht[results_ht.i, results_ht.j].entry)
results_ht.show()

In [None]:
# At or below this kinship value, k0 is recomputed from kin and k2 instead of
# keeping the ibs0-based value already stored in results_ht.k0.
_k0_cutoff = 2.0**(-5/2)
kin_at_or_below_cutoff = results_ht.kin <= _k0_cutoff
k0_from_kin_and_k2 = (1.0 - (4.0 * results_ht.kin) + results_ht.k2)
results_ht = results_ht.annotate(
    k0=hl.if_else(kin_at_or_below_cutoff, k0_from_kin_and_k2, results_ht.k0)
)

# k1 is whatever remains after k2 and k0 are taken from 1.
results_ht = results_ht.annotate(k1=1 - results_ht.k2 - results_ht.k0)
results_ht.show()

In [None]:
# Collect the column keys of mt into an array literal, so the integer i / j
# indices in results_ht can be translated into column keys.
collected_col_keys = mt.select_cols().key_cols_by().cols().collect()
col_keys = hl.literal(collected_col_keys, dtype=hl.tarray(mt.col_key.dtype))

# Re-key the results by the column keys found at positions i and j.
key_i = col_keys[hl.int32(results_ht.i)]
key_j = col_keys[hl.int32(results_ht.j)]
pc_rel_2 = results_ht.key_by(i=key_i, j=key_j)

In [None]:
# Join the baseline pc_relate table with the new results so both can be shown side by side.
compare_ht = pc_rel.join(pc_rel_2)
compare_ht.show()

In [None]:
# Without self_kinship count should match pc_relate count
pc_rel_2 = pc_rel_2.filter(pc_rel_2.i == pc_rel_2.j, keep=False)
# Halved before comparing; presumably pc_rel_2 holds both (i, j) and (j, i)
# because it comes from the full phi matrix -- confirm against pc_rel.
print(pc_rel_2.count() / 2)
# `pc_rel_count` was never assigned anywhere in this notebook, so compute it
# from the baseline table here instead of raising NameError.
pc_rel_count = pc_rel.count()
print(pc_rel_count)