# Load PCA/SM results for HGDP+1KG and UKB

In [None]:
from google.cloud import storage
import io
import numpy as np
import hail as hl

# Initialize Hail, giving the Spark driver 8 GB of memory.
hl.init(spark_conf={"spark.driver.memory": "8g"})

## HGDP+1KG

### Functions to load the HGDP+1KG PCA/SM results:
  - `load_hgdp_1kg_globals` returns a 3-tuple of ndarrays, like (eigenvalues, spectral moments, standard errors)
  - `load_hgdp_1kg_scores` returns an ndarray of shape (k, n) with PC scores
  - `get_hgdp_1kg_count` returns a 2-tuple of integers, like (variant count, sample count)

Valid args for `subset`:
  - `subset="hgdp_1kg"` for the combined HGDP+1KG subset (2733 unrelated samples)
  - `subset="1kg"` for the 1KG only subset (2159 unrelated samples)

Valid args for `parity`:
  - `parity="full"` for the full dataset (no split)
  - `parity="odd"` for the odd chromosomes split
  - `parity="even"` for the even chromosomes split

Valid args for `window_size`:
  - If `parity="full"`, `window_size` must be one of (0, 3, 10, 30, 100)*
  - If `parity="odd"`, `window_size` must be one of (0, 3, 10)
  - If `parity="even"`, `window_size` must be one of (0, 3, 10)
  
*We ran whitening with `window_size=30` and `window_size=100` on the full dataset, but those window sizes turned out to be too large. That's why we only used the smaller window sizes for the odd/even splits.

In [None]:
def _check_hgdp_1kg_args(subset, parity, window_size=None):
    """Validate the arguments shared by the HGDP+1KG loader functions.

    Args:
        subset: Dataset subset name; must be "1kg" or "hgdp_1kg".
        parity: Chromosome split; must be "full", "odd" or "even".
        window_size: Window size to validate, or None to skip this check.
            The allowed values depend on ``parity``.

    Raises:
        AssertionError: If any argument is not one of the allowed values.
            The loaders in this file catch it and re-raise it as ValueError.
    """
    # Raise explicitly instead of using ``assert`` so that validation is not
    # stripped when Python runs with -O.
    valid_subsets = ("1kg", "hgdp_1kg")
    if subset not in valid_subsets:
        raise AssertionError(f"subset must be one of {valid_subsets}.")
    valid_parities = ("full", "odd", "even")
    if parity not in valid_parities:
        raise AssertionError(f"parity must be one of {valid_parities}.")
    # Compare against None rather than relying on truthiness: 0 is a valid
    # window size, and other falsy values (e.g. "") must not bypass the check.
    if window_size is not None:
        valid_ws = {
            "full": (0, 3, 10, 30, 100),
            "odd": (0, 3, 10),
            "even": (0, 3, 10),
        }
        if window_size not in valid_ws[parity]:
            raise AssertionError(
                f"window_size must be one of {valid_ws[parity]}, when parity={parity}."
            )


def load_hgdp_1kg_globals(subset, parity, window_size):
    """Load eigenvalues, spectral moments and standard errors for HGDP+1KG.

    Args:
        subset: "hgdp_1kg" or "1kg".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the table's file name.

    Returns:
        A 3-tuple of ndarrays built from the ``eigenvalues``,
        ``spectral_moments`` and ``standard_errors`` fields of the table.

    Raises:
        ValueError: If the arguments are rejected by ``_check_hgdp_1kg_args``.
    """
    try:
        _check_hgdp_1kg_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    # Each subset lives in its own folder under the shared bucket prefix.
    if subset == "1kg":
        folder = "1kg-2159-unrelated-samples"
    elif subset == "hgdp_1kg":
        folder = "1kg_hgdp-2733-unrelated-samples"
    gcs_prefix = f"1kg-hgdp-data/3412-samples/{folder}/pca-sm-whitened-02"
    table_path = f"gs://{gcs_prefix}/{parity}-scores-ws{window_size}-k50-MAF_0.01-r2_0.1.ht"
    ht = hl.read_table(table_path)

    field_names = ("eigenvalues", "spectral_moments", "standard_errors")
    return tuple(np.array(hl.eval(getattr(ht, name))) for name in field_names)


def load_hgdp_1kg_scores(subset, parity, window_size):
    """Load the PC scores for an HGDP+1KG subset as an ndarray.

    Args:
        subset: "hgdp_1kg" or "1kg".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the table's file name.

    Returns:
        The collected ``scores`` field of the table, converted to an ndarray
        and transposed.

    Raises:
        ValueError: If the arguments are rejected by ``_check_hgdp_1kg_args``.
    """
    try:
        _check_hgdp_1kg_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    if subset == "1kg":
        folder = "1kg-2159-unrelated-samples"
    elif subset == "hgdp_1kg":
        folder = "1kg_hgdp-2733-unrelated-samples"
    gcs_prefix = f"1kg-hgdp-data/3412-samples/{folder}/pca-sm-whitened-02"
    ht = hl.read_table(f"gs://{gcs_prefix}/{parity}-scores-ws{window_size}-k50-MAF_0.01-r2_0.1.ht")

    # One collected entry per table row; transpose so rows become columns.
    per_row_scores = ht.scores.collect()
    return np.array(per_row_scores).T


def get_hgdp_1kg_count(subset, parity):
    """Return the (variant count, sample count) for an HGDP+1KG subset.

    Args:
        subset: "hgdp_1kg" or "1kg".
        parity: "full", "odd" or "even".

    Returns:
        A 2-tuple ``(m_variants, n_samples)`` looked up from a hard-coded table.

    Raises:
        ValueError: If the arguments are rejected by ``_check_hgdp_1kg_args``.
    """
    try:
        _check_hgdp_1kg_args(subset, parity)
    except AssertionError as exc:
        raise ValueError(exc) from None

    # "n" is the sample count; "m" maps each parity to its variant count.
    counts = {
        "hgdp_1kg": {
            "n": 2733,
            "m": {"full": 524351, "odd": 265079, "even": 259272},
        },
        "1kg": {
            "n": 2159,
            "m": {"full": 520108, "odd": 263215, "even": 256893},
        },
    }
    subset_counts = counts[subset]
    return subset_counts["m"][parity], subset_counts["n"]

In [None]:
def compute_crosscorr(nd1, nd2, k):
    """Return the squared singular values of the cross-correlation of two score sets.

    Args:
        nd1: Array-like with one row per component (e.g. PC scores).
        nd2: Array-like with one row per component and the same number of
            columns as ``nd1``.
        k: Number of leading rows of each input to use.

    Returns:
        1-D ndarray with the squared singular values of the block of
        correlations between the rows of ``nd1`` and the rows of ``nd2``.
    """
    # Keep only the leading k rows of each input up front, so the result is
    # correct even when an input holds more than k rows.
    top1 = np.atleast_2d(nd1)[:k]
    top2 = np.atleast_2d(nd2)[:k]
    n_rows1 = top1.shape[0]
    # np.corrcoef stacks the rows of both inputs; the off-diagonal (cross)
    # block therefore starts at column n_rows1, which equals k only when the
    # first input has exactly k rows.
    cross_block = np.corrcoef(top1, top2)[:n_rows1, n_rows1:]
    singular_values = np.linalg.svd(cross_block, compute_uv=False)
    return singular_values**2

#### Write out a Table with sample IDs and superpopulations to use in `load_1kg_scores_and_superpops` function:

In [None]:
# Set to True to force the samples table checkpoint to be rewritten.
overwrite = False

# Load one 1KG scores table, plus the unkeyed table of 1KG sample IDs/superpops/pops/etc.
scores_ht = hl.read_table("gs://1kg-hgdp-data/3412-samples/1kg-2159-unrelated-samples/pca-sm-whitened-02/full-scores-ws0-k50-MAF_0.01-r2_0.1.ht")
samples_ht = hl.read_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht").key_by()

# Three sample IDs carry a letter suffix in the HGDP+1KG dataset; rename them
# here so the superpopulation lookup by sample ID can match.
s_to_rename = hl.set({"NA12546", "NA12830", "NA18874"})
s_mapping = hl.dict({"NA12546": "NA12546B", "NA12830": "NA12830A", "NA18874": "NA18874A"})
sample_id = samples_ht.SampleID
renamed_id = hl.if_else(s_to_rename.contains(sample_id), s_mapping[sample_id], sample_id)
samples_ht = samples_ht.annotate(s=renamed_id)

# Drop the metadata field, key by the (possibly renamed) ID and keep all remaining value fields.
samples_ht = samples_ht.drop("metadata").key_by("s")
samples_ht = samples_ht.select(*samples_ht.row_value)

In [None]:
# Write/read new table with 1KG sample IDs/superpops/pops/etc.
# `_read_if_exists` is the negation of `overwrite`; presumably this makes Hail
# reuse an existing table at this path instead of rewriting it — confirm
# against the Hail version in use.
samples_ht = samples_ht.checkpoint(
    "gs://1kg/NYGC-30x-unphased/samples_and_populations.ht",
    overwrite=overwrite,
    _read_if_exists=not overwrite
)
# Print the schema and a preview of the checkpointed table.
samples_ht.describe()
samples_ht.show()

In [None]:
def load_1kg_scores_and_superpops(parity, window_size):
    """Load 1KG PC scores together with per-sample population labels.

    Args:
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the scores table's file name.

    Returns:
        A 3-tuple ``(scores, superpops, pops)``: the transposed array of
        per-row scores, and arrays of the ``Superpopulation`` and
        ``Population`` values joined from the samples table by sample ID.

    Raises:
        ValueError: If the arguments are rejected by ``_check_hgdp_1kg_args``.
    """
    subset = "1kg"
    try:
        _check_hgdp_1kg_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    folder = "1kg-2159-unrelated-samples"
    gcs_prefix = f"1kg-hgdp-data/3412-samples/{folder}/pca-sm-whitened-02"
    samples_ht = hl.read_table("gs://1kg/NYGC-30x-unphased/samples_and_populations.ht")
    scores_ht = hl.read_table(f"gs://{gcs_prefix}/{parity}-scores-ws{window_size}-k50-MAF_0.01-r2_0.1.ht")

    # Look up each scored sample in the samples table and attach its labels.
    sample_info = samples_ht[scores_ht.s]
    scores_ht = scores_ht.annotate(
        superpop=sample_info.Superpopulation,
        pop=sample_info.Population,
    )

    # Collect once so the three returned arrays share the same row order.
    rows = scores_ht.row_value.collect()
    scores = np.array([row.scores for row in rows]).T
    superpops = np.array([row.superpop for row in rows])
    pops = np.array([row.pop for row in rows])
    return scores, superpops, pops

In [None]:
# Load the full-dataset 1KG scores and labels for window_size=0, then display them.
scores0, superpop0, pop0 = load_1kg_scores_and_superpops(parity="full", window_size=0)
scores0, superpop0, pop0

In [None]:
# Load the full-dataset 1KG scores and labels for window_size=3, then display them.
scores3, superpop3, pop3 = load_1kg_scores_and_superpops(parity="full", window_size=3)
scores3, superpop3, pop3

In [None]:
# Load the full-dataset 1KG scores and labels for window_size=10, then display them.
scores10, superpop10, pop10 = load_1kg_scores_and_superpops(parity="full", window_size=10)
scores10, superpop10, pop10

In [None]:
# Check that the superpopulation labels match element by element across the three window sizes.
np.all(superpop0 == superpop3) and np.all(superpop0 == superpop10) and np.all(superpop3 == superpop10)

In [None]:
# Load and print the scores and labels for every combination of parity and
# window size in (0, 3, 10), with a blank line after each parity.
for p in ['full', 'odd', 'even']:
    for w in [0, 3, 10]:
        print(f"parity = {p}, w = {w}:")
        print(load_1kg_scores_and_superpops(p, w))
    print()

### HGDP+1KG example:

In [None]:
# Choose which HGDP+1KG results to load.
subset = "hgdp_1kg"
parity = "full"
window_size = 0

# Load the global results, the PC scores and the variant/sample counts.
eigval, sm, stderr = load_hgdp_1kg_globals(subset, parity, window_size)
scores = load_hgdp_1kg_scores(subset, parity, window_size)
m_variants, n_samples = get_hgdp_1kg_count(subset, parity)

# Print everything that was loaded.
print(f"subset = {subset}, parity = {parity}, window_size = {window_size}")
print(f"variant count, sample count = {(m_variants, n_samples)}")
print(f"PC scores =\n{scores}")
print(f"eigenvalues =\n{eigval}")
print(f"spectral moments =\n{sm}")
print(f"std. errors =\n{stderr}")

### HGDP+1KG odd/even scores cross-correlation example:

In [None]:
# Choose the subset and window size to compare across the odd/even splits.
subset = "hgdp_1kg"
window_size = 0

# Load the scores for each split and compute their cross-correlation;
# 50 matches the "k50" in the HGDP+1KG table file names.
odd_scores = load_hgdp_1kg_scores(subset, "odd", window_size)
even_scores = load_hgdp_1kg_scores(subset, "even", window_size)
cross_corr = compute_crosscorr(even_scores, odd_scores, 50)
print(f"odd/even PC scores cross-correlation =\n{cross_corr}")

## UKB

### Functions to load the UKB PCA/SM results:
  - `load_ukb_globals` returns a 3-tuple of ndarrays, like (eigenvalues, spectral moments, standard errors)
  - `load_ukb_scores` returns an ndarray of shape (k, n) with PC scores
  - `get_ukb_count` returns a 2-tuple of integers, like (variant count, sample count)

Valid args for `subset`:
  - `subset="wb"` for the UKB White British subset (337,111 unrelated samples)
  - `subset="pan"` for the UKB Pan-ancestry subset (406,696 unrelated samples)

Valid args for `parity`:
  - `parity="full"` for the full dataset (no split)
  - `parity="odd"` for the odd chromosomes split
  - `parity="even"` for the even chromosomes split

Valid args for `window_size`:
  - `window_size` must be one of (0, 30, 100, 300)

In [None]:
def _check_ukb_args(subset, parity, window_size=None):
    """Validate the arguments shared by the UKB loader functions.

    Args:
        subset: Dataset subset name; must be "wb" or "pan".
        parity: Chromosome split; must be "full", "odd" or "even".
        window_size: Window size to validate, or None to skip this check.

    Raises:
        AssertionError: If any argument is not one of the allowed values.
            The loaders in this file catch it and re-raise it as ValueError.
    """
    # Raise explicitly instead of using ``assert`` so that validation is not
    # stripped when Python runs with -O.
    valid_subsets = ("wb", "pan")
    if subset not in valid_subsets:
        raise AssertionError(f"subset must be one of {valid_subsets}.")
    valid_parities = ("full", "odd", "even")
    if parity not in valid_parities:
        raise AssertionError(f"parity must be one of {valid_parities}.")
    # Compare against None rather than relying on truthiness: 0 is a valid
    # window size, and other falsy values (e.g. "") must not bypass the check.
    if window_size is not None:
        valid_ws = (0, 30, 100, 300)
        if window_size not in valid_ws:
            raise AssertionError(f"window_size must be one of {valid_ws}.")


def load_ukb_globals(subset, parity, window_size):
    """Load eigenvalues, spectral moments and standard errors for UKB.

    Args:
        subset: "wb" or "pan".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the table's file name.

    Returns:
        A 3-tuple of ndarrays built from the ``eigenvalues``,
        ``spectral_moments`` and ``standard_errors`` fields of the table.

    Raises:
        ValueError: If the arguments are rejected by ``_check_ukb_args``.
    """
    try:
        _check_ukb_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    # Each subset lives in a folder named after its sample count.
    if subset == "pan":
        samples = "406696-samples"
    elif subset == "wb":
        samples = "337111-samples"
    gcs_prefix = f"gs://ukb-data/genotypes/{samples}/pca-sm-whitened-02"
    table_path = f"{gcs_prefix}/{parity}-scores-ws{window_size}-k100.ht"
    ht = hl.read_table(table_path)

    field_names = ("eigenvalues", "spectral_moments", "standard_errors")
    return tuple(np.array(hl.eval(getattr(ht, name))) for name in field_names)


def load_ukb_scores(subset, parity, window_size):
    """Download the UKB PC scores ``.npy`` file and load it as an ndarray.

    Args:
        subset: "wb" or "pan".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the blob's file name.

    Returns:
        The ndarray stored in the downloaded ``.npy`` blob.

    Raises:
        ValueError: If the arguments are rejected by ``_check_ukb_args``.
    """
    try:
        _check_ukb_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    if subset == "pan":
        samples = "406696-samples"
    elif subset == "wb":
        samples = "337111-samples"

    client = storage.Client()
    ukb_bucket = client.get_bucket("ukb-data")
    blob_prefix = f"genotypes/{samples}/pca-sm-whitened-02"
    scores_blob = ukb_bucket.blob(f"{blob_prefix}/{parity}-scores-ws{window_size}-k100.npy")

    # Download into memory, then rewind so np.load reads from the start.
    with io.BytesIO() as buffer:
        scores_blob.download_to_file(buffer)
        buffer.seek(0)
        return np.load(buffer)


def get_ukb_count(subset, parity):
    """Return the (variant count, sample count) for a UKB subset.

    Args:
        subset: "wb" or "pan".
        parity: "full", "odd" or "even".

    Returns:
        A 2-tuple ``(m_variants, n_samples)`` looked up from a hard-coded
        table; the variant count depends only on ``parity`` and the sample
        count only on ``subset``.

    Raises:
        ValueError: If the arguments are rejected by ``_check_ukb_args``.
    """
    try:
        _check_ukb_args(subset, parity)
    except AssertionError as exc:
        raise ValueError(exc) from None

    counts = {
        "n": {"wb": 337111, "pan": 406696},
        "m": {"full": 147604, "odd": 74651, "even": 72953},
    }
    return counts["m"][parity], counts["n"][subset]

### UKB example:

In [None]:
# Choose which UKB results to load.
subset = "wb"
parity = "full"
window_size = 0

# Load the global results, the PC scores and the variant/sample counts.
eigval, sm, stderr = load_ukb_globals(subset, parity, window_size)
scores = load_ukb_scores(subset, parity, window_size)
m_variants, n_samples = get_ukb_count(subset, parity)

# Print everything that was loaded.
print(f"subset = {subset}, parity = {parity}, window_size = {window_size}")
print(f"variant count, sample count = {(m_variants, n_samples)}")
print(f"PC scores =\n{scores}")
print(f"eigenvalues =\n{eigval}")
print(f"spectral moments =\n{sm}")
print(f"std. errors =\n{stderr}")
print()

### UKB odd/even scores cross-correlation example:

In [None]:
# Choose the subset and window size to compare across the odd/even splits.
subset = "wb"
window_size = 0

# Load the scores for each split and compute their cross-correlation;
# 100 matches the "k100" in the UKB file names.
odd_scores = load_ukb_scores(subset, "odd", window_size)
even_scores = load_ukb_scores(subset, "even", window_size)
cross_corr = compute_crosscorr(even_scores, odd_scores, 100)
print(f"odd/even PC scores cross-correlation =\n{cross_corr}")

# Load PCA/SM results for 1KG (NYGC 30x) - 2504/2030 sample sets

This section loads the PCA/SM results for the 1KG (NYGC 30x) data that we initially started with (the 2504/2030 sample sets), for validation.

## 1KG (NYGC 30x)

### Functions to load the 1KG (NYGC 30x) PCA/SM results:
  - `load_1kg_globals` returns a 3-tuple of ndarrays, like (eigenvalues, spectral moments, standard errors)
  - `load_1kg_scores` returns an ndarray of shape (k, n) with PC scores
  - `get_1kg_count` returns a 2-tuple of integers, like (variant count, sample count)

Valid args for `subset`:
  - `subset="2504-samples"` for the subset from 1KG phase 3 (2504 unrelated samples)
  - `subset="2030-samples"` for the subset from TGP2261 (Gazal et al.) pruned for relatedness (2030 unrelated samples)

Valid args for `parity`:
  - `parity="full"` for the full dataset (no split)
  - `parity="odd"` for the odd chromosomes split
  - `parity="even"` for the even chromosomes split

Valid args for `window_size`:
  - `window_size` must be one of (0, 10, 50, 100)

In [None]:
def _check_1kg_args(subset, parity, window_size=None):
    """Validate the arguments shared by the 1KG (NYGC 30x) loader functions.

    Args:
        subset: Dataset subset name; must be "2504-samples" or "2030-samples".
        parity: Chromosome split; must be "full", "odd" or "even".
        window_size: Window size to validate, or None to skip this check.

    Raises:
        AssertionError: If any argument is not one of the allowed values.
            The loaders in this file catch it and re-raise it as ValueError.
    """
    # Raise explicitly instead of using ``assert`` so that validation is not
    # stripped when Python runs with -O.
    valid_subsets = ("2504-samples", "2030-samples")
    if subset not in valid_subsets:
        raise AssertionError(f"subset must be one of {valid_subsets}.")
    valid_parities = ("full", "odd", "even")
    if parity not in valid_parities:
        raise AssertionError(f"parity must be one of {valid_parities}.")
    # Compare against None rather than relying on truthiness: 0 is a valid
    # window size, and other falsy values (e.g. "") must not bypass the check.
    if window_size is not None:
        valid_ws = (0, 10, 50, 100)
        if window_size not in valid_ws:
            raise AssertionError(f"window_size must be one of {valid_ws}.")


def load_1kg_globals(subset, parity, window_size):
    """Load eigenvalues, spectral moments and standard errors for 1KG (NYGC 30x).

    Args:
        subset: "2504-samples" or "2030-samples".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the table's file name.

    Returns:
        A 3-tuple of ndarrays built from the ``eigenvalues``,
        ``spectral_moments`` and ``standard_errors`` fields of the table.

    Raises:
        ValueError: If the arguments are rejected by ``_check_1kg_args``.
    """
    try:
        _check_1kg_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    # The 2030-sample folder carries an extra suffix; the 2504 one does not.
    if subset == "2030-samples":
        folder = f"{subset}-gazal-KING"
    elif subset == "2504-samples":
        folder = subset
    gcs_prefix = f"1kg/NYGC-30x-unphased/{folder}/pca-sm-whitened-02"
    table_path = f"gs://{gcs_prefix}/{parity}-scores-ws{window_size}-k50-MAF_0.01-r2_0.1.ht"
    ht = hl.read_table(table_path)

    field_names = ("eigenvalues", "spectral_moments", "standard_errors")
    return tuple(np.array(hl.eval(getattr(ht, name))) for name in field_names)


def load_1kg_scores(subset, parity, window_size):
    """Load the PC scores for a 1KG (NYGC 30x) subset as an ndarray.

    Args:
        subset: "2504-samples" or "2030-samples".
        parity: "full", "odd" or "even".
        window_size: Window size that appears in the table's file name.

    Returns:
        The collected ``scores`` field of the table, converted to an ndarray
        and transposed.

    Raises:
        ValueError: If the arguments are rejected by ``_check_1kg_args``.
    """
    try:
        _check_1kg_args(subset, parity, window_size)
    except AssertionError as exc:
        raise ValueError(exc) from None

    if subset == "2030-samples":
        folder = f"{subset}-gazal-KING"
    elif subset == "2504-samples":
        folder = subset
    gcs_prefix = f"1kg/NYGC-30x-unphased/{folder}/pca-sm-whitened-02"
    ht = hl.read_table(f"gs://{gcs_prefix}/{parity}-scores-ws{window_size}-k50-MAF_0.01-r2_0.1.ht")

    # One collected entry per table row; transpose so rows become columns.
    per_row_scores = ht.scores.collect()
    return np.array(per_row_scores).T


def get_1kg_count(subset, parity):
    """Return the (variant count, sample count) for a 1KG (NYGC 30x) subset.

    Args:
        subset: "2504-samples" or "2030-samples".
        parity: "full", "odd" or "even".

    Returns:
        A 2-tuple ``(m_variants, n_samples)`` looked up from a hard-coded table.

    Raises:
        ValueError: If the arguments are rejected by ``_check_1kg_args``.
    """
    try:
        _check_1kg_args(subset, parity)
    except AssertionError as exc:
        raise ValueError(exc) from None

    # "n" is the sample count; "m" maps each parity to its variant count.
    counts = {
        "2504-samples": {
            "n": 2504,
            "m": {"full": 656597, "odd": 333961, "even": 322636},
        },
        "2030-samples": {
            "n": 2030,
            "m": {"full": 665750, "odd": 338750, "even": 327000},
        },
    }
    subset_counts = counts[subset]
    return subset_counts["m"][parity], subset_counts["n"]

### 1KG (NYGC 30x) example:

In [None]:
# Choose which 1KG (NYGC 30x) results to load.
subset = "2504-samples"
parity = "full"
window_size = 0

# Load the global results, the PC scores and the variant/sample counts.
eigval, sm, stderr = load_1kg_globals(subset, parity, window_size)
scores = load_1kg_scores(subset, parity, window_size)
m_variants, n_samples = get_1kg_count(subset, parity)

# Print everything that was loaded.
print(f"subset = {subset}, parity = {parity}, window_size = {window_size}")
print(f"variant count, sample count = {(m_variants, n_samples)}")
print(f"PC scores =\n{scores}")
print(f"eigenvalues =\n{eigval}")
print(f"spectral moments =\n{sm}")
print(f"std. errors =\n{stderr}")

### 1KG (NYGC 30x) odd/even scores cross-correlation example:

In [None]:
# Choose the subset and window size to compare across the odd/even splits.
subset = "2504-samples"
# Set window_size here (as the other cross-correlation cells do) so this cell
# does not depend on a value left behind by an earlier cell.
window_size = 0

# Load the scores for each split and compute their cross-correlation;
# 50 matches the "k50" in the 1KG table file names.
odd_scores = load_1kg_scores(subset, "odd", window_size)
even_scores = load_1kg_scores(subset, "even", window_size)
cross_corr = compute_crosscorr(even_scores, odd_scores, 50)
print(f"odd/even PC scores cross-correlation =\n{cross_corr}")