# Example tasks using vcfzarr on S3

In [1]:
!pip install zarr s3fs tqdm pandas numba



In [6]:
import zarr
import pandas as pd
import numpy as np
import dataclasses
import numba
import zarr
import numcodecs
import s3fs

# Restrict the Blosc codec in numcodecs to a single thread.
# NOTE(review): presumably done so the timings below are single-threaded — confirm.
numcodecs.blosc.set_nthreads(1)

1

In [7]:
# Root path of the zarr store on S3; passed to load_s3() / zarr_afdist() below.
S3PATH = "lifebit-user-data-1f2bfdf2-1d99-488c-9b87-246c62b66ea7/GEL-256534/"
# Inclusive position bounds of the region of interest; the task cells look
# them up in `variant_position` with np.searchsorted.
REGION_START, REGION_END = 58219159, 60650943

In [8]:
def load_s3(path):
    """Open the zarr hierarchy rooted at ``path`` on S3, read-only.

    Args:
        path: Bucket/key prefix of the zarr store.

    Returns:
        Whatever ``zarr.open`` returns for the store, opened with ``mode="r"``.
    """
    # anon=False: connect with credentials rather than anonymously.
    filesystem = s3fs.S3FileSystem(anon=False)
    return zarr.open(s3fs.S3Map(root=path, s3=filesystem), mode="r")

## Task 1 - Dump POS

In [9]:
%%time
# Pull the complete position array into memory.
root = load_s3(S3PATH)
pos = root["variant_position"][:]
# Locate the index range covering [REGION_START, REGION_END] (both inclusive).
# np.searchsorted requires `pos` to be sorted ascending.
# NOTE(review): assumes the store holds positions in sorted order — confirm.
start_index = np.searchsorted(pos, REGION_START, side="left")
end_index = np.searchsorted(pos, REGION_END, side="right")
region_pos = pos[start_index:end_index]
# Write the region's positions out as CSV (pandas adds an index column and header).
df = pd.DataFrame(region_pos)
df.to_csv("pos_result_vcfzarr.txt")

CPU times: user 15 s, sys: 1.06 s, total: 16.1 s
Wall time: 23.8 s


In [10]:
# Total number of positions loaded vs. the number that fall inside the region.
len(pos), len(region_pos)

(59880903, 562640)

## Task 1b - Dump POS to RAM only

In [12]:
%%time
# Same lookup as Task 1, but the result is kept in memory only (no CSV output).
root = load_s3(S3PATH)
pos = root["variant_position"][:]
# Index range covering [REGION_START, REGION_END] (both inclusive);
# np.searchsorted requires `pos` to be sorted ascending.
start_index = np.searchsorted(pos, REGION_START, side="left")
end_index = np.searchsorted(pos, REGION_END, side="right")
region_pos = pos[start_index:end_index]

CPU times: user 13.9 s, sys: 733 ms, total: 14.6 s
Wall time: 18.2 s


## Task 2 - afdist

In [13]:
@dataclasses.dataclass
class GenotypeCounts:
    """Per-variant genotype tallies returned by classify_genotypes_subset.

    Every field is a 1-D NumPy int32 array with one entry per selected
    variant, counting over the selected samples only.
    """

    # Calls where both alleles are 0.
    hom_ref: np.ndarray
    # Calls where both alleles are equal and non-zero.
    hom_alt: np.ndarray
    # Calls where the two alleles differ.
    het: np.ndarray
    # Total number of alleles equal to 0.
    ref_count: np.ndarray

@numba.njit(
    "void(int64, int8[:,:,:], b1[:], b1[:], int32[:], int32[:], int32[:], int32[:])"
)
def count_genotypes_chunk_subset(
    offset, G, variant_mask, sample_mask, hom_ref, hom_alt, het, ref_count
):
    """Accumulate genotype counts for one chunk of calls into the output arrays.

    Only rows of ``G`` flagged in ``variant_mask`` and columns flagged in
    ``sample_mask`` are counted. The n-th selected variant of this chunk is
    accumulated at output position ``offset + n``; the output arrays are
    updated in place and nothing is returned.

    NB: assumes diploid calls and no missing data.
    """
    out = offset
    for variant in range(G.shape[0]):
        if not variant_mask[variant]:
            continue
        for sample in range(G.shape[1]):
            if not sample_mask[sample]:
                continue
            a = G[variant, sample, 0]
            b = G[variant, sample, 1]
            if a != b:
                het[out] += 1
            elif a == 0:
                hom_ref[out] += 1
            else:
                hom_alt[out] += 1
            ref_count[out] += (a == 0) + (b == 0)
        # Move to the next output slot only after a selected variant.
        out += 1

def classify_genotypes_subset(call_genotype, variant_mask, sample_mask):
    """Count genotype classes for the selected variants and samples.

    Args:
        call_genotype: Chunked genotype array exposing ``chunks``,
            ``cdata_shape`` and ``blocks``; the first axis is indexed by
            variant chunk and the second by sample chunk.
        variant_mask: Boolean array flagging the variants to include.
        sample_mask: Boolean array flagging the samples to include.

    Returns:
        GenotypeCounts whose arrays have one entry per selected variant.
    """
    m = np.sum(variant_mask)

    # Use zarr arrays to get mask chunks aligned with the main data
    # for convenience.
    z_variant_mask = zarr.array(variant_mask, chunks=call_genotype.chunks[0])
    z_sample_mask = zarr.array(sample_mask, chunks=call_genotype.chunks[1])

    het = np.zeros(m, dtype=np.int32)
    hom_alt = np.zeros(m, dtype=np.int32)
    hom_ref = np.zeros(m, dtype=np.int32)
    ref_count = np.zeros(m, dtype=np.int32)

    # The sample-mask chunks do not depend on the variant chunk, so read them
    # once up front and keep only those that select at least one sample.
    selected_sample_chunks = []
    for s_chunk in range(call_genotype.cdata_shape[1]):
        sample_mask_chunk = z_sample_mask.blocks[s_chunk]
        if np.sum(sample_mask_chunk) > 0:
            selected_sample_chunks.append((s_chunk, sample_mask_chunk))

    # j is the output position of the first selected variant in the current chunk.
    j = 0
    # We should probably skip to the first non-zero chunk, but there probably
    # isn't much difference unless we have a huge number of chunks, and we're
    # only selecting a tiny subset
    for v_chunk in range(call_genotype.cdata_shape[0]):
        variant_mask_chunk = z_variant_mask.blocks[v_chunk]
        count = np.sum(variant_mask_chunk)
        if count > 0:
            for s_chunk, sample_mask_chunk in selected_sample_chunks:
                G = call_genotype.blocks[v_chunk, s_chunk]
                count_genotypes_chunk_subset(
                    j,
                    G,
                    variant_mask_chunk,
                    sample_mask_chunk,
                    hom_ref,
                    hom_alt,
                    het,
                    ref_count,
                )
            j += count
    return GenotypeCounts(hom_ref, hom_alt, het, ref_count)
    
def zarr_afdist(path, num_bins=10, variant_slice=None, sample_slice=None):
    """Compute a binned allele-frequency distribution from a zarr store on S3.

    Args:
        path: Store path handed to load_s3.
        num_bins: Number of equal-width bins over [0, 1].
        variant_slice: Index selecting the variants to include. ``None``
            selects every variant.
        sample_slice: Index selecting the samples to include. ``None``
            selects every sample.

    Returns:
        DataFrame with columns ``start``, ``stop`` and ``prob_dist``, one row
        per bin.
    """
    call_genotype = load_s3(path)["call_genotype"]
    num_variants, num_samples = call_genotype.shape[0], call_genotype.shape[1]

    # Indexing with None addresses the whole array, so a None slice selects all.
    variant_mask = np.zeros(num_variants, dtype=bool)
    variant_mask[variant_slice] = True
    sample_mask = np.zeros(num_samples, dtype=bool)
    sample_mask[sample_slice] = True

    counts = classify_genotypes_subset(call_genotype, variant_mask, sample_mask)

    # Two alleles per selected sample (diploid assumption).
    total_alleles = 2 * np.sum(sample_mask)
    alt_freq = (total_alleles - counts.ref_count) / total_alleles

    edges = np.linspace(0, 1.0, num_bins + 1)
    # Widen the last bin so a value of exactly 1.0 still lands inside it.
    edges[-1] += 0.0125

    het_bin = np.digitize(2 * alt_freq * (1 - alt_freq), edges)
    hom_alt_bin = np.digitize(alt_freq * alt_freq, edges)
    het_hist = np.bincount(het_bin, weights=counts.het, minlength=num_bins + 1)
    hom_alt_hist = np.bincount(
        hom_alt_bin, weights=counts.hom_alt, minlength=num_bins + 1
    )
    totals = (het_hist + hom_alt_hist).astype(int)

    # np.digitize is 1-based for in-range values, so entry 0 is dropped.
    return pd.DataFrame(
        {"start": edges[:-1], "stop": edges[1:], "prob_dist": totals[1:]}
    )

In [14]:
%%time
# Allele-frequency distribution over the region's variants and all samples
# (sample_slice is left at its default). Reuses start_index / end_index
# computed in the Task 1 cells.
df = zarr_afdist(S3PATH, num_bins=10, variant_slice=slice(start_index, end_index))
df

CPU times: user 2min 42s, sys: 1.59 s, total: 2min 43s
Wall time: 5min 14s


Unnamed: 0,start,stop,prob_dist
0,0.0,0.1,286405469
1,0.1,0.2,137172734
2,0.2,0.3,136385315
3,0.3,0.4,158273300
4,0.4,0.5,325497447
5,0.5,0.6,42187173
6,0.6,0.7,44968576
7,0.7,0.8,37326793
8,0.8,0.9,34890232
9,0.9,1.0125,44520767


## Task 3 - filtering on FORMAT fields

In [18]:
%%time
# Open the input store on S3 and a local zarr group for the result.
root = load_s3(S3PATH)
root_out = zarr.open("out.zarr")
DP = root["call_DP"]
GQ = root["call_GQ"]
num_variants, num_samples = DP.shape[0], DP.shape[1]

# Flag the variants in the region (start_index / end_index come from Task 1).
variant_mask = np.zeros(num_variants, dtype=bool)
variant_mask[start_index:end_index] = True
# Chunk both masks like DP's variant axis so mask block i lines up with
# block row i of DP.
z_variant_mask = zarr.array(variant_mask, chunks=DP.chunks[0])
output_mask = np.zeros(num_variants, dtype=bool)
z_output_mask = zarr.array(output_mask, chunks=DP.chunks[0])

for v_chunk in range(DP.cdata_shape[0]):
    variant_mask_chunk = z_variant_mask.blocks[v_chunk]
    count = np.sum(variant_mask_chunk)
    if count == 0:
        # No variant of interest in this chunk: skip fetching DP/GQ.
        continue
    output_mask_chunk = np.zeros_like(variant_mask_chunk)
    for s_chunk in range(DP.cdata_shape[1]):
        # NOTE(review): assumes GQ is chunked identically to DP — confirm.
        passes = (DP.blocks[v_chunk, s_chunk] > 10) & (GQ.blocks[v_chunk, s_chunk] > 20)
        # A variant passes if any sample in this chunk has DP > 10 and GQ > 20.
        output_mask_chunk = output_mask_chunk | passes.any(axis=1)
    z_output_mask.blocks[v_chunk] = output_mask_chunk & variant_mask_chunk
root_out["variant_composite_filter"] = z_output_mask

CPU times: user 3min 3s, sys: 9.88 s, total: 3min 12s
Wall time: 11min 9s
