## Download VCFs

In [1]:
# Create the download directory (-p: no error if it already exists, -v: report each directory created).
!mkdir -pv data/vcf

In [2]:
%%bash
# Fetch the 1000 Genomes release 20130502 inputs used by this notebook into
# data/vcf: one genotype VCF per autosome, plus the sample panel.
# --no-clobber makes the cell safe to re-run: files already present are skipped.
base_url=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502

# Stop if the target directory is missing instead of downloading into the
# current working directory.
cd data/vcf || exit 1

for c in {1..22}
do
    wget --no-clobber "${base_url}/ALL.chr${c}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
done

# Sample panel read in the "Add sample data" section.
wget --no-clobber "${base_url}/integrated_call_samples_v3.20130502.ALL.panel"

File ‘ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retrieving.
File ‘ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz’ already there; not retr

## Find max number of ALT alleles 

In [3]:
import sys
import numpy as np
import dask
from dask import delayed
from dask import distributed
import allel
allel.__version__

'1.2.1'

In [4]:
# Start a local dask.distributed cluster with 4 workers, one thread each,
# and connect a client to it.
cluster = distributed.LocalCluster(n_workers=4, threads_per_worker=1)
client = distributed.Client(cluster)
# Last expression of the cell: display the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:41709  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 33.35 GB


In [5]:
def find_max_alt(chrom):
    """Return the largest ``variants/numalt`` value in one chromosome's VCF.

    ``chrom`` is substituted into the per-chromosome file name under
    ``data/vcf``; only the ``variants/numalt`` field is read from the file.
    """
    # Imported in the function body rather than taken from the notebook globals.
    import allel

    filename_template = 'data/vcf/ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
    numalt_field = 'variants/numalt'
    loaded = allel.read_vcf(filename_template.format(chrom), fields=[numalt_field])
    return loaded[numalt_field].max()

In [6]:
# results = [delayed(find_max_alt)(chrom) for chrom in range(1, 23)]

In [7]:
# max_alt = dask.compute(results)

In [8]:
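# max_alt
# Recorded from an earlier run, one value per chromosome (1-22, in order).
# The largest value, 12, is the alt_number used in "Parse VCF to Zarr" below.
# [12, 7, 8, 6, 7, 9, 8, 8, 7, 7, 9, 6, 7, 7, 8, 7, 6, 7, 7, 6, 6, 8]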

## Parse VCF to Zarr

In [18]:
def parse_vcf_to_zarr(chrom, alt_number=12):
    """Convert one chromosome's VCF into a group of the shared Zarr store.

    The conversion is resumable: a group is flagged with a ``complete``
    attribute only after ``allel.vcf_to_zarr`` returns, so a finished
    chromosome is skipped on re-run and a partially written one is deleted
    and converted again.

    Parameters
    ----------
    chrom : int or str
        Chromosome label; substituted into the VCF file name and, as a
        string, used as the name of the Zarr group.
    alt_number : int, optional
        Passed through to ``allel.vcf_to_zarr``. The default of 12 is the
        largest value in the ``max_alt`` result recorded earlier in this
        notebook.

    Returns
    -------
    None
    """
    # All dependencies, including sys, are imported locally so the function
    # does not depend on the notebook's global namespace when it runs on a
    # dask worker.
    import sys
    import allel
    import zarr
    import numcodecs

    vcf_path = 'data/vcf/ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'.format(chrom)
    zarr_path = 'data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes'
    # Use an explicit string group name instead of relying on implicit
    # conversion of an int key.
    group = str(chrom)

    root = zarr.open(zarr_path, mode='a')
    if group in root:
        if root[group].attrs.get('complete', False):
            print('already done, skipping')
            return
        # Group exists but was never flagged complete: discard and redo it.
        del root[group]

    # zstd at compression level 1; shuffle=-1 is Blosc's automatic shuffle setting.
    compressor = numcodecs.Blosc(cname='zstd', clevel=1, shuffle=-1)
    allel.vcf_to_zarr(vcf_path, zarr_path, group=group, fields='*', alt_number=alt_number,
                      compressor=compressor, log=sys.stdout, chunk_length=2**18,
                      chunk_width=128)
    # Only mark the group complete once the conversion call has returned.
    root[group].attrs['complete'] = True


In [19]:
# Build one lazy task per autosome (chromosomes 1-22); nothing runs until dask.compute is called.
tasks = [delayed(parse_vcf_to_zarr)(chrom) for chrom in range(1, 23)]

In [20]:
# Run every conversion task. parse_vcf_to_zarr returns nothing, so the result is a list of None.
dask.compute(tasks)

([None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],)

## Add sample data

In [1]:
import pandas as pd

In [6]:
# Load the tab-separated sample panel, keeping only the four named columns.
df_samples = pd.read_csv('data/vcf/integrated_call_samples_v3.20130502.ALL.panel', 
                         sep='\t', 
                         usecols=['sample', 'pop', 'super_pop', 'gender'])
# Last expression of the cell: display the table.
df_samples

Unnamed: 0,sample,pop,super_pop,gender
0,HG00096,GBR,EUR,male
1,HG00097,GBR,EUR,female
2,HG00099,GBR,EUR,female
3,HG00100,GBR,EUR,female
4,HG00101,GBR,EUR,male
...,...,...,...,...
2499,NA21137,GIH,SAS,female
2500,NA21141,GIH,SAS,female
2501,NA21142,GIH,SAS,female
2502,NA21143,GIH,SAS,female


In [9]:
import zarr
# Same store location that parse_vcf_to_zarr writes to.
zarr_path = 'data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes'
# mode='a' opens the existing store for reading and writing, so sample metadata can be added below.
callset = zarr.open(zarr_path, mode='a')
# Show only the top level of the hierarchy.
print(callset.tree(level=1))

/
 ├── 1
 ├── 10
 ├── 11
 ├── 12
 ├── 13
 ├── 14
 ├── 15
 ├── 16
 ├── 17
 ├── 18
 ├── 19
 ├── 2
 ├── 20
 ├── 21
 ├── 22
 ├── 3
 ├── 4
 ├── 5
 ├── 6
 ├── 7
 ├── 8
 └── 9


In [10]:
print(callset['1'].tree())

1
 ├── calldata
 │   └── GT (6468094, 2504, 2) int8
 ├── samples (2504,) object
 └── variants
     ├── AA (6468094,) object
     ├── AC (6468094, 12) int32
     ├── AF (6468094, 12) float32
     ├── AFR_AF (6468094, 12) float32
     ├── ALT (6468094, 12) object
     ├── AMR_AF (6468094, 12) float32
     ├── AN (6468094,) int32
     ├── CHROM (6468094,) object
     ├── CIEND (6468094, 2) int32
     ├── CIPOS (6468094, 2) int32
     ├── CS (6468094,) object
     ├── DP (6468094,) int32
     ├── EAS_AF (6468094, 12) float32
     ├── END (6468094,) int32
     ├── EUR_AF (6468094, 12) float32
     ├── EX_TARGET (6468094,) bool
     ├── FILTER_PASS (6468094,) bool
     ├── ID (6468094,) object
     ├── IMPRECISE (6468094,) bool
     ├── MC (6468094,) object
     ├── MEINFO (6468094, 4) object
     ├── MEND (6468094,) int32
     ├── MLEN (6468094,) int32
     ├── MSTART (6468094,) int32
     ├── MULTI_ALLELIC (6468094,) bool
     ├── NS (6468094,) int32
     ├── POS (6468094,) int32
     ├── 

In [12]:
# Check samples from VCF same as panel
samples_from_vcf = callset['1/samples'][:].tolist()
samples_from_vcf[:5]

['HG00096', 'HG00097', 'HG00099', 'HG00100', 'HG00101']

In [14]:
samples_from_panel = df_samples['sample'].tolist()
samples_from_panel[:5]

['HG00096', 'HG00097', 'HG00099', 'HG00100', 'HG00101']

In [17]:
# The panel columns are stored next to the genotype data below, so the panel
# rows must list the same samples in the same order as the VCF (checked here
# against chromosome 1). Fail now instead of writing misaligned metadata.
samples_match = samples_from_vcf == samples_from_panel
assert samples_match, 'sample IDs in the panel do not match the VCF sample order'
samples_match

True

In [18]:
callset.require_group('samples')

<zarr.hierarchy.Group '/samples'>

In [20]:
import numcodecs

In [21]:
# overwrite=True replaces an existing array of the same name, so this cell can
# be re-run against a store that already holds the sample metadata.
callset.create_dataset('samples/ID', data=df_samples['sample'], compressor=None, chunks=None,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)

<zarr.core.Array '/samples/ID' (2504,) object>

In [22]:
# overwrite=True replaces an existing array of the same name, so this cell can
# be re-run against a store that already holds the sample metadata.
callset.create_dataset('samples/pop', data=df_samples['pop'], compressor=None, chunks=None,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)

<zarr.core.Array '/samples/pop' (2504,) object>

In [23]:
# overwrite=True replaces an existing array of the same name, so this cell can
# be re-run against a store that already holds the sample metadata.
callset.create_dataset('samples/super_pop', data=df_samples['super_pop'], compressor=None, chunks=None,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)

<zarr.core.Array '/samples/super_pop' (2504,) object>

In [24]:
# overwrite=True replaces an existing array of the same name, so this cell can
# be re-run against a store that already holds the sample metadata.
callset.create_dataset('samples/gender', data=df_samples['gender'], compressor=None, chunks=None,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)

<zarr.core.Array '/samples/gender' (2504,) object>

In [26]:
print(callset.tree(level=1))

/
 ├── 1
 ├── 10
 ├── 11
 ├── 12
 ├── 13
 ├── 14
 ├── 15
 ├── 16
 ├── 17
 ├── 18
 ├── 19
 ├── 2
 ├── 20
 ├── 21
 ├── 22
 ├── 3
 ├── 4
 ├── 5
 ├── 6
 ├── 7
 ├── 8
 ├── 9
 └── samples


In [27]:
print(callset['samples'].tree())

samples
 ├── ID (2504,) object
 ├── gender (2504,) object
 ├── pop (2504,) object
 └── super_pop (2504,) object


In [33]:
callset['samples/ID'][:]

array(['HG00096', 'HG00097', 'HG00099', ..., 'NA21142', 'NA21143',
       'NA21144'], dtype=object)

## Consolidate metadata

In [34]:
# Consolidate the store's group and array metadata so it can be opened with
# zarr.open_consolidated (done in the next cell).
zarr.consolidate_metadata(zarr_path)

<zarr.hierarchy.Group '/'>

In [36]:
print(zarr.open_consolidated(zarr_path).tree(level=1))

/
 ├── 1
 ├── 10
 ├── 11
 ├── 12
 ├── 13
 ├── 14
 ├── 15
 ├── 16
 ├── 17
 ├── 18
 ├── 19
 ├── 2
 ├── 20
 ├── 21
 ├── 22
 ├── 3
 ├── 4
 ├── 5
 ├── 6
 ├── 7
 ├── 8
 ├── 9
 └── samples


## Upload to GCS

In [43]:
# Sync the local Zarr directory to the bucket (-m: parallel transfers, rsync -r: recurse into subdirectories).
!gsutil -m rsync -r data/zarr/ gs://1000genomes-zarr/

Building synchronization state...
At source listing 10000...
Starting synchronization...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.11.0 [Content-Type=application/octet-stream]...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.12.0 [Content-Type=application/octet-stream]...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.13.0 [Content-Type=application/octet-stream]...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.14.0 [Content-Type=application/octet-stream]...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.15.0 [Content-Type=application/octet-stream]...
Copying file://data/zarr/ALL.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes/1/calldata/GT/0.18.0 [Content-Type=application/octet-stream]...
Copyi