# MalariaGen Directory and Data Structure

This notebook describes the directory layout and data structure of the MalariaGen Phase2 (AR1) variant data hosted on Google Cloud Storage.

# Imports

In [13]:
import numpy as np
import zarr
import pandas as pd
import dask.array as da
import allel
from pprint import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Start a Dask cluster on Kubernetes, requesting 30 workers.
from dask_kubernetes import KubeCluster
cluster = KubeCluster(n_workers=30)
# Bare last expression: renders the cluster widget as the cell output.
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.35.63.92:38717
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

distributed.scheduler - INFO - Register tcp://10.32.38.247:38543
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.32.38.247:38543
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.160.22:43789
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.160.22:43789
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.231.3:43433
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.231.3:43433
distributed.core - INFO - Starting established connection


# Storage Path on GCS

In [8]:
# GCS configuration: an anonymous (token='anon'), read-only file-system handle
# that the bucket listings and the Zarr store below are built on.
import gcsfs
gcs_bucket_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='anon', access='read_only')

### Quick Note On Zarr

A Zarr hierarchy is accessed much like a nested Python dict, but it is stored on a file system (here, a GCS bucket): each key is a folder, and the chunks of an array are files inside that array's folder.

# Study Data

This includes the accessibility, genome, haplotypes, samples, and variation data.

In [9]:
# Top-level prefix of the phase 2 AR1 release inside the bucket.
storage_path = 'ag1000g-release/phase2.AR1'
# List its immediate children; the bare expression is shown as the cell output.
gcs_bucket_fs.ls(storage_path)

['ag1000g-release/phase2.AR1/accessibility',
 'ag1000g-release/phase2.AR1/genome',
 'ag1000g-release/phase2.AR1/haplotypes',
 'ag1000g-release/phase2.AR1/samples',
 'ag1000g-release/phase2.AR1/variation']

## Genome Data

In [14]:
# Recursively list the genome directory.
storage_path = 'ag1000g-release/phase2.AR1/genome'
# walk() yields one (path, sub-directories, files) tuple per directory visited.
walker = gcs_bucket_fs.walk(storage_path)
for walk in walker:
    pprint(walk)

('ag1000g-release/phase2.AR1/genome', ['agamP3', 'agamP4'], [])
('ag1000g-release/phase2.AR1/genome/agamP3',
 [],
 ['Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.dict',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa.fai',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa.flat',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa.gdx',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa.md5',
  'Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.gff3.gz',
  'Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.gff3.gz.md5',
  'Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.txt.gz',
  'Anopheles-gambiae-PEST_REPEATFEATURES_AgamP3.txt.gz.md5',
  'Anopheles-gambiae-PEST_REPEATS.lib',
  'Anopheles-gambiae-PEST_REPEATS.lib.md5'])
('ag1000g-release/phase2.AR1/genome/agamP4',
 [],
 ['Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.fai',
  'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.flat',
  'Anopheles-gambiae-PEST_CHROMOSOM

## Haplotypes

In [20]:
# Immediate children of the haplotypes directory.
storage_path = 'ag1000g-release/phase2.AR1/haplotypes/'
print(gcs_bucket_fs.ls(storage_path))

# For the sake of brevity, only chromosome 2L is walked in this listing.
storage_path = 'ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes/2L'
# walk() yields one (path, sub-directories, files) tuple per directory visited.
walker = gcs_bucket_fs.walk(storage_path)
for walk in walker:
    print(walk)

['ag1000g-release/phase2.AR1/haplotypes/main']
('ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes/2L', ['calldata', 'samples', 'variants'], ['.snakemake_timestamp', '.zgroup'])
('ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes/2L/calldata', ['GT'], ['.zgroup'])
('ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes/2L/calldata/GT', [], ['.zarray', '0.0.0', '0.1.0', '0.10.0', '0.11.0', '0.12.0', '0.13.0', '0.14.0', '0.15.0', '0.16.0', '0.17.0', '0.18.0', '0.19.0', '0.2.0', '0.3.0', '0.4.0', '0.5.0', '0.6.0', '0.7.0', '0.8.0', '0.9.0', '1.0.0', '1.1.0', '1.10.0', '1.11.0', '1.12.0', '1.13.0', '1.14.0', '1.15.0', '1.16.0', '1.17.0', '1.18.0', '1.19.0', '1.2.0', '1.3.0', '1.4.0', '1.5.0', '1.6.0', '1.7.0', '1.8.0', '1.9.0', '10.0.0', '10.1.0', '10.10.0', '10.11.0', '10.12.0', '10.13.0', '10.14.0', '10.15.0', '10.16.0', '10.17.0', '10.18.0', '10.19.0', '10.2.0', '10.3.0', '10.4.0', '10.5.0', '10.6.0', '10.7.

# Let's get some variant data

Variant data is grouped by what kind of filtering (if any) has been applied. The available groups are `all`, `biallelic`, `biallelic_snpeff`, and `pass`.

In [30]:
# Groups available under the variation Zarr root, one per filtering level.
storage_path = 'ag1000g-release/phase2.AR1/variation/main/zarr'
print(gcs_bucket_fs.ls(storage_path))

# Contents of the `pass` callset: one entry per chromosome arm plus `samples`.
storage_path = 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass'
print(gcs_bucket_fs.ls(storage_path))


['ag1000g-release/phase2.AR1/variation/main/zarr/all', 'ag1000g-release/phase2.AR1/variation/main/zarr/biallelic', 'ag1000g-release/phase2.AR1/variation/main/zarr/biallelic_snpeff', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass']
['ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/.zgroup', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/.zmetadata', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/2L', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/2R', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/3L', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/3R', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/X', 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass/samples']


In [32]:
# Root of the `pass` callset inside the bucket.
storage_path = 'ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass'
# Wrap that bucket prefix as a mapping object that is handed to zarr as its store.
store = gcsfs.mapping.GCSMap(storage_path, gcs=gcs_bucket_fs, check=False, create=False)

In [33]:
# The callset ships a consolidated `.zmetadata` key (it appears in the bucket
# listing of this prefix), so open the hierarchy through it: group and array
# metadata are then served from that single object instead of one GCS request
# per `.zgroup` / `.zarray` file.
callset = zarr.open_consolidated(store)
# Bare last expression: show the opened root group as the cell output.
callset

<zarr.hierarchy.Group '/'>

In [38]:
# Names of the top-level members of the callset.
list(callset.keys())

['2L', '2R', '3L', '3R', 'X', 'samples']

distributed.scheduler - INFO - Remove worker tcp://10.35.5.2:45685
distributed.core - INFO - Removing comms to tcp://10.35.5.2:45685


Depending on the callset, the genotype calls are accessed as `callset[chrom]['calldata/GT']` or `callset[chrom]['calldata/genotype']`.

In [34]:
# Chromosome arm to inspect; must be one of the callset's top-level keys.
chrom = '3R'
# Genotype-call array for that arm, addressed by its path inside the group.
gtz = callset[chrom]['calldata/GT']
gtz

<zarr.core.Array '/3R/calldata/GT' (14481509, 1142, 2) int8>

In [35]:
# Wrap the zarr genotype array in scikit-allel's dask-backed genotype container.
gt = allel.GenotypeDaskArray(gtz)
# Bare last expression: display the container's tabular preview.
gt

Unnamed: 0,0,1,2,3,4,...,1137,1138,1139,1140,1141,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
14481506,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
14481507,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
14481508,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


# Grab Sample MetaData

In [26]:
# Download the sample metadata table into the working directory;
# --no-clobber skips the download when samples.meta.txt already exists.
!wget --no-clobber ftp://ngs.sanger.ac.uk/production/ag1000g/phase2/AR1/samples/samples.meta.txt

--2020-07-19 07:23:54--  ftp://ngs.sanger.ac.uk/production/ag1000g/phase2/AR1/samples/samples.meta.txt
           => ‘samples.meta.txt’
Resolving ngs.sanger.ac.uk (ngs.sanger.ac.uk)... 193.62.203.79
Connecting to ngs.sanger.ac.uk (ngs.sanger.ac.uk)|193.62.203.79|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /production/ag1000g/phase2/AR1/samples ... done.
==> SIZE samples.meta.txt ... 150452
==> PASV ... done.    ==> RETR samples.meta.txt ... done.
Length: 150452 (147K) (unauthoritative)


2020-07-19 07:23:56 (472 KB/s) - ‘samples.meta.txt’ saved [150452]



In [27]:
# The metadata file downloaded above is tab-delimited, which is read_table's
# default separator.
samples_meta_file = 'samples.meta.txt'
df_samples = pd.read_table(samples_meta_file)
# Preview the first rows as the cell output.
df_samples.head()

Unnamed: 0,ox_code,src_code,population,country,location,site,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,ebi_sample_acc,latitude,longitude
0,AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95033368,30.99,ERS311878,5.60858,-1.54926
1,AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95843804,31.7,ERS311886,5.60858,-1.54926
2,AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,107420666,35.65,ERS311894,4.91217,-1.77397
3,AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,95993752,29.46,ERS311902,4.91217,-1.77397
4,AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,103044262,33.67,ERS311910,4.91217,-1.77397
