In [14]:
import numpy as np
import h5py
import petl as etl
import allel; print('allel', allel.__version__)
%matplotlib inline
import matplotlib.pyplot as plt
import zarr
import seaborn as sns
# 'ticks' already gives a white background with axis ticks. A preceding
# set_style('white') call would be overwritten straight away, so set the
# final style once.
sns.set_style('ticks')
import os
# Record which container image the notebook ran in, when the environment
# advertises one through DOCKER_IMAGE.
docker_image = os.environ.get('DOCKER_IMAGE')
if docker_image is not None:
    print('docker image:', docker_image)

allel 1.2.1


In [2]:
# Study sample metadata: read the tab-delimited file, cast the columns at
# positions 3-7 to float, and order the rows by 'ox_code'.
tbl_samples_study = etl.sort(
    etl.convert(
        etl.fromtsv('/media/sf_Data/Genomes/meta/meta_gambia.txt'),
        (3, 4, 5, 6, 7),
        float,
    ),
    'ox_code',
)
tbl_samples_study

Sample_ID,ox_code,src_code,mean_coverage,pc_genome_covered,mapping_rate,mismatch_rate,duplicate_rate,ena_sample_acc,village,PCR,SINE200,Lat,Long,Village_IGS,Village_S200,Posizione,chrom X identification,karyotype 2R,2Rconf,2Rb,2Rc,2Rd,Inv2La,Taxon,Note,month
53624,AG0378-CW,001-1001,33.93,88.06,82.21,2.25,8.39,ERS680029,Tankular,M,na,13°25' N,16°02' W,Tankular_M,Tankular_na,Sud,A,+/+,-----,-,-,-,1,53624,14/06/2015,8
53625,AG0379-CW,001-1011,30.69,88.19,82.0,2.37,3.93,ERS680030,Tankular,M,na,13°25' N,16°02' W,Tankular_M,Tankular_na,Sud,A,b/b,-2---,2,-,-,2,53625,14/06/2015,8
53626,AG0380-CW,001-1032,35.08,88.02,82.33,2.35,9.18,ERS680031,Tankular,M,na,13°25' N,16°02' W,Tankular_M,Tankular_na,Sud,A,b/b,-2---,2,-,-,2,53626,14/06/2015,8
53627,AG0381-CW,001-1046,35.11,89.0,80.33,2.3,3.57,ERS680032,Tankular,M,na,13°25' N,16°02' W,Tankular_M,Tankular_na,Sud,A,b/b,-2---,2,-,-,2,53627,14/06/2015,8
53628,AG0382-CW,001-1048,36.75,89.56,84.65,2.32,2.18,ERS680033,Tankular,M,na,13°25' N,16°02' W,Tankular_M,Tankular_na,Sud,A,b/b,-2---,2,-,-,1,53628,14/06/2015,8


In [3]:
# this is the important bit - exclude samples with poor coverage
# A sample passes only if mean_coverage > 14 and pc_genome_covered > 88; the
# 'ox_code' values of the passing rows are collected into a plain list.
samples_study_pass = (
    tbl_samples_study
    .gt('mean_coverage', 14)
    .gt('pc_genome_covered', 88)
    .values('ox_code')
    .list()
)
len(samples_study_pass)

68

In [5]:
# Ag1000G phase 2 AR1 sample metadata, read from a tab-delimited file.
# No column conversion is applied to this table.
tbl_samples_ar1 = etl.fromtsv('/media/sf_Data/Genomes/meta/samples.meta.txt')
tbl_samples_ar1

ox_code,src_code,population,country,location,site,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,ebi_sample_acc,latitude,longitude
AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95033368,30.99,ERS311878,5.60858,-1.54926
AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95843804,31.7,ERS311886,5.60858,-1.54926
AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,107420666,35.65,ERS311894,4.91217,-1.77397
AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,95993752,29.46,ERS311902,4.91217,-1.77397
AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,103044262,33.67,ERS311910,4.91217,-1.77397


## Set up genotype data

In [6]:
# pick a chromosome
# `chrom` is substituted into the study HDF5 file name and is also the
# top-level key used when indexing into both callsets in the cells below.
chrom = '3L'

In [8]:
# Open the study callset for the selected chromosome. The file is opened
# read-only ('r') so the source HDF5 cannot be modified from this notebook.
callset_study = h5py.File(
    '/media/sf_Data/Genomes/variation/gambia_hdf5/1130-AG-GM-CAPUTO.%s.h5' % (chrom,),
    'r',
)
callset_study

<HDF5 file "1130-AG-GM-CAPUTO.3L.h5" (mode r)>

In [11]:
# Sample identifiers for the study callset, in callset column order. They are
# read from the 'samples' dataset and decoded from ASCII bytes to str.
samples_study = [
    sample_id.decode('ascii')
    for sample_id in callset_study[chrom]['samples'][:]
]
samples_study[:5]

['AG0378-CW', 'AG0379-CW', 'AG0380-CW', 'AG0381-CW', 'AG0382-CW']

In [9]:
# setup genotypes from the study
# The 'calldata/genotype' dataset is handed to the wrapper as-is (no [:] slice).
# Note the study callset names this dataset 'genotype', whereas the ag1000g
# callset below uses 'GT'.
# NOTE(review): presumably this keeps the data on disk until it is accessed -
# confirm against the scikit-allel chunked array documentation.
genotypes_study = allel.GenotypeChunkedArray(callset_study[chrom]['calldata/genotype'])
genotypes_study

Unnamed: 0,0,1,2,3,4,...,104,105,106,107,108,Unnamed: 12
0,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
2,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
16437132,0/0,./.,./.,0/0,0/0,...,./.,0/0,./.,./.,./.,
16437133,./.,./.,./.,0/0,./.,...,./.,./.,./.,./.,./.,
16437134,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,


In [12]:
# Restrict the study genotypes to the samples that passed QC.
# sidx holds, for each passing sample, its position in samples_study; a passing
# sample that is absent from the callset raises ValueError from list.index.
sidx = list(map(samples_study.index, samples_study_pass))
# axis=1 selects along the second dimension of the genotype array.
genotypes_study_qc = genotypes_study.take(sidx, axis=1)
genotypes_study_qc

Unnamed: 0,0,1,2,3,4,...,63,64,65,66,67,Unnamed: 12
0,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
2,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
16437132,0/0,./.,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
16437133,./.,./.,./.,0/0,./.,...,./.,./.,./.,./.,./.,
16437134,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,


In [16]:
# setup callset from ag1000g phase 2
# Open read-only. Without an explicit mode zarr opens in 'a' (read/write,
# create if missing), which would silently create an empty store for a
# mistyped path and would leave the reference callset writable.
# NOTE: the variable is named *_ar3 but the path points at the phase 2 AR1
# release; the name is kept because later cells refer to it.
callset_ar3 = zarr.open(
    '/media/sf_Data/Genomes/variation/ag1000g.phase2.ar1.pass.biallelic/',
    mode='r',
)
callset_ar3.tree()

In [18]:
# setup genotypes from ag1000g phase 2
# Wraps the 'calldata/GT' array of the selected chromosome from the ag1000g
# callset. The study callset stores the equivalent data under
# 'calldata/genotype'.
genotypes_ar3 = allel.GenotypeChunkedArray(callset_ar3[chrom]['calldata/GT'])
genotypes_ar3

Unnamed: 0,0,1,2,3,4,...,1137,1138,1139,1140,1141,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
7897663,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
7897664,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
7897665,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


## SNP ascertainment

In [19]:
# need the FILTER PASS values
# Build a variant table over the 'variants' group of the selected chromosome,
# limited to the named fields.
variant_fields = ['POS', 'REF', 'ALT', 'FILTER_PASS']
variants_ar3 = allel.VariantChunkedTable(
    callset_ar3[chrom]['variants'],
    names=variant_fields,
)
variants_ar3

Unnamed: 0,POS,REF,ALT,FILTER_PASS,Unnamed: 5
0,9790,b'C',b'T',True,
1,9791,b'G',b'T',True,
2,9798,b'G',b'A',True,
...,...,...,...,...,...
7897663,41956537,b'G',b'A',True,
7897664,41956541,b'C',b'A',True,
7897665,41956551,b'G',b'A',True,


In [20]:
# count alleles in ag1000g
# max_allele=3 is the same value used for the study counts in the next cell.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so
# ac_ar3 was never assigned in that session and the cells that use it were
# not executed. Presumably this is a long pass over the whole genotype array -
# confirm, and consider caching the result.
ac_ar3 = genotypes_ar3.count_alleles(max_allele=3)

KeyboardInterrupt: 

In [None]:
# count alleles in study
# Counts are taken over the QC-passing samples only (genotypes_study_qc), with
# the same max_allele value as the ag1000g counts.
ac_study = genotypes_study_qc.count_alleles(max_allele=3)

In [None]:
# combine allele counts
# The sum is element-wise, so row i of both inputs must describe the same
# variant. The two callsets shown in this notebook have different numbers of
# variants, so check the shapes first and fail with an actionable message.
# Equal shapes are necessary but do not by themselves prove that the variant
# positions match.
if ac_ar3.shape != ac_study.shape:
    raise ValueError(
        'allele counts are not aligned: ag1000g %r vs study %r; '
        'subset both callsets to a shared set of variant positions first'
        % (ac_ar3.shape, ac_study.shape)
    )
ac = allel.AlleleCountsArray(ac_ar3 + ac_study)
ac

In [None]:
# locate biallelic SNPs segregating above some reasonable frequency
# A variant is kept when no allele index above 1 is observed and the smaller
# of the first two allele counts exceeds 7.
is_biallelic = ac.max_allele() <= 1
min_count_first_two = ac[:, :2].min(axis=1)
loc_bi_seg = is_biallelic & (min_count_first_two > 7)
# Show the total number of variants alongside the number selected.
loc_bi_seg.size, np.count_nonzero(loc_bi_seg)