In [None]:
!pip install -q \
    zarr==2.6.1 \
    fsspec==0.8.7 \
    gcsfs==0.7.2 \
    dask==2021.03.0 \
    xarray==0.18.0 \
    scikit-allel==1.3.5 \
    bokeh==2.3.2 \
    malariagen_data==0.8.0 \
    plotly==4.14.3

In [None]:
# import libraries: standard library first, then third-party packages
import bisect
import hashlib
import json
import os

import allel
import bokeh.io
import bokeh.layouts
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import dask
import dask.array as da
import malariagen_data
import numpy as np
import pandas as pd
import plotly.express as px
from dask.diagnostics.progress import ProgressBar

# quieten dask warnings about large chunks when slicing arrays
dask.config.set(**{'array.slicing.split_large_chunks': False})

In [None]:
bokeh.io.output_notebook()

In [None]:
ProgressBar().register()

In [None]:
# handle used by every later cell to read Ag3 data from the public
# Google Cloud Storage bucket
# NOTE(review): pre=True presumably opts in to pre-release data such as
# v3.2 - confirm against the malariagen_data documentation
ag3 = malariagen_data.Ag3("gs://vo_agam_release/", pre=True)

In [None]:
!mkdir -pv ~/vo_agam_release/v3.2/
!gsutil cp gs://vo_agam_release/v3.2/manifest.tsv ~/vo_agam_release/v3.2/

Copying gs://vo_agam_release/v3.2/manifest.tsv...
/ [1 files][  219.0 B/  219.0 B]                                                
Operation completed over 1 objects/219.0 B.                                      


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd ~/vo_agam_release/v3.2/

/root/vo_agam_release/v3.2


In [None]:
!mkdir -pv ~/vo_agam_release/v3.2/metadata/
!gsutil -m rsync -r gs://vo_agam_release/v3.2/metadata/ ~/vo_agam_release/v3.2/metadata/

mkdir: created directory '/root/vo_agam_release/v3.2/metadata/'
Building synchronization state...
Starting synchronization...
Copying gs://vo_agam_release/v3.2/metadata/general/1177-VO-ML-LEHMANN-VMF00015/samples.meta.csv...
Copying gs://vo_agam_release/v3.2/metadata/species_calls_20200422/1237-VO-BJ-DJOGBENOU-VMF00067/samples.species_pca.csv...
Copying gs://vo_agam_release/v3.2/metadata/general/1237-VO-BJ-DJOGBENOU-VMF00067/samples.meta.csv...
Copying gs://vo_agam_release/v3.2/metadata/general/1237-VO-BJ-DJOGBENOU-VMF00050/samples.meta.csv...
Copying gs://vo_agam_release/v3.2/metadata/general/1244-VO-GH-YAWSON-VMF00051/wgs_snp_data.csv...
Copying gs://vo_agam_release/v3.2/metadata/general/1237-VO-BJ-DJOGBENOU-VMF00067/wgs_snp_data.csv...
Copying gs://vo_agam_release/v3.2/metadata/general/1253-VO-TG-DJOGBENOU-VMF00052/samples.meta.csv...
Copying gs://vo_agam_release/v3.2/metadata/species_calls_20200422/1237-VO-BJ-DJOGBENOU-VMF00050/samples.species_pca.csv...
Copying gs://vo_agam_releas

In [None]:
!head ~/vo_agam_release/v3.2/metadata/species_calls_20200422/*/samples.species_aim.csv

==> /root/vo_agam_release/v3.2/metadata/species_calls_20200422/1177-VO-ML-LEHMANN-VMF00015/samples.species_aim.csv <==
sample_id,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii
VBS04793-4651STDY7017929,0.979,0.002,gamb_colu,coluzzii
VBS04794-4651STDY7017930,0.971,0.003,gamb_colu,coluzzii
VBS04796-4651STDY7017932,0.982,0.003,gamb_colu,coluzzii
VBS04797-4651STDY7017933,0.982,0.002,gamb_colu,coluzzii
VBS04798-4651STDY7017934,0.971,0.002,gamb_colu,coluzzii
VBS04799-4651STDY7017935,0.986,0.002,gamb_colu,coluzzii
VBS04801-4651STDY7017937,0.971,0.002,gamb_colu,coluzzii
VBS04802-4651STDY7017938,0.986,0.002,gamb_colu,coluzzii
VBS04803-4651STDY7017939,0.980,0.002,gamb_colu,coluzzii

==> /root/vo_agam_release/v3.2/metadata/species_calls_20200422/1237-VO-BJ-DJOGBENOU-VMF00050/samples.species_aim.csv <==
sample_id,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii
VBS18949-5562STDY7801785,0.975,0.002,gamb_colu,coluzzii
V

In [None]:
!head ~/vo_agam_release/v3.2/metadata/general/*/wgs_snp_data.csv | cut -d, -f1,2

==> /root/vo_agam_release/v3.2/metadata/general/1177-VO-ML-LEHMANN-VMF00015/wgs_snp_data.csv <==
sample_id,alignments_bam
VBS04793-4651STDY7017929,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04793-4651STDY7017929.fixmate.bam
VBS04794-4651STDY7017930,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04794-4651STDY7017930.fixmate.bam
VBS04796-4651STDY7017932,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04796-4651STDY7017932.fixmate.bam
VBS04797-4651STDY7017933,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04797-4651STDY7017933.fixmate.bam
VBS04798-4651STDY7017934,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04798-4651STDY7017934.fixmate.bam
VBS04799-4651STDY7017935,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04799-4651STDY7017935.fixmate.bam
VBS04801-4651STDY7017937,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.ac.uk/VBS04801-4651STDY7017937.fixmate.bam
VBS04802-4651STDY7017938,https://1177-vo-ml-lehmann-vmf00015.cog.sanger.a

In [None]:
!head ~/vo_agam_release/v3.2/metadata/general/*/samples.meta.csv

==> /root/vo_agam_release/v3.2/metadata/general/1177-VO-ML-LEHMANN-VMF00015/samples.meta.csv <==
sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call
VBS04793-4651STDY7017929,MB257 (MB2),Tovi Lehmann,Mali,Markabougou,2013,8,13.914,-6.344,F
VBS04794-4651STDY7017930,TB615,Tovi Lehmann,Mali,Thierola,2014,9,13.659,-7.215,M
VBS04796-4651STDY7017932,TB513 (SB513),Tovi Lehmann,Mali,Siguima,2014,8,14.168,-7.228,M
VBS04797-4651STDY7017933,SB173 (SB3),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F
VBS04798-4651STDY7017934,SB173 (SB4),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F
VBS04799-4651STDY7017935,SB407,Tovi Lehmann,Mali,Siguima,2014,8,14.168,-7.228,F
VBS04801-4651STDY7017937,SB179 (SB7),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F
VBS04802-4651STDY7017938,SB178 (SB5),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F
VBS04803-4651STDY7017939,DB40,Tovi Lehmann,Mali,Dallowere,2015,7,13.616,-7.037,F

==> /root/vo_agam_release/v3.2/metadata/ge

In [None]:
!head ~/vo_agam_release/v3.2/metadata/general/1237-VO-BJ-DJOGBENOU-VMF00050/samples.meta.csv

sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call
VBS18949-5562STDY7801785,WA-1044,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18950-5562STDY7801786,WA-1045,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18951-5562STDY7801787,WA-1046,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18952-5562STDY7801788,WA-1047,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18953-5562STDY7801789,WA-1048,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18954-5562STDY7801790,WA-1049,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18955-5562STDY7801791,WA-1050,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F
VBS18956-5562STDY7801792,WA-1051,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,M
VBS18957-5562STDY7801793,WA-1052,Luc Djogbenou,Benin,Avrankou,2017,9,6.550,2.667,F


In [None]:
!head ~/vo_agam_release/v3.2/metadata/general/1237-VO-BJ-DJOGBENOU-VMF00050/wgs_snp_data.csv | cut -d, -f1,2

sample_id,alignments_bam
VBS18949-5562STDY7801785,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18949-5562STDY7801785.fixmate.bam
VBS18950-5562STDY7801786,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18950-5562STDY7801786.fixmate.bam
VBS18951-5562STDY7801787,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18951-5562STDY7801787.fixmate.bam
VBS18952-5562STDY7801788,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18952-5562STDY7801788.fixmate.bam
VBS18953-5562STDY7801789,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18953-5562STDY7801789.fixmate.bam
VBS18954-5562STDY7801790,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18954-5562STDY7801790.fixmate.bam
VBS18955-5562STDY7801791,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18955-5562STDY7801791.fixmate.bam
VBS18956-5562STDY7801792,https://1237-vo-bj-djogbenou-vmf00050.cog.sanger.ac.uk/VBS18956-5562STDY7801792.fixmate.bam
VBS18957-5562STDY7801793,https://1237-v

In [None]:
# mount Google Drive if running on Google Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# create a directory to hold PCA results - change this to something suitable
# if not running on Google Colab.
# The path is absolute on purpose: an earlier cell changes the working
# directory with `cd`, so a relative "drive/..." path would be created under
# the new working directory instead of on the Google Drive mounted at
# /content/drive, and cached results would be lost with the Colab session.
results_dir = "/content/drive/MyDrive/Colab Data/ag3-pca-results"
os.makedirs(results_dir, exist_ok=True)

In [None]:
df_sample_sets = ag3.sample_sets(release="v3.2")
df_sample_sets

Unnamed: 0,sample_set,sample_count,release
0,1177-VO-ML-LEHMANN-VMF00015,23,v3.2
1,1237-VO-BJ-DJOGBENOU-VMF00050,90,v3.2
2,1237-VO-BJ-DJOGBENOU-VMF00067,142,v3.2
3,1244-VO-GH-YAWSON-VMF00051,666,v3.2
4,1245-VO-CI-CONSTANT-VMF00054,38,v3.2
5,1253-VO-TG-DJOGBENOU-VMF00052,179,v3.2


In [None]:
df_geneset = ag3.geneset().set_index("ID")
df_geneset

Unnamed: 0_level_0,contig,source,type,start,end,score,strand,phase,Parent,Name,description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2L,2L,VectorBase,chromosome,1,49364325,,,,,,
AGAP004677,2L,VectorBase,gene,157348,186936,,-,,,,methylenetetrahydrofolate dehydrogenase(NAD ) ...
AGAP004677-RA,2L,VectorBase,mRNA,157348,181305,,-,,AGAP004677,,
,2L,VectorBase,three_prime_UTR,157348,157495,,-,,AGAP004677-RA,,
,2L,VectorBase,exon,157348,157623,,-,,AGAP004677-RA,AGAP004677-RB-E4,
...,...,...,...,...,...,...,...,...,...,...,...
,Y_unplaced,VectorBase,five_prime_UTR,47932,48111,,+,,AGAP029375-RA,,
,Y_unplaced,VectorBase,exon,47932,48138,,+,,AGAP029375-RA,AGAP029375-RA-E2,
AGAP029375-PA,Y_unplaced,VectorBase,CDS,48112,48138,,+,0.0,AGAP029375-RA,,
,Y_unplaced,VectorBase,exon,48301,48385,,+,,AGAP029375-RA,AGAP029375-RA-E3,


In [None]:
def plot_genes(contig, width=750, height=150):
    """Plot all genes on a contig as rectangles, one track per strand.

    Parameters
    ----------
    contig : str
        Contig to plot, e.g., '2L'.
    width : int, optional
        Plot width passed to bokeh.
    height : int, optional
        Plot height passed to bokeh.

    Notes
    -----
    Gene annotations and the contig length are read from the notebook-level
    `ag3` object. The figure is displayed with `bokeh.plotting.show`;
    nothing is returned.

    """

    # select the gene rows within the given contig; @contig refers to the
    # local variable, which avoids quoting problems from string interpolation
    df_geneset = ag3.geneset(attributes=["ID", "Name", "Parent", "description"]).set_index("ID")
    data = df_geneset.query("type == 'gene' and contig == @contig").copy()

    # plot each gene as a rectangle - add some columns to define rectangle
    # coordinates
    data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
    data['right'] = data['end'] / 1e6  # plot in Mbp coordinates
    # forward-strand genes sit on the upper track, everything else on the lower
    data['bottom'] = np.where(data['strand'] == '+', 1, 0)
    data['top'] = data['bottom'] + 0.8

    # tidy up some columns for presentation; assign the result back rather
    # than calling fillna(inplace=True) on a column selection, which may
    # operate on a temporary copy and leave `data` unchanged
    data['Name'] = data['Name'].fillna('')
    data['description'] = data['description'].fillna('')

    # determine how long the contig is
    contig_length = len(ag3.genome_sequence(contig))

    # define tooltips for hover
    tooltips = [
        ("ID", '@ID'),
        ("Name", '@Name'),
        ("Description", '@description'),
    ]

    # make a figure
    fig = bokeh.plotting.figure(
        title=f'Genes - {contig}',
        plot_width=width,
        plot_height=height,
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,tap,hover',
        toolbar_location='above',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=tooltips,
    )

    # add functionality to click through to vectorbase; "@ID" is filled in
    # by bokeh from the clicked glyph's data, not by Python
    url = 'https://vectorbase.org/vectorbase/app/record/gene/@ID'
    taptool = fig.select(type=bokeh.models.TapTool)
    taptool.callback = bokeh.models.OpenURL(url=url)

    # now plot the genes as rectangles
    fig.quad(bottom='bottom', top='top', left='left', right='right',
             source=data, line_width=.5, fill_alpha=.5)

    # tidy up the plot
    fig.x_range = bokeh.models.Range1d(0, contig_length/1e6, bounds='auto')
    fig.xaxis.axis_label = 'Position (Mbp)'
    fig.y_range = bokeh.models.Range1d(-.5, 2.3)
    fig.ygrid.visible = False
    # label the vertical middle of each strand track
    yticks = [0.4, 1.4]
    yticklabels = ['rev', 'fwd']
    fig.yaxis.ticker = yticks
    fig.yaxis.major_label_overrides = dict(zip(yticks, yticklabels))
    fig.yaxis.axis_label = 'Strand'

    # show the plot
    bokeh.plotting.show(fig)

In [None]:
plot_genes("2L")

In [None]:
plot_genes("2R")

In [None]:
plot_genes("3L")

In [None]:
plot_genes("3R")

In [None]:
plot_genes("X")

In [None]:
plot_genes("UNKN")

In [None]:
df_geneset.query("Parent == 'AGAP002862'")

Unnamed: 0_level_0,contig,source,type,start,end,score,strand,phase,Parent,Name,description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AGAP002862-RA,2R,VectorBase,mRNA,28480576,28482637,,-,,AGAP002862,,


In [None]:
def plot_transcript(transcript, width=700, height=120, show=True, x_range=None,
                    toolbar_location='above'):
    """Plot the exon/intron/UTR/CDS structure of a single transcript.

    Parameters
    ----------
    transcript : str
        Transcript ID, e.g., 'AGAP004050-RA'. Must be present in the "ID"
        column of the geneset, otherwise the lookup raises KeyError.
    width : int, optional
        Plot width passed to bokeh.
    height : int, optional
        Plot height passed to bokeh.
    show : bool, optional
        If True, display the figure immediately.
    x_range : bokeh range, optional
        X range to use, e.g., to link panning/zooming with another figure.
    toolbar_location : str or None, optional
        Where to place the bokeh toolbar.

    Returns
    -------
    fig : bokeh figure
        The transcript figure, returned whether or not it was shown.

    Notes
    -----
    Annotations are read from the notebook-level `ag3` object.

    """

    # look up the annotation row for the transcript itself
    df_geneset = ag3.geneset().set_index("ID")
    transcript_row = df_geneset.loc[transcript]

    # define tooltips for hover
    tooltips = [
        ("Type", '@type'),
        ("Location", '@contig:@start{,}..@end{,}'),
    ]

    # make a figure
    fig = bokeh.plotting.figure(
        title=f'Transcript - {transcript} ({transcript_row.strand})',
        plot_width=width,
        plot_height=height,
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        toolbar_location=toolbar_location,
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=tooltips,
        x_range=x_range,
    )

    # find child components of the transcript; @transcript refers to the
    # local variable, which avoids quoting problems from string interpolation
    data = df_geneset.query("Parent == @transcript").copy()
    data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
    data['right'] = data['end'] / 1e6  # plot in Mbp coordinates
    data['bottom'] = -0.4
    data['top'] = 0.4

    # plot exons; sort by start so that consecutive rows are neighbouring
    # exons, which the intron drawing below relies on
    exons = data.query("type == 'exon'").sort_values('start')
    fig.quad(bottom='bottom', top='top', left='left', right='right',
             source=exons, fill_color=None, line_color='black', line_width=.5,
             fill_alpha=0)

    # plot introns as a shallow "hat" joining each exon to the next one
    for l, r in zip(exons['right'][:-1], exons['left'][1:]):
        m = (l + r) / 2
        fig.line([l, m, r], [0, .1, 0], line_width=1, line_color="black")

    # plot UTRs
    fig.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'five_prime_UTR'"),
             fill_color='green', line_width=0, fill_alpha=.5)
    fig.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'three_prime_UTR'"),
             fill_color='red', line_width=0, fill_alpha=.5)

    # plot CDSs
    fig.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'CDS'"),
             fill_color='blue', line_width=0, fill_alpha=.5)

    # tidy up the plot
    fig.yaxis.ticker = []
    fig.y_range = bokeh.models.Range1d(-.6, .6)
    fig.xaxis.axis_label = f'Contig {transcript_row.contig} position (Mbp)'

    # show the figure
    if show:
        bokeh.plotting.show(fig)

    return fig

In [None]:
plot_transcript("AGAP004050-RA");

In [None]:
plot_transcript("AGAP004050-RB");

In [None]:
df_geneset.query("Parent == 'AGAP001356'")

Unnamed: 0_level_0,contig,source,type,start,end,score,strand,phase,Parent,Name,description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AGAP001356-RA,2R,VectorBase,mRNA,3483099,3497400,,+,,AGAP001356,,


In [None]:
plot_transcript("AGAP006028-RA");

In [None]:
plot_transcript("AGAP006028-RB");

In [None]:
plot_transcript("AGAP006028-RC");

In [None]:
# transcript to investigate - change this to look at a different gene or
# transcript. The results below place AGAP002862-RA on contig 2R.
# NOTE(review): this was previously labelled "Rdl", but the Rdl transcripts
# plotted elsewhere in this notebook are AGAP006028-R*; AGAP002862 is
# presumably Cyp6aa1 - confirm on VectorBase
transcript = "AGAP002862-RA"

# compute effects for all SNPs in chosen transcript
df_effects = ag3.snp_effects(
    transcript=transcript, 
)
df_effects

[########################################] | 100% Completed | 39.5s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.2s
[########################################] | 100% Completed |  0.3s
[########################################] | 100% Completed |  0.6s


Unnamed: 0,contig,position,ref_allele,alt_allele,pass_gamb_colu_arab,pass_gamb_colu,pass_arab,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
0,2R,28480576,G,A,True,True,True,THREE_PRIME_UTR,LOW,,,,,,
1,2R,28480576,G,C,True,True,True,THREE_PRIME_UTR,LOW,,,,,,
2,2R,28480576,G,T,True,True,True,THREE_PRIME_UTR,LOW,,,,,,
3,2R,28480577,C,A,True,True,True,THREE_PRIME_UTR,LOW,,,,,,
4,2R,28480577,C,T,True,True,True,THREE_PRIME_UTR,LOW,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6181,2R,28482636,G,C,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,
6182,2R,28482636,G,T,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,
6183,2R,28482637,A,C,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,
6184,2R,28482637,A,T,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,


In [None]:
df_effects.groupby(['impact', 'effect']).size()

impact    effect               
HIGH      SPLICE_CORE                12
          START_LOST                  3
          STOP_GAINED               145
          STOP_LOST                   8
LOW       FIVE_PRIME_UTR            219
          SYNONYMOUS_CODING        1088
          THREE_PRIME_UTR          1104
MODERATE  NON_SYNONYMOUS_CODING    3310
          SPLICE_REGION              30
MODIFIER  INTRONIC                  267
dtype: int64

In [None]:
df_samples = ag3.sample_metadata(
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"]
)
df_samples.head(100)

In [None]:
df_samples = ag3.sample_metadata(
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"]
)
df_samples.columns

Index(['sample_id', 'partner_sample_id', 'contributor', 'country', 'location',
       'year', 'month', 'latitude', 'longitude', 'sex_call', 'sample_set',
       'release', 'aim_fraction_colu', 'aim_fraction_arab',
       'species_gambcolu_arabiensis', 'species_gambiae_coluzzii', 'species'],
      dtype='object')

In [None]:
df_samples.groupby(['country', 'location', 'year', 'species', 'sample_set' ]).size()

country        location     year  species                        sample_set                   
Benin          Avrankou     2017  coluzzii                       1237-VO-BJ-DJOGBENOU-VMF00050     90
               Djougou      2017  coluzzii                       1237-VO-BJ-DJOGBENOU-VMF00067     78
                                  gambiae                        1237-VO-BJ-DJOGBENOU-VMF00067     63
                                  intermediate_gambiae_coluzzii  1237-VO-BJ-DJOGBENOU-VMF00067      1
Cote d'Ivoire  Aboisso      2017  gambiae                        1245-VO-CI-CONSTANT-VMF00054      37
                                  intermediate_gambiae_coluzzii  1245-VO-CI-CONSTANT-VMF00054       1
Ghana          Korle-Bu     2018  coluzzii                       1244-VO-GH-YAWSON-VMF00051       264
                                  intermediate_gambiae_coluzzii  1244-VO-GH-YAWSON-VMF00051         4
               Madina       2017  gambiae                        1244-VO-GH-YAWSON-VMF000

In [None]:
import io

In [None]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
df_pheno = pd.read_csv('/content/sample_phenotypes.csv', sep='\t')
df_pheno.head(5)

In [None]:
df_sample_sets = ag3.sample_sets(release="v3.2")
df_sample_sets.head(5)

Unnamed: 0,sample_set,sample_count,release
0,1177-VO-ML-LEHMANN-VMF00015,23,v3.2
1,1237-VO-BJ-DJOGBENOU-VMF00050,90,v3.2
2,1237-VO-BJ-DJOGBENOU-VMF00067,142,v3.2
3,1244-VO-GH-YAWSON-VMF00051,666,v3.2
4,1245-VO-CI-CONSTANT-VMF00054,38,v3.2


In [None]:
# load sample metadata selecting by release ("v3.2") instead of listing
# sample sets; note this rebinds df_samples, which earlier cells had built
# from an explicit list of five sample sets
df_samples = ag3.sample_metadata(sample_sets="v3.2")
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,release,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,species
0,VBS04793-4651STDY7017929,MB257 (MB2),Tovi Lehmann,Mali,Markabougou,2013,8,13.914,-6.344,F,1177-VO-ML-LEHMANN-VMF00015,v3.2,0.979,0.002,gamb_colu,coluzzii,coluzzii
1,VBS04794-4651STDY7017930,TB615,Tovi Lehmann,Mali,Thierola,2014,9,13.659,-7.215,M,1177-VO-ML-LEHMANN-VMF00015,v3.2,0.971,0.003,gamb_colu,coluzzii,coluzzii
2,VBS04796-4651STDY7017932,TB513 (SB513),Tovi Lehmann,Mali,Siguima,2014,8,14.168,-7.228,M,1177-VO-ML-LEHMANN-VMF00015,v3.2,0.982,0.003,gamb_colu,coluzzii,coluzzii
3,VBS04797-4651STDY7017933,SB173 (SB3),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F,1177-VO-ML-LEHMANN-VMF00015,v3.2,0.982,0.002,gamb_colu,coluzzii,coluzzii
4,VBS04798-4651STDY7017934,SB173 (SB4),Tovi Lehmann,Mali,Siguima,2013,8,14.168,-7.228,F,1177-VO-ML-LEHMANN-VMF00015,v3.2,0.971,0.002,gamb_colu,coluzzii,coluzzii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,VBS20222-5568STDY7801477,WA-0872,Luc Djogbenou,Togo,Baguida,2017,12,6.161,1.314,F,1253-VO-TG-DJOGBENOU-VMF00052,v3.2,0.033,0.002,gamb_colu,gambiae,gambiae
1134,VBS20223-5568STDY7801480,WA-0873,Luc Djogbenou,Togo,Baguida,2017,12,6.161,1.314,F,1253-VO-TG-DJOGBENOU-VMF00052,v3.2,0.038,0.002,gamb_colu,gambiae,gambiae
1135,VBS20224-5568STDY7801481,WA-0874,Luc Djogbenou,Togo,Baguida,2017,12,6.161,1.314,F,1253-VO-TG-DJOGBENOU-VMF00052,v3.2,0.074,0.003,gamb_colu,gambiae,gambiae
1136,VBS20225-5568STDY7801482,WA-0875,Luc Djogbenou,Togo,Baguida,2017,12,6.161,1.314,F,1253-VO-TG-DJOGBENOU-VMF00052,v3.2,0.033,0.002,gamb_colu,gambiae,gambiae


In [None]:
df_merged_metadata = pd.merge(df_pheno, df_samples,  how='left', left_on=['specimen'], right_on = ['partner_sample_id'])
df_merged_metadata.head()

In [None]:
# cohort label -> pandas query string selecting that cohort's samples from
# the sample metadata
cohorts = {
    "gn_madina_2017_gamb": "country == 'Ghana' and location == 'Madina' and year == 2017 and species == 'gambiae'",
    "to_2017_gamb_ba": "country == 'Togo' and year == 2017 and species == 'gambiae'",
    # NOTE(review): the label says "colu" but the query selects gambiae, and
    # it has no location filter - confirm which cohort is intended
    "gn_2017_colu_obu": "country == 'Ghana' and year == 2017 and species == 'gambiae'",
    "gn_2018_colu_kb": "country == 'Ghana' and year == 2018 and species == 'coluzzii'",
    # restricted to Cote d'Ivoire: without a country filter this cohort
    # pooled the 2017 gambiae samples from every country
    "ci_2017_gamb_abo": "country == \"Cote d'Ivoire\" and year == 2017 and species == 'gambiae'",
    # NOTE(review): the label says "gamb" but the query selects coluzzii (the
    # Avrankou samples are coluzzii), and it has no location filter - confirm
    "be_2017_gamb_avr": "country == 'Benin' and year == 2017 and species == 'coluzzii'",
}

In [None]:
df_af = ag3.snp_allele_frequencies(
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"],
    transcript=transcript, 
    cohorts=cohorts, 
)
df_af

[########################################] | 100% Completed |  0.4s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.7s


Unnamed: 0,contig,position,ref_allele,alt_allele,pass_gamb_colu_arab,pass_gamb_colu,pass_arab,gn_madina_2017_gamb,to_2017_gamb_ba,gn_2017_colu_obu,gn_2018_colu_kb,ci_2017_gamb_abo,be_2017_gamb_avr,max_af
0,2R,28480582,G,T,True,True,True,0.0000,0.000000,0.000000,0.179924,0.000000,0.050595,0.179924
1,2R,28480591,A,C,True,True,True,0.0025,0.000000,0.002513,0.000000,0.001477,0.000000,0.002513
2,2R,28480599,G,A,True,True,True,0.0000,0.002793,0.000000,0.000000,0.000739,0.000000,0.002793
3,2R,28480599,G,T,True,True,True,0.0000,0.000000,0.000000,0.000000,0.003693,0.017857,0.017857
4,2R,28480602,T,C,True,True,True,1.0000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,2R,28482626,C,A,True,True,True,0.0025,0.000000,0.003769,0.000000,0.002216,0.000000,0.003769
389,2R,28482626,C,T,True,True,True,0.0000,0.000000,0.000000,0.000000,0.000739,0.000000,0.000739
390,2R,28482628,A,G,False,True,False,0.3125,0.030726,0.275126,0.000000,0.177253,0.211310,0.312500
391,2R,28482636,G,A,True,True,True,0.0050,0.000000,0.006281,0.000000,0.003693,0.000000,0.006281


In [None]:
# attach predicted effects to the observed allele frequencies. With no `on=`
# argument pd.merge joins on every column the two frames share (here contig,
# position, ref_allele, alt_allele and the three pass_* flags), and the
# default inner join keeps only the SNP alleles that are present in df_af
df_snps = pd.merge(df_effects, df_af)
df_snps

Unnamed: 0,contig,position,ref_allele,alt_allele,pass_gamb_colu_arab,pass_gamb_colu,pass_arab,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,gn_madina_2017_gamb,to_2017_gamb_ba,gn_2017_colu_obu,gn_2018_colu_kb,ci_2017_gamb_abo,be_2017_gamb_avr,max_af
0,2R,28480582,G,T,True,True,True,THREE_PRIME_UTR,LOW,,,,,,,0.0000,0.000000,0.000000,0.179924,0.000000,0.050595,0.179924
1,2R,28480591,A,C,True,True,True,THREE_PRIME_UTR,LOW,,,,,,,0.0025,0.000000,0.002513,0.000000,0.001477,0.000000,0.002513
2,2R,28480599,G,A,True,True,True,THREE_PRIME_UTR,LOW,,,,,,,0.0000,0.002793,0.000000,0.000000,0.000739,0.000000,0.002793
3,2R,28480599,G,T,True,True,True,THREE_PRIME_UTR,LOW,,,,,,,0.0000,0.000000,0.000000,0.000000,0.003693,0.017857,0.017857
4,2R,28480602,T,C,True,True,True,THREE_PRIME_UTR,LOW,,,,,,,1.0000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,2R,28482626,C,A,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,,0.0025,0.000000,0.003769,0.000000,0.002216,0.000000,0.003769
389,2R,28482626,C,T,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,,0.0000,0.000000,0.000000,0.000000,0.000739,0.000000,0.000739
390,2R,28482628,A,G,False,True,False,FIVE_PRIME_UTR,LOW,,,,,,,0.3125,0.030726,0.275126,0.000000,0.177253,0.211310,0.312500
391,2R,28482636,G,A,True,True,True,FIVE_PRIME_UTR,LOW,,,,,,,0.0050,0.000000,0.006281,0.000000,0.003693,0.000000,0.006281


In [None]:
df_snps.groupby(['impact', 'effect']).size()

impact    effect               
HIGH      STOP_GAINED                2
LOW       FIVE_PRIME_UTR            17
          SYNONYMOUS_CODING        149
          THREE_PRIME_UTR           70
MODERATE  NON_SYNONYMOUS_CODING    119
          SPLICE_REGION              3
MODIFIER  INTRONIC                  33
dtype: int64

In [None]:
df_snps_filtered = df_snps.query("effect == 'NON_SYNONYMOUS_CODING' and max_af > 0.02")
df_snps_filtered

In [None]:
df_snps_filtered.groupby(['impact', 'effect']).size()

impact    effect               
MODERATE  NON_SYNONYMOUS_CODING    24
dtype: int64

In [None]:
df_effects.groupby(['impact', 'effect']).size()

impact    effect               
HIGH      SPLICE_CORE                12
          START_LOST                  3
          STOP_GAINED               145
          STOP_LOST                   8
LOW       FIVE_PRIME_UTR            219
          SYNONYMOUS_CODING        1088
          THREE_PRIME_UTR          1104
MODERATE  NON_SYNONYMOUS_CODING    3310
          SPLICE_REGION              30
MODIFIER  INTRONIC                  267
dtype: int64

In [None]:
def plot_snps(transcript, data, width=750, height=300, palette='Category10'):
    """Plot alt allele frequencies per cohort above a transcript diagram.

    Parameters
    ----------
    transcript : str
        Transcript ID, used for the title and passed to `plot_transcript`.
    data : pandas DataFrame
        One row per SNP allele. Must have a 'position' column and one
        frequency column per cohort, plus the columns named in the hover
        tooltips.
    width : int, optional
        Plot width passed to bokeh.
    height : int, optional
        Height of the allele frequency panel.
    palette : str, optional
        Name of a palette family in `bokeh.palettes.all_palettes`.

    Notes
    -----
    Cohort names are taken from the keys of the notebook-level `cohorts`
    dict, not from an argument. The two linked figures are displayed with
    `bokeh.plotting.show`; nothing is returned.

    """

    # hover tooltips
    tooltips = [
        ("position", '@contig:@position{,}'),
        ("alleles", '@ref_allele>@alt_allele'),
        ("pass", "@pass_gamb_colu_arab, @pass_gamb_colu, @pass_arab"),
        ("impact", '@impact'),
        ("effect", '@effect'),
        ("aa_change", '@aa_change'),
        ("frequency", '@frequency{%f} (@cohort)'),
    ]

    fig1 = bokeh.plotting.figure(
        title=f'Transcript - {transcript}',
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        plot_width=width,
        plot_height=height,
        tooltips=tooltips,
        toolbar_location="above")

    # set up colors: palette families are dicts keyed by number of colours
    # and do not have an entry for every size, so with fewer cohorts than the
    # smallest entry take that entry and use its first few colours
    palette_sizes = bokeh.palettes.all_palettes[palette]
    n_cohorts = len(cohorts)
    colors = palette_sizes[max(n_cohorts, min(palette_sizes))][:n_cohorts]

    # X coordinate in Mbp is the same for every cohort, so compute it once
    x_mbp = data['position'] / 1e6

    # plot allele frequencies, one glyph series (and data source) per cohort
    for coh, color in zip(cohorts, colors):
        # 'frequency' and 'cohort' exist only to feed the hover tooltip
        df = data.assign(x=x_mbp, frequency=data[coh], cohort=coh)
        fig1.triangle("x", coh,
                      size=8,
                      color=color,
                      source=df,
                      legend_label=coh)

    # tidy up the plot
    fig1.y_range = bokeh.models.Range1d(0, 1)
    fig1.yaxis.axis_label = 'Alt allele frequency'
    fig1.xaxis.visible = False
    # move the legend outside the plot area; clicking an entry hides a cohort
    fig1.add_layout(fig1.legend[0], 'right')
    fig1.legend.click_policy = "hide"

    # plot transcript, sharing the x range so both panels pan/zoom together
    fig2 = plot_transcript(transcript, width=width, height=80, show=False,
                           x_range=fig1.x_range)
    fig2.toolbar.logo = None
    fig2.toolbar_location = None
    fig2.title = None

    bokeh.plotting.show(bokeh.layouts.column(fig1, fig2))

In [None]:
plot_snps(transcript, data=df_snps_filtered)

In [None]:
plot_snps(transcript, data=df_snps_filtered)

In [None]:
plot_snps(transcript, data=df_snps_filtered)

In [None]:
def hash_params(*args, **kwargs):
    """Return an MD5 hex digest that identifies a set of analysis parameters.

    The positional and keyword arguments are serialised to JSON with sorted
    keys, so the digest does not depend on the order in which keyword
    arguments are given. All arguments must be JSON-serialisable.
    """
    payload = dict(args=args, kwargs=kwargs)
    serialised = json.dumps(payload, sort_keys=True)
    return hashlib.md5(serialised.encode()).hexdigest()


def run_pca(
    contig, 
    region_start=None, 
    region_stop=None,
    sample_sets="v3.2",
    sample_query=None,
    site_mask="gamb_colu_arab",
    min_minor_ac=3,
    max_an_missing=0,
    n_snps=100_000,
    snp_offset=0,
    n_components=10):
    """Main function to run a PCA.

    Results are cached: they are saved under the notebook-level
    `results_dir`, keyed by a hash of all parameters, and a later call with
    the same parameters loads the saved files instead of recomputing.
    
    Parameters
    ----------
    contig : str
        Chromosome arm, e.g., '3L'.
    region_start : int, optional
        Start position of contig region to use.
    region_stop : int, optional
        Stop position of contig region to use.
    sample_sets : str or list of str, optional
        Sample sets to analyse.
    sample_query : str, optional
        A pandas query string to select specific samples.
    site_mask : {'gamb_colu_arab', 'gamb_colu', 'arab'}
        Which site mask to apply.
    min_minor_ac : int
        Minimum minor allele count.
    max_an_missing : int
        Maximum number of missing allele calls.
    n_snps : int
        Approximate number of SNPs to use. If fewer segregating sites are
        available, all of them (from `snp_offset` onwards) are used.
    snp_offset : int
        Offset when thinning SNPs.
    n_components : int
        Number of PCA components to retain.

    Returns
    -------
    data : pandas DataFrame
        Data frame with one row per sample, including columns "PC1", "PC2", etc.
    evr : numpy array
        Explained variance ratio per principal component.

    Raises
    ------
    ValueError
        If no sites pass the segregating-site filters.
    
    """
    
    # construct a key to save the results under
    results_key = hash_params(
        contig=contig,
        region_start=region_start, 
        region_stop=region_stop,
        sample_sets=sample_sets,
        sample_query=sample_query,
        site_mask=site_mask,
        min_minor_ac=min_minor_ac,
        max_an_missing=max_an_missing,
        n_snps=n_snps,
        snp_offset=snp_offset,
        n_components=n_components
    )

    # define paths for results files
    data_path = f'{results_dir}/{results_key}-data.csv'
    evr_path = f'{results_dir}/{results_key}-evr.npy'

    try:
        # try to load previously generated results
        data = pd.read_csv(data_path)
        evr = np.load(evr_path)
        return data, evr
    except FileNotFoundError:
        # no previous results available, need to run analysis
        print(f'running analysis: {results_key}')
    
    print('setting up inputs')

    # load sample metadata
    df_samples = ag3.sample_metadata(sample_sets=sample_sets)
    # access SNP genotypes
    gt = ag3.snp_genotypes(contig=contig, sample_sets=sample_sets, site_mask=site_mask)

    if region_start or region_stop:
        # locate region within contig; an omitted bound leaves that side of
        # the slice open
        pos = ag3.snp_sites(contig=contig, field='POS', site_mask=site_mask).compute()
        loc_region = slice(
            bisect.bisect_left(pos, region_start) if region_start else None,
            bisect.bisect_right(pos, region_stop) if region_stop else None,
        )
        gt = gt[loc_region]
    
    if sample_query:
        # locate selected samples
        loc_samples = df_samples.eval(sample_query).values
        df_samples = df_samples.loc[loc_samples, :]
        gt = da.compress(loc_samples, gt, axis=1)
        
    print('locating segregating sites within desired frequency range')

    # perform allele count
    ac = allel.GenotypeDaskArray(gt).count_alleles(max_allele=3).compute()
    
    # calculate some convenience variables; two allele calls per sample
    n_chroms = gt.shape[1] * 2
    an_called = ac.sum(axis=1)
    an_missing = n_chroms - an_called
    min_ref_ac = min_minor_ac
    max_ref_ac = n_chroms - min_minor_ac

    # here we choose biallelic sites involving the reference allele
    loc_seg = np.nonzero(ac.is_biallelic() & 
                         (ac[:, 0] >= min_ref_ac) & 
                         (ac[:, 0] <= max_ref_ac) & 
                         (an_missing <= max_an_missing))[0]

    n_seg = loc_seg.shape[0]
    if n_seg == 0:
        raise ValueError(
            'no segregating sites found with the given region, samples and filters'
        )
    
    print('preparing PCA input data')

    # thin SNPs to approximately the desired number; the step must be at
    # least 1, otherwise having fewer than n_snps sites gives a step of 0
    # and the slice below fails with "slice step cannot be zero"
    snp_step = max(1, n_seg // n_snps)
    loc_seg_ds = loc_seg[snp_offset::snp_step]

    # subset genotypes to selected sites
    gt_seg = da.take(gt, loc_seg_ds, axis=0)
    
    # convert to genotype alt counts
    gn_seg = allel.GenotypeDaskArray(gt_seg).to_n_alt().compute()
    
    # remove any edge-case variants where all genotypes are identical
    loc_var = np.any(gn_seg != gn_seg[:, 0, np.newaxis], axis=1)
    gn_var = np.compress(loc_var, gn_seg, axis=0)

    print('running PCA')

    # run the PCA
    coords, model = allel.pca(gn_var, n_components=n_components)
    
    # add PCs to dataframe
    data = df_samples.copy()
    for i in range(n_components):
        data[f'PC{i+1}'] = coords[:, i]
    
    # save results
    evr = model.explained_variance_ratio_
    data.to_csv(data_path, index=False)
    np.save(evr_path, evr)
    print(f'saved results: {results_key}')
    
    return data, evr
    

In [None]:
def plot_variance(evr, **kwargs):
    """Show a bar chart of the variance explained by each principal component.

    Parameters
    ----------
    evr
        Explained variance ratios, one value per component, e.g. the second
        value returned by `run_pca`. It is multiplied by 100 to obtain
        percentages, so it must support element-wise multiplication.
    **kwargs
        Extra keyword arguments forwarded to `px.bar`. They take precedence
        over the defaults chosen here (labels, template, width, height).

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    # percentages on the y axis, 1-based component numbers (as strings)
    # on the x axis
    pct_explained = evr * 100
    component_labels = [
        str(rank) for rank in range(1, len(pct_explained) + 1)
    ]

    # defaults first, caller-supplied overrides last
    bar_options = {
        'labels': {
            'x': 'Principal component',
            'y': 'Explained variance (%)',
        },
        'template': 'simple_white',
        'width': 600,
        'height': 400,
        **kwargs,
    }

    px.bar(x=component_labels, y=pct_explained, **bar_options).show()
    

In [None]:
def jitter(a, f):
    """Return `a` with uniform random noise added, scaled by its value range.

    With ``r = a.max() - a.min()``, one noise value per element is drawn
    uniformly from ``[-r, r)`` using NumPy's global random state, multiplied
    by `f` and added to `a`. `a` must provide ``max()``, ``min()`` and
    ``shape`` (e.g. a NumPy array or a pandas Series).
    """
    value_range = a.max() - a.min()
    noise = np.random.uniform(low=-value_range, high=value_range, size=a.shape)
    return a + f * noise


def plot_coords(
    data,
    x='PC1',
    y='PC2',
    jitter_frac=0.02,
    random_seed=42,
    **kwargs,
):
    """Show a 2D scatter plot of samples on two principal components.

    The function works on a copy of `data`, so the caller's frame is not
    modified.

    Parameters
    ----------
    data
        Data frame with one row per sample. It must contain the columns
        named by `x` and `y`, plus 'country' and 'location'. Unless
        overridden via `kwargs`, it must also contain the hover columns
        listed below.
    x, y
        Names of the columns plotted on the horizontal and vertical axes.
    jitter_frac
        Passed to `jitter` as the noise fraction for each axis. A falsy
        value disables jittering.
    random_seed
        Passed to `np.random.seed` before jittering so that repeated calls
        give the same picture. Note that this reseeds NumPy's global random
        state.
    **kwargs
        Extra keyword arguments forwarded to `px.scatter`. They take
        precedence over the defaults chosen here.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    frame = data.copy()

    # jitter helps to spread out points when they are tightly clustered
    if jitter_frac:
        np.random.seed(random_seed)
        for axis_column in (x, y):
            frame[axis_column] = jitter(frame[axis_column], jitter_frac)

    # combined label, available e.g. as a colour key
    frame['country_location'] = frame['country'] + ' - ' + frame['location']
    # constant column: a hack that lets size/size_max control marker size
    frame['size'] = 1

    # columns shown on hover (left disabled in the original: insecticide,
    # phenotype, exposure_time, concentration)
    hover_columns = [
        'partner_sample_id',
        'sample_set',
        'species',
        'country',
        'location',
        'year',
    ]

    # defaults first, caller-supplied overrides last
    scatter_options = {
        'width': 700,
        'height': 500,
        'template': 'simple_white',
        'hover_name': 'sample_id',
        'hover_data': hover_columns,
        'size': 'size',
        'size_max': 8,
        'opacity': 0.9,
        'render_mode': 'svg',
        **kwargs,
    }

    px.scatter(frame, x=x, y=y, **scatter_options).show()


def plot_coords_3d(
    data,
    x='PC1',
    y='PC2',
    z='PC3',
    jitter_frac=0.02,
    random_seed=42,
    **kwargs,
):
    """Show a 3D scatter plot of samples on three principal components.

    The function works on a copy of `data`, so the caller's frame is not
    modified.

    Parameters
    ----------
    data
        Data frame with one row per sample. It must contain the columns
        named by `x`, `y` and `z`, plus 'country' and 'location'. Unless
        overridden via `kwargs`, it must also contain the hover columns
        listed below.
    x, y, z
        Names of the columns plotted on the three axes.
    jitter_frac
        Passed to `jitter` as the noise fraction for each axis. A falsy
        value disables jittering.
    random_seed
        Passed to `np.random.seed` before jittering so that repeated calls
        give the same picture. Note that this reseeds NumPy's global random
        state.
    **kwargs
        Extra keyword arguments forwarded to `px.scatter_3d`. They take
        precedence over the defaults chosen here.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    frame = data.copy()

    # jitter helps to spread out points when they are tightly clustered
    if jitter_frac:
        np.random.seed(random_seed)
        for axis_column in (x, y, z):
            frame[axis_column] = jitter(frame[axis_column], jitter_frac)

    # combined label, available e.g. as a colour key
    frame['country_location'] = frame['country'] + ' - ' + frame['location']

    # columns shown on hover (left disabled in the original: insecticide,
    # phenotype, exposure_time, concentration)
    hover_columns = [
        'partner_sample_id',
        'sample_set',
        'species',
        'country',
        'location',
        'year',
    ]

    # defaults first, caller-supplied overrides last
    scatter_options = {
        'width': 700,
        'height': 500,
        'hover_name': 'sample_id',
        'hover_data': hover_columns,
        **kwargs,
    }

    px.scatter_3d(frame, x=x, y=y, z=z, **scatter_options).show()

In [None]:
# fixed colour per species label: the i-th label below gets the i-th colour
# of plotly's default qualitative palette
species_palette = px.colors.qualitative.Plotly
species_color_map = {
    species_label: species_palette[palette_index]
    for palette_index, species_label in enumerate([
        'gambiae',
        'coluzzii',
        'arabiensis',
        'intermediate_gambiae_coluzzii',
        'intermediate_arabiensis_gambiae',
    ])
}

In [None]:
# PCA on chromosome arm 2R for a single sample set; only contig and
# sample_sets are passed, every other run_pca argument keeps its default
data, evr = run_pca(
    contig='2R', 
    sample_sets="1244-VO-GH-YAWSON-VMF00051",
)

In [None]:
# display the explained variance ratios returned by run_pca
evr

In [None]:
# PCA on chromosome arm 2L for the same sample set, followed by a bar chart
# of the variance explained per component; `title` is reused by the
# scatter plot in the next cell
title = '1244-VO-GH-YAWSON-VMF00051 (2L)'
data, evr = run_pca(
    contig='2L', 
    sample_sets="1244-VO-GH-YAWSON-VMF00051",
    )
plot_variance(evr, title=title)

In [None]:
# PC1 vs PC2 for the current `data`, coloured by sampling location
# (a comma was missing after color='location', which made this cell a
# SyntaxError)
plot_coords(data, x='PC1', y='PC2',
            color='location',
            title=title)

In [None]:
# PCA on the X chromosome; no sample_query is passed in this run (a later
# cell repeats the run with a sample_query)
title = '1244-VO-GH-YAWSON-VMF00051 (X)'
data, evr = run_pca(
    contig='X', 
    sample_sets="1244-VO-GH-YAWSON-VMF00051",
)
plot_variance(evr, title=title)

In [None]:
# PC1 vs PC2 for the current `data`, coloured by sampling location
# (species_color_map is keyed by species, so it is not used here)
plot_coords(data, x='PC1', y='PC2',
            color='location', 
            title=title)

In [None]:
# repeat the X chromosome PCA on a subset of samples: rows where species is
# 'gambiae' and location is 'Madina' or 'Obuasi'
# NOTE(review): the title is identical to the one used for the X run without
# a sample_query, so the two sets of plots cannot be told apart by title
title = '1244-VO-GH-YAWSON-VMF00051 (X)'
data, evr = run_pca(
    contig='X', 
    sample_sets="1244-VO-GH-YAWSON-VMF00051",
    sample_query=(
        "species == 'gambiae' and "
        "location in ['Madina', 'Obuasi']"
    )
)
plot_variance(evr, title=title)

In [None]:
# 3D view of the first three components, coloured by sampling location;
# jitter_frac is raised from the function default of 0.02 to 0.05
plot_coords_3d(data, x='PC1', y='PC2', z='PC3', 
               color='location', 
               title=title,
               jitter_frac=0.05)

In [None]:
# 2D view (PC1 vs PC2) of the current `data`, coloured by sampling location;
# `title` is re-assigned so this cell does not rely on an earlier one for it
title = '1244-VO-GH-YAWSON-VMF00051 (X)'
plot_coords(data, x='PC1', y='PC2',
            color='location',
            title=title)

In [None]:
# variance explained per component for the current `evr`
plot_variance(evr, title=title)

In [None]:
# PCA on chromosome arm 3L, then the variance bar chart and the PC1 vs PC2
# scatter (coloured by sampling location) in one cell
title = '1244-VO-GH-YAWSON-VMF00051  (3L)'
data, evr = run_pca(
    contig='3L', 
    sample_sets="1244-VO-GH-YAWSON-VMF00051",
)
plot_variance(evr, title=title)
plot_coords(data, x='PC1', y='PC2',
            color='location', 
            title=title)

In [None]:
# sample metadata for the two 1237-VO-BJ-DJOGBENOU sample sets
df_samples = ag3.sample_metadata(
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067"]
)
# preview the first 100 rows
df_samples.head(100)

In [None]:
# sample metadata for six sample sets; this replaces the `df_samples`
# assigned in the previous cell
df_samples = ag3.sample_metadata(
    sample_sets=["1177-VO-ML-LEHMANN-VMF00015", "1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"]
)
df_samples.head()

In [None]:
# number of samples per (country, location, year, species, sample_set)
df_samples.groupby(['country', 'location', 'year', 'species', 'sample_set' ]).size()

In [None]:
# load genotype calls for 3L from four sample sets, then wrap them in
# scikit-allel's GenotypeDaskArray; `gt` is rebound to the wrapper
gt = ag3.snp_genotypes(contig="3L", sample_sets=["1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052", "1237-VO-BJ-DJOGBENOU-VMF00050"])
gt = allel.GenotypeDaskArray(gt)
gt

[########################################] | 100% Completed |  2.2s
[########################################] | 100% Completed |  1.8s


Unnamed: 0,0,1,2,3,4,...,968,969,970,971,972,Unnamed: 12
0,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
2,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
40758470,./.,./.,./.,./.,./.,...,0/0,./.,./.,./.,./.,
40758471,./.,./.,./.,./.,./.,...,0/0,./.,./.,./.,./.,
40758472,./.,./.,./.,./.,./.,...,0/0,./.,./.,./.,./.,


In [None]:
# SNP calls for 3L with sample_sets="v3.2"
# NOTE(review): "v3.2" is presumably a release identifier that expands to
# all of its sample sets (the saved output shows 1138 samples) - confirm
ds_snps = ag3.snp_calls(contig="3L", sample_sets="v3.2")
ds_snps

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,134.22 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 163.03 MB 134.22 MB Shape (40758473,) (33554432,) Count 2 Tasks 2 Chunks Type int32 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,134.22 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 40.76 MB 33.55 MB Shape (40758473,) (33554432,) Count 2 Tasks 2 Chunks Type uint8 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,27.31 kB,15.98 kB
Shape,"(1138,)","(666,)"
Count,12 Tasks,6 Chunks
Type,|S24,numpy.ndarray
"Array Chunk Bytes 27.31 kB 15.98 kB Shape (1138,) (666,) Count 12 Tasks 6 Chunks Type |S24 numpy.ndarray",1138  1,

Unnamed: 0,Array,Chunk
Bytes,27.31 kB,15.98 kB
Shape,"(1138,)","(666,)"
Count,12 Tasks,6 Chunks
Type,|S24,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,122.28 MB
Shape,"(40758473, 4)","(40758473, 3)"
Count,5 Tasks,2 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 163.03 MB 122.28 MB Shape (40758473, 4) (40758473, 3) Count 5 Tasks 2 Chunks Type |S1 numpy.ndarray",4  40758473,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,122.28 MB
Shape,"(40758473, 4)","(40758473, 3)"
Count,5 Tasks,2 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,3978 Tasks,1232 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.77 GB 120.00 MB Shape (40758473, 1138, 2) (600000, 100, 2) Count 3978 Tasks 1232 Chunks Type int8 numpy.ndarray",2  1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,3978 Tasks,1232 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,46.38 GB,85.20 MB
Shape,"(40758473, 1138)","(600000, 142)"
Count,4192 Tasks,1274 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 46.38 GB 85.20 MB Shape (40758473, 1138) (600000, 142) Count 4192 Tasks 1274 Chunks Type int8 numpy.ndarray",1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,46.38 GB,85.20 MB
Shape,"(40758473, 1138)","(600000, 142)"
Count,4192 Tasks,1274 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,185.53 GB,60.00 MB
Shape,"(40758473, 1138)","(300000, 50)"
Count,7174 Tasks,3400 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 185.53 GB 60.00 MB Shape (40758473, 1138) (300000, 50) Count 7174 Tasks 3400 Chunks Type float32 numpy.ndarray",1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,185.53 GB,60.00 MB
Shape,"(40758473, 1138)","(300000, 50)"
Count,7174 Tasks,3400 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,371.07 GB,120.00 MB
Shape,"(40758473, 1138, 4)","(300000, 50, 4)"
Count,7004 Tasks,3400 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 371.07 GB 120.00 MB Shape (40758473, 1138, 4) (300000, 50, 4) Count 7004 Tasks 3400 Chunks Type int16 numpy.ndarray",4  1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,371.07 GB,120.00 MB
Shape,"(40758473, 1138, 4)","(300000, 50, 4)"
Count,7004 Tasks,3400 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,5210 Tasks,1232 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.77 GB 120.00 MB Shape (40758473, 1138, 2) (600000, 100, 2) Count 5210 Tasks 1232 Chunks Type bool numpy.ndarray",2  1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,5210 Tasks,1232 Chunks
Type,bool,numpy.ndarray


In [None]:
# SNP calls for 3L restricted to four sample sets; replaces the `ds_snps`
# assigned in the previous cell
ds_snps = ag3.snp_calls(contig="3L", sample_sets=["1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052", "1237-VO-BJ-DJOGBENOU-VMF00050"])
ds_snps

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,134.22 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 163.03 MB 134.22 MB Shape (40758473,) (33554432,) Count 2 Tasks 2 Chunks Type int32 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,134.22 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 40.76 MB 33.55 MB Shape (40758473,) (33554432,) Count 2 Tasks 2 Chunks Type uint8 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,23.35 kB,15.98 kB
Shape,"(973,)","(666,)"
Count,8 Tasks,4 Chunks
Type,|S24,numpy.ndarray
"Array Chunk Bytes 23.35 kB 15.98 kB Shape (973,) (666,) Count 8 Tasks 4 Chunks Type |S24 numpy.ndarray",973  1,

Unnamed: 0,Array,Chunk
Bytes,23.35 kB,15.98 kB
Shape,"(973,)","(666,)"
Count,8 Tasks,4 Chunks
Type,|S24,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,122.28 MB
Shape,"(40758473, 4)","(40758473, 3)"
Count,5 Tasks,2 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 163.03 MB 122.28 MB Shape (40758473, 4) (40758473, 3) Count 5 Tasks 2 Chunks Type |S1 numpy.ndarray",4  40758473,

Unnamed: 0,Array,Chunk
Bytes,163.03 MB,122.28 MB
Shape,"(40758473, 4)","(40758473, 3)"
Count,5 Tasks,2 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 40.76 MB 40.76 MB Shape (40758473,) (40758473,) Count 1 Tasks 1 Chunks Type bool numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,40.76 MB
Shape,"(40758473,)","(40758473,)"
Count,1 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,79.32 GB,120.00 MB
Shape,"(40758473, 973, 2)","(600000, 100, 2)"
Count,2873 Tasks,902 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 79.32 GB 120.00 MB Shape (40758473, 973, 2) (600000, 100, 2) Count 2873 Tasks 902 Chunks Type int8 numpy.ndarray",2  973  40758473,

Unnamed: 0,Array,Chunk
Bytes,79.32 GB,120.00 MB
Shape,"(40758473, 973, 2)","(600000, 100, 2)"
Count,2873 Tasks,902 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.66 GB,60.00 MB
Shape,"(40758473, 973)","(600000, 100)"
Count,2543 Tasks,814 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 39.66 GB 60.00 MB Shape (40758473, 973) (600000, 100) Count 2543 Tasks 814 Chunks Type int8 numpy.ndarray",973  40758473,

Unnamed: 0,Array,Chunk
Bytes,39.66 GB,60.00 MB
Shape,"(40758473, 973)","(600000, 100)"
Count,2543 Tasks,814 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.63 GB,60.00 MB
Shape,"(40758473, 973)","(300000, 50)"
Count,5916 Tasks,2856 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 158.63 GB 60.00 MB Shape (40758473, 973) (300000, 50) Count 5916 Tasks 2856 Chunks Type float32 numpy.ndarray",973  40758473,

Unnamed: 0,Array,Chunk
Bytes,158.63 GB,60.00 MB
Shape,"(40758473, 973)","(300000, 50)"
Count,5916 Tasks,2856 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,317.26 GB,120.00 MB
Shape,"(40758473, 973, 4)","(300000, 50, 4)"
Count,5712 Tasks,2856 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 317.26 GB 120.00 MB Shape (40758473, 973, 4) (300000, 50, 4) Count 5712 Tasks 2856 Chunks Type int16 numpy.ndarray",4  973  40758473,

Unnamed: 0,Array,Chunk
Bytes,317.26 GB,120.00 MB
Shape,"(40758473, 973, 4)","(300000, 50, 4)"
Count,5712 Tasks,2856 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,79.32 GB,120.00 MB
Shape,"(40758473, 973, 2)","(600000, 100, 2)"
Count,3775 Tasks,902 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 79.32 GB 120.00 MB Shape (40758473, 973, 2) (600000, 100, 2) Count 3775 Tasks 902 Chunks Type bool numpy.ndarray",2  973  40758473,

Unnamed: 0,Array,Chunk
Bytes,79.32 GB,120.00 MB
Shape,"(40758473, 973, 2)","(600000, 100, 2)"
Count,3775 Tasks,902 Chunks
Type,bool,numpy.ndarray


In [None]:
# 'codon_nonsyn' site annotation for 3L
# NOTE(review): this rebinds `ds_snps`, which until now held the result of
# ag3.snp_calls, to a single annotation array (see the output below);
# anything that still expects the SNP calls must re-run the cell above
ds_snps = ag3.site_annotations(contig='3L', field='codon_nonsyn')
ds_snps

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,10 Tasks,2 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 40.76 MB 33.55 MB Shape (40758473,) (33554432,) Count 10 Tasks 2 Chunks Type uint8 numpy.ndarray",40758473  1,

Unnamed: 0,Array,Chunk
Bytes,40.76 MB,33.55 MB
Shape,"(40758473,)","(33554432,)"
Count,10 Tasks,2 Chunks
Type,uint8,numpy.ndarray


In [None]:
# Count the sites on one chromosome arm that both pass the chosen site
# filter and are segregating in the chosen sample sets.

# choose chromosome arm
contig = "3L"

# choose site filter mask
mask = "gamb_colu_arab"

# choose sample sets
sample_sets = ["1244-VO-GH-YAWSON-VMF00051"]

# locate pass sites (computed eagerly)
loc_pass = ag3.site_filters(contig=contig, mask=mask).compute()

# set up an allele count over genotypes (lazy until .compute() below)
gt = ag3.snp_genotypes(contig=contig, sample_sets=sample_sets)
gt = allel.GenotypeDaskArray(gt)
ac = gt.count_alleles(max_allele=3)

# locate segregating sites
loc_seg = ac.is_segregating()

# count segregating and pass sites
n_pass_seg = da.count_nonzero(loc_pass & loc_seg)

# run the computation; a ProgressBar is already registered globally near the
# top of the notebook, so wrapping this in another `with ProgressBar():`
# only printed every progress line twice
n_pass_seg = n_pass_seg.compute()

n_pass_seg

[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  1min 28.2s
[########################################] | 100% Completed |  1min 28.3s


8329950

In [None]:
# genotype calls for 3R from six sample sets; nothing is computed here,
# the cell only displays the array summary
gt = ag3.snp_genotypes(
    contig="3R", 
    sample_sets=["1177-VO-ML-LEHMANN-VMF00015", "1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"]
)
gt

Unnamed: 0,Array,Chunk
Bytes,118.87 GB,120.00 MB
Shape,"(52226568, 1138, 2)","(600000, 100, 2)"
Count,5101 Tasks,1582 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 118.87 GB 120.00 MB Shape (52226568, 1138, 2) (600000, 100, 2) Count 5101 Tasks 1582 Chunks Type int8 numpy.ndarray",2  1138  52226568,

Unnamed: 0,Array,Chunk
Bytes,118.87 GB,120.00 MB
Shape,"(52226568, 1138, 2)","(600000, 100, 2)"
Count,5101 Tasks,1582 Chunks
Type,int8,numpy.ndarray


In [None]:
# genotype calls for 3L from the same six sample sets; rebinds `gt`
gt = ag3.snp_genotypes(
    contig="3L", 
    sample_sets=["1177-VO-ML-LEHMANN-VMF00015", "1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067", "1244-VO-GH-YAWSON-VMF00051", "1245-VO-CI-CONSTANT-VMF00054", "1253-VO-TG-DJOGBENOU-VMF00052"]
)
gt

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,3978 Tasks,1232 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.77 GB 120.00 MB Shape (40758473, 1138, 2) (600000, 100, 2) Count 3978 Tasks 1232 Chunks Type int8 numpy.ndarray",2  1138  40758473,

Unnamed: 0,Array,Chunk
Bytes,92.77 GB,120.00 MB
Shape,"(40758473, 1138, 2)","(600000, 100, 2)"
Count,3978 Tasks,1232 Chunks
Type,int8,numpy.ndarray


In [None]:
# list the metadata columns available in df_samples
df_samples.columns

Index(['sample_id', 'partner_sample_id', 'contributor', 'country', 'location',
       'year', 'month', 'latitude', 'longitude', 'sex_call', 'sample_set',
       'release', 'aim_fraction_colu', 'aim_fraction_arab',
       'species_gambcolu_arabiensis', 'species_gambiae_coluzzii', 'species'],
      dtype='object')

In [None]:
# genotype calls for the X chromosome from the two 1237-VO-BJ-DJOGBENOU
# sample sets; rebinds `gt`
gt = ag3.snp_genotypes(
    contig="X", 
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067"]
)
gt

Unnamed: 0,Array,Chunk
Bytes,10.85 GB,120.00 MB
Shape,"(23385349, 232, 2)","(600000, 100, 2)"
Count,234 Tasks,117 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 10.85 GB 120.00 MB Shape (23385349, 232, 2) (600000, 100, 2) Count 234 Tasks 117 Chunks Type int8 numpy.ndarray",2  232  23385349,

Unnamed: 0,Array,Chunk
Bytes,10.85 GB,120.00 MB
Shape,"(23385349, 232, 2)","(600000, 100, 2)"
Count,234 Tasks,117 Chunks
Type,int8,numpy.ndarray


In [None]:
# genotype calls for 2L from the two 1237-VO-BJ-DJOGBENOU sample sets;
# rebinds `gt`
gt = ag3.snp_genotypes(
    contig="2L", 
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067"]
)
gt

Unnamed: 0,Array,Chunk
Bytes,22.52 GB,120.00 MB
Shape,"(48525747, 232, 2)","(600000, 100, 2)"
Count,486 Tasks,243 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 22.52 GB 120.00 MB Shape (48525747, 232, 2) (600000, 100, 2) Count 486 Tasks 243 Chunks Type int8 numpy.ndarray",2  232  48525747,

Unnamed: 0,Array,Chunk
Bytes,22.52 GB,120.00 MB
Shape,"(48525747, 232, 2)","(600000, 100, 2)"
Count,486 Tasks,243 Chunks
Type,int8,numpy.ndarray


In [None]:
# genotype calls for 2R from the two 1237-VO-BJ-DJOGBENOU sample sets;
# rebinds `gt`
gt = ag3.snp_genotypes(
    contig="2R", 
    sample_sets=["1237-VO-BJ-DJOGBENOU-VMF00050", "1237-VO-BJ-DJOGBENOU-VMF00067"]
)
gt

Unnamed: 0,Array,Chunk
Bytes,27.90 GB,120.00 MB
Shape,"(60132453, 232, 2)","(600000, 100, 2)"
Count,606 Tasks,303 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 27.90 GB 120.00 MB Shape (60132453, 232, 2) (600000, 100, 2) Count 606 Tasks 303 Chunks Type int8 numpy.ndarray",2  232  60132453,

Unnamed: 0,Array,Chunk
Bytes,27.90 GB,120.00 MB
Shape,"(60132453, 232, 2)","(600000, 100, 2)"
Count,606 Tasks,303 Chunks
Type,int8,numpy.ndarray


In [None]:
# check that the metadata has one row per genotype column
# NOTE(review): the saved output is False. When the cells are run in order,
# `df_samples` was last loaded for six sample sets while `gt` was last loaded
# for two, so the two objects do not describe the same samples.
len(df_samples) == gt.shape[1]

False

In [None]:
# compute a small corner of the genotype array (first 5 entries along axis 0,
# first 3 along axis 1, everything along the last axis)
g = gt[:5, :3, :].compute()
g

[########################################] | 100% Completed |  0.6s


array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [None]:
# wrap the current `gt` in scikit-allel's GenotypeDaskArray; displaying the
# wrapper renders the genotype calls as a table
gtw = allel.GenotypeDaskArray(gt)
gtw

[########################################] | 100% Completed |  1.5s
[########################################] | 100% Completed |  1.2s


Unnamed: 0,0,1,2,3,4,...,227,228,229,230,231,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
60132450,0/0,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
60132451,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
60132452,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,


In [None]:
# SNP site data for 3R, unpacked as positions, reference alleles and
# alternate alleles
pos, ref, alt = ag3.snp_sites(contig="3R")
pos

Unnamed: 0,Array,Chunk
Bytes,208.91 MB,134.22 MB
Shape,"(52226568,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 208.91 MB 134.22 MB Shape (52226568,) (33554432,) Count 2 Tasks 2 Chunks Type int32 numpy.ndarray",52226568  1,

Unnamed: 0,Array,Chunk
Bytes,208.91 MB,134.22 MB
Shape,"(52226568,)","(33554432,)"
Count,2 Tasks,2 Chunks
Type,int32,numpy.ndarray


In [None]:
# array summary for the 3R reference alleles
ref

Unnamed: 0,Array,Chunk
Bytes,52.23 MB,52.23 MB
Shape,"(52226568,)","(52226568,)"
Count,1 Tasks,1 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 52.23 MB 52.23 MB Shape (52226568,) (52226568,) Count 1 Tasks 1 Chunks Type |S1 numpy.ndarray",52226568  1,

Unnamed: 0,Array,Chunk
Bytes,52.23 MB,52.23 MB
Shape,"(52226568,)","(52226568,)"
Count,1 Tasks,1 Chunks
Type,|S1,numpy.ndarray


In [None]:
# array summary for the 3R alternate alleles
alt

Unnamed: 0,Array,Chunk
Bytes,156.68 MB,133.69 MB
Shape,"(52226568, 3)","(44564480, 3)"
Count,2 Tasks,2 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 156.68 MB 133.69 MB Shape (52226568, 3) (44564480, 3) Count 2 Tasks 2 Chunks Type |S1 numpy.ndarray",3  52226568,

Unnamed: 0,Array,Chunk
Bytes,156.68 MB,133.69 MB
Shape,"(52226568, 3)","(44564480, 3)"
Count,2 Tasks,2 Chunks
Type,|S1,numpy.ndarray


In [None]:
# read the first 10 SNP positions into a numpy array (triggers a compute)
p = pos[:10].compute()
p

[########################################] | 100% Completed |  0.7s


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int32)

In [None]:
# read the first 10 SNP reference alleles into a numpy array
r = ref[:10].compute()
r

[########################################] | 100% Completed |  0.6s


array([b'C', b'C', b'T', b'C', b'T', b'A', b'C', b'G', b'T', b'T'],
      dtype='|S1')

In [None]:
# read the first 10 rows of SNP alternate alleles into a numpy array
a = alt[:10].compute()
a

[########################################] | 100% Completed |  0.7s


array([[b'A', b'T', b'G'],
       [b'A', b'T', b'G'],
       [b'A', b'C', b'G'],
       [b'A', b'T', b'G'],
       [b'A', b'C', b'G'],
       [b'C', b'T', b'G'],
       [b'A', b'T', b'G'],
       [b'A', b'C', b'T'],
       [b'A', b'C', b'G'],
       [b'A', b'C', b'G']], dtype='|S1')

In [None]:
# genotype calls for 3R with sample_sets="v3.2"; rebinds `gt`
# NOTE(review): "v3.2" is presumably a release identifier that expands to
# all of its sample sets (the saved output shows 1138 samples) - confirm
gt = ag3.snp_genotypes(contig="3R", sample_sets="v3.2")
gt

Unnamed: 0,Array,Chunk
Bytes,118.87 GB,120.00 MB
Shape,"(52226568, 1138, 2)","(600000, 100, 2)"
Count,5101 Tasks,1582 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 118.87 GB 120.00 MB Shape (52226568, 1138, 2) (600000, 100, 2) Count 5101 Tasks 1582 Chunks Type int8 numpy.ndarray",2  1138  52226568,

Unnamed: 0,Array,Chunk
Bytes,118.87 GB,120.00 MB
Shape,"(52226568, 1138, 2)","(600000, 100, 2)"
Count,5101 Tasks,1582 Chunks
Type,int8,numpy.ndarray
