In [6]:
%run imports_20150407.ipynb ##run alistair modules
%run utils.ipynb

In [7]:
import gcsfs #module for google cloud connection
import os
import allel
import zarr
import pandas as pd
import h5py
import petl as etl
import petlx.bio
import dask
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import pyfasta
import random
import itertools
import csv
import getpass
import cython
import sys
import gc
import datetime
import humanize
from humanize import naturalsize, intcomma, intword


def log(*msg):
    """Write the given values to stdout on one line and flush immediately.

    Each value is converted with ``str`` and the pieces are separated by a
    single space; calling with no arguments emits an empty line.
    """
    # print() already stringifies and space-joins its positional arguments.
    print(*msg, file=sys.stdout, flush=True)
    
from contextlib import contextmanager

@contextmanager
def timer(*msg):
    """Context manager that reports how long the enclosed block took.

    Parameters
    ----------
    *msg
        Optional label parts. Each is converted with ``str`` and the parts
        are joined with ``', '`` in front of the timing text. With no label
        parts, only the timing text is printed.

    On normal exit ``"<label>, done in <duration>"`` is printed to stdout.
    If the block raises, ``"<label>, errored after <duration>"`` is printed
    to stderr and the original exception is re-raised unchanged.
    """
    started = datetime.datetime.now()

    def report(outcome, stream):
        # The duration text comes from humanize.naturaldelta applied to the
        # elapsed wall-clock seconds since the context was entered.
        seconds = (datetime.datetime.now() - started).total_seconds()
        summary = '%s %s' % (outcome, humanize.naturaldelta(seconds))
        pieces = [str(part) for part in msg]
        pieces.append(summary)
        print(', '.join(pieces), file=stream)
        stream.flush()

    try:
        yield
    except BaseException:
        # Everything is caught only so the failure can be logged; the
        # exception is always propagated to the caller afterwards.
        report('errored after', sys.stderr)
        raise
    else:
        report('done in', sys.stdout)

In [6]:
#gcs_orig = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cache')
#gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token=gcs_orig.session.credentials)
#gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cloud')

In [None]:
# Load the sample metadata as a tab-separated table.
# The file name is relative, so it is resolved against the current working directory.
metadata = pd.read_csv("samples.meta.txt", sep="\t")

In [3]:
# Reference genome FASTA (AgamP4 chromosomes), opened through pyfasta.
# NOTE(review): absolute path tied to the /home/jovyan environment - adjust when running elsewhere.
fasta_fn = '/home/jovyan/notebooks/data/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
genome = pyfasta.Fasta(fasta_fn)

In [None]:
# Path to the sorted, gzipped GFF3 file of AgamP4.2 base features (only the path is set here).
# NOTE(review): absolute path tied to the /home/jovyan environment - adjust when running elsewhere.
geneset_fn = '/home/jovyan/notebooks/data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.sorted.gff3.gz'

In [None]:
# Population identifiers, in the order used throughout the notebook.
populations = [
    # identifiers with the 'col' suffix
    'AOcol', 'BFcol', 'CIcol', 'GHcol', 'GNcol',
    # identifiers with the 'gam' suffix
    'GHgam', 'CMgam', 'BFgam', 'GNgam', 'GQgam', 'UGgam', 'GAgam', 'FRgam',
    # identifiers without a suffix
    'KE', 'GM', 'GW',
]

------------------------------

In [11]:
# Upgraded version of datalab:
# one GCS filesystem handle, created with token='anon' and access='read_only',
# which the callset cells below use to build their store mappings.

gcs_bucket_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='anon', access='read_only')

In [12]:
# Phase 1 (AR3) PASS genotype callset.
geno_path_p1 = "ag1000g-release/phase1.AR3/variation/main/zarr/ag1000g.phase1.ar3.pass"

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_path_p1)
callset_phase1 = zarr.Group(gcsacmap, read_only=True)

In [None]:
# Phase 2 (AR1) PASS genotype callset.
geno_path_p2 = "ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass"

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_path_p2)
callset_phase2 = zarr.Group(gcsacmap, read_only=True)

In [13]:
# Adding phase2 biallelic genotype path
  
geno_bi_path = "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_bi_path)
callset_biallel = zarr.Group(gcsacmap, read_only=True)

In [14]:
# Phase 2 (AR1) haplotype callset.
hap_path = 'ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=hap_path)
callset_hap_phase2 = zarr.Group(gcsacmap, read_only=True)

In [15]:
# Adding phase2 accessibility path
  
accessibility_path = "ag1000g-release/phase2.AR1/accessibility/accessibility.zarr"

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=accessibility_path)
accessibility = zarr.Group(gcsacmap, read_only=True)

In [16]:
# Phase 2 (AR1) biallelic snpEff annotation callset.
anno_path = "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic_snpeff/ag1000g.phase2.ar1.pass.biallelic_snpeff"

# Map the bucket prefix to a key/value store and open it as a read-only zarr group.
# NOTE: `gcsacmap` is reassigned by every callset cell, so it always refers to
# the most recently executed one.
gcsacmap = gcs_bucket_fs.get_mapper(root=anno_path)
callset_anno = zarr.Group(gcsacmap, read_only=True)

--------------------------------------

In [3]:
# Colour palette for plots; called with no arguments, so seaborn chooses which palette is returned.
palette = sns.color_palette()

-----------------------------------------------

Old version of datalab (superseded by the "Upgraded version of datalab" cells above):

# Legacy connection code built on gcsfs.mapping.GCSMap.
# NOTE(review): most handles assigned here (callset_phase2, callset_biallel, callset_phase1,
# accessibility, callset_anno) reuse the names set by the upgraded cells, so running this
# section would overwrite them.
gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cloud')  # GCS filesystem handle using token='cloud'

geno_path = os.path.join("ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass")  # phase2 genotype path
gcsmap_p2 = gcsfs.mapping.GCSMap(geno_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
callset_phase2= zarr.Group(gcsmap_p2, read_only=True)  # open the store as a read-only zarr group

geno_bi_path = os.path.join("ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic")  # phase2 biallelic genotype path
gcsmap_bi = gcsfs.mapping.GCSMap(geno_bi_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
callset_biallel= zarr.Group(gcsmap_bi, read_only=True)  # open the store as a read-only zarr group

geno_p1_path = os.path.join("ag1000g-release/phase1.AR3/variation/main/zarr/ag1000g.phase1.ar3.pass")  # phase1 genotype path
gcsmap_p1 = gcsfs.mapping.GCSMap(geno_p1_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
callset_phase1= zarr.Group(gcsmap_p1, read_only=True)  # open the store as a read-only zarr group

# phase2 haplotype path
# NOTE(review): this handle is named calldata_hap_phase2 here, but the upgraded cell
# names it callset_hap_phase2 - confirm which name downstream code expects.
hap_path = 'ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'
gcsmap_hap = gcsfs.mapping.GCSMap(hap_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
calldata_hap_phase2= zarr.Group(gcsmap_hap, read_only=True)

# phase2 accessibility path
accessibility_path = ("ag1000g-release/phase2.AR1/accessibility/accessibility.zarr")
gcsacmap_access = gcsfs.mapping.GCSMap(accessibility_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
accessibility= zarr.Group(gcsacmap_access, read_only=True)

anno_path = os.path.join("ag1000g-release/phase2.AR1/variation/main/zarr/biallelic_snpeff/ag1000g.phase2.ar1.pass.biallelic_snpeff")  # phase2 biallelic snpEff annotation path
gcsmap_anno = gcsfs.mapping.GCSMap(anno_path, gcs=gcs)  # mapping over the bucket path, used as the zarr store
callset_anno= zarr.Group(gcsmap_anno, read_only=True)  # open the store as a read-only zarr group