In [None]:
%run "08222019_functions.ipynb"

In [2]:
genotypes_phase2_call = calldata_hap_phase2['3L']["calldata/GT"]
genotypes_phase2 = allel.GenotypeChunkedArray(genotypes_phase2_call, loc.start, loc.stop)

In [3]:
genotypes_phase2

Unnamed: 0,0,1,2,3,4,...,1159,1160,1161,1162,1163,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
10752698,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
10752699,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
10752700,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


In [6]:
import sys
import gc
import datetime
import humanize
from humanize import naturalsize, intcomma, intword


def log(*msg):
    print(' '.join(map(str, msg)), file=sys.stdout)
    sys.stdout.flush()
    
from contextlib import contextmanager

@contextmanager
def timer(*msg):
    before = datetime.datetime.now()
    try:
        yield
    except:
        after = datetime.datetime.now()
        elapsed = (after - before).total_seconds()
        done = 'errored after %s' % humanize.naturaldelta(elapsed)
        if not msg:
            msg = done
        else:
            msg = ', '.join(map(str, msg)) + ', ' + done
        print(msg, file=sys.stderr)
        sys.stderr.flush()   
        raise
    else:
        after = datetime.datetime.now()
        elapsed = (after - before).total_seconds()
        done = 'done in %s' % humanize.naturaldelta(elapsed)
        if not msg:
            msg = done
        else:
            msg = ', '.join(map(str, msg)) + ', ' + done
        print(msg, file=sys.stdout)
        sys.stdout.flush()

In [7]:
fasta_fn = '/home/jovyan/notebooks/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa'
genome = pyfasta.Fasta(fasta_fn)

In [8]:
dist_dn_template = os.path.join('dist_haps')
dist_fn_template = '{chrom}.{start:08d}.{stop:08d}.npy'

In [13]:
def compute_dxy_distance_matrices(chrom, window_size):
    dist_dn = dist_dn_template.format(metric='dxy', window_size=window_size)
    if not os.path.exists(dist_dn):
        os.makedirs(dist_dn)
        
    # open haplotypes
    callset = zarr.Group(gcsmap3, read_only=True)
    
    # load accessibility map
    is_accessible = accessibility[chrom]['is_accessible'][:]
    
    # determine accessible positions
    pos_accessible, = np.nonzero(is_accessible)
    
    # define equally accessible windows
    window_starts = pos_accessible[0:None:window_size]
    window_stops = pos_accessible[window_size-1:None:window_size]
    
    # add final window to end of chromosome
    window_starts = np.append(window_starts, [window_stops[-1] + 1])
    window_stops = np.append(window_stops, [len(genome[chrom])])
    
    # load variant positions
    pos = allel.SortedIndex(callset[chrom]['variants']['POS'][:])

    # iterate over windows
    for window_start, window_stop in zip(window_starts, window_stops):
        
        # distance matrix file name
        dist_fn = dist_fn_template.format(chrom=chrom, start=window_start, stop=window_stop)
        dist_path = os.path.join(dist_dn, dist_fn)
        
        # stay dry
        if os.path.exists(dist_path):
            log('skipping', dist_path)
            
        else:
            log('building', dist_path)
            gc.collect()
            
            with timer():
                
                # locate the window
                loc = pos.locate_range(window_start, window_stop)
                print (loc.start, loc.stop)

                # load data
                genotypes_phase2_call = calldata_hap_phase2['3R']["calldata/GT"]
                genotypes_phase2 = allel.GenotypeChunkedArray(genotypes_phase2_call[loc])
                haplotypes = genotypes_phase2.to_haplotypes()
                n_variants = genotypes_phase2.shape[0]
                log('variants:', n_variants)


                # compute hamming distance
                dist = allel.pairwise_distance(haplotypes[:], metric='hamming')
                log('hamming distance, max:', dist.max(), ', min:', dist.min())

                # adjust by accessible window size
                n_bases = np.count_nonzero(is_accessible[window_start:window_stop+1])
                log('window accessible size:', n_bases)
                dist = dist * n_variants / n_bases
                log('dxy distance, max:', dist.max(), ', min:', dist.min())

                # save
                np.save(dist_path, dist)

In [None]:
dxy_3L = compute_dxy_distance_matrices('3L', 50000)

building dist_haps/3L.00009778.00339867.npy
0 5178
variants: 5178
hamming distance, max: 0.0552336809578988 , min: 0.0
window accessible size: 50000
dxy distance, max: 0.00572 , min: 0.0
done in 24 seconds
building dist_haps/3L.00339868.00730787.npy
5178 10720
variants: 5542
hamming distance, max: 0.06459761818837964 , min: 0.0
window accessible size: 50000
dxy distance, max: 0.007159999999999999 , min: 0.0
done in 25 seconds
building dist_haps/3L.00730788.00873296.npy
10720 16243
variants: 5523
hamming distance, max: 0.07079485786710121 , min: 0.0
window accessible size: 50000
dxy distance, max: 0.00782 , min: 0.0
done in 25 seconds
building dist_haps/3L.00873297.01243275.npy
16243 22731
variants: 6488
hamming distance, max: 0.06858816276202219 , min: 0.0
window accessible size: 50000
dxy distance, max: 0.008899999999999998 , min: 0.0
done in 28 seconds
building dist_haps/3L.01243276.01490935.npy
22731 28505
variants: 5774
hamming distance, max: 0.056806373397990995 , min: 0.0
window 

In [None]:
dxy_3L = compute_dxy_distance_matrices('3R', 50000)