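## Pairwise Dxy distance calculation

Computes pairwise haplotype distance matrices (Dxy) in windows that each contain the same number of accessible positions, and saves one `.npy` file per window.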

In [1]:
%run "imports.ipynb"

In [2]:
import sys
import gc
import datetime
import humanize
from humanize import naturalsize, intcomma, intword


def log(*msg):
    """Print the arguments to stdout as one space-separated line and flush.

    Each argument is converted with ``str``. The stream is flushed right away
    so the message is emitted without waiting for the buffer to fill.
    """
    line = ' '.join(str(part) for part in msg)
    print(line, file=sys.stdout, flush=True)
    
from contextlib import contextmanager

@contextmanager
def timer(*msg):
    """Context manager that reports how long the enclosed block took.

    Any positional arguments are converted with ``str``, joined with ', ' and
    placed in front of the timing text. When the block finishes normally the
    report ("done in ...") goes to stdout; when it raises, the report
    ("errored after ...") goes to stderr and the exception is re-raised
    unchanged.
    """
    started = datetime.datetime.now()

    def report(template, stream):
        # elapsed wall-clock time, rendered in human-readable form
        seconds = (datetime.datetime.now() - started).total_seconds()
        outcome = template % humanize.naturaldelta(seconds)
        # caller-supplied parts first, timing text last
        print(', '.join([str(part) for part in msg] + [outcome]), file=stream)
        stream.flush()

    try:
        yield
    except BaseException:
        # report every kind of failure, including KeyboardInterrupt, then
        # let it propagate to the caller
        report('errored after %s', sys.stderr)
        raise
    else:
        report('done in %s', sys.stdout)

In [3]:
# Directory that receives the per-window distance matrices. The path holds no
# '{...}' fields, so calling .format() on it returns it unchanged.
dist_dn_template = '/bucket/dist_haps/'
# File name of one window's matrix: chromosome plus zero-padded start and stop.
dist_fn_template = '{chrom}.{start:08d}.{stop:08d}.npy'

In [4]:
def _accessible_windows(is_accessible, window_size, chrom_length):
    """Return (start, stop) pairs of windows holding `window_size` accessible positions each.

    The last pair runs to `chrom_length` and may hold fewer accessible
    positions. An empty list is returned when nothing is accessible.
    """
    # indices at which the accessibility mask is set
    pos_accessible, = np.nonzero(is_accessible)
    if len(pos_accessible) == 0:
        return []

    # every window_size-th accessible position opens a window, and the
    # accessible position window_size - 1 places later closes it
    window_starts = pos_accessible[0:None:window_size]
    window_stops = pos_accessible[window_size-1:None:window_size]

    # The final window runs to the end of the chromosome. When the accessible
    # positions divide evenly into windows it starts right after the last
    # stop; otherwise the slice above already produced its start.
    if len(window_starts) == len(window_stops):
        window_starts = np.append(window_starts, [window_stops[-1] + 1])
    window_stops = np.append(window_stops, [chrom_length])

    return list(zip(window_starts, window_stops))


def _window_dxy(pos, genotype_calls, is_accessible, window_start, window_stop):
    """Compute the dxy distances for one window, or return None if it cannot be scored."""
    # locate the window
    try:
        loc = pos.locate_range(window_start, window_stop)
    except KeyError:
        # locate_range raises KeyError when no variant falls inside the range
        log('no variants in window, nothing to compute')
        return None
    log(loc.start, loc.stop)

    # load data
    genotypes = allel.GenotypeChunkedArray(genotype_calls[loc])
    haplotypes = genotypes.to_haplotypes()
    n_variants = genotypes.shape[0]
    log('variants:', n_variants)

    # compute hamming distance
    dist = allel.pairwise_distance(haplotypes[:], metric='hamming')
    log('hamming distance, max:', dist.max(), ', min:', dist.min())

    # adjust by accessible window size: the hamming value is a proportion of
    # the variants in the window, so rescale it to the accessible bases
    n_bases = np.count_nonzero(is_accessible[window_start:window_stop+1])
    log('window accessible size:', n_bases)
    if n_bases == 0:
        # dividing by zero would store a matrix of inf/nan
        log('no accessible bases in window, nothing to compute')
        return None
    dist = dist * n_variants / n_bases
    log('dxy distance, max:', dist.max(), ', min:', dist.min())
    return dist


def _save_atomically(path, array):
    """Write `array` to `path` through a temporary file so `path` never holds a partial matrix."""
    # keep the '.npy' suffix, otherwise np.save appends one to the file name
    tmp_path = path + '.tmp.npy'
    try:
        np.save(tmp_path, array)
        os.replace(tmp_path, path)
    finally:
        # still present only if saving or renaming failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def compute_dxy_distance_matrices(chrom, window_size):
    """Build and save one pairwise dxy distance matrix per accessible window.

    The chromosome is cut into windows that each contain `window_size`
    accessible positions, plus a final window reaching the end of the
    chromosome. For every window the pairwise hamming distance between
    haplotypes is computed, rescaled by (variants in window / accessible bases
    in window), and saved as a ``.npy`` file. Windows whose file already
    exists are skipped, so an interrupted run can be resumed by calling the
    function again. Windows with no variants or no accessible bases are
    logged and produce no file.

    Parameters
    ----------
    chrom : key used to index `accessibility`, `genome` and
        `calldata_hap_phase2`.
    window_size : int
        Number of accessible positions per window.

    Returns
    -------
    None. Results are written below the directory given by `dist_dn_template`.

    NOTE(review): window bounds are 0-based indices into `is_accessible`
    (from ``np.nonzero``) but are passed to ``pos.locate_range`` as variant
    positions. If POS is 1-based the two are off by one; confirm.
    """
    # the keyword arguments only take effect if the template contains
    # matching '{metric}' / '{window_size}' placeholders
    dist_dn = dist_dn_template.format(metric='dxy', window_size=window_size)
    os.makedirs(dist_dn, exist_ok=True)

    # load accessibility map and define equally accessible windows
    is_accessible = accessibility[chrom]['is_accessible'][:]
    windows = _accessible_windows(is_accessible, window_size, len(genome[chrom]))

    # load variant positions; the genotype dataset handle is the same for
    # every window, so look it up once
    pos = allel.SortedIndex(calldata_hap_phase2[chrom]['variants']['POS'][:])
    genotype_calls = calldata_hap_phase2[chrom]["calldata/GT"]

    # iterate over windows
    for window_start, window_stop in windows:

        # distance matrix file name
        dist_fn = dist_fn_template.format(chrom=chrom, start=window_start, stop=window_stop)
        dist_path = os.path.join(dist_dn, dist_fn)

        # stay dry
        if os.path.exists(dist_path):
            log('skipping', dist_path)
            continue

        log('building', dist_path)
        gc.collect()

        with timer():
            dist = _window_dxy(pos, genotype_calls, is_accessible,
                               window_start, window_stop)
            if dist is not None:
                # an interrupted save must not leave a truncated file that
                # the existence check above would later treat as complete
                _save_atomically(dist_path, dist)

In [5]:
#compute_dxy_distance_matrices('3L', 100000)

In [6]:
# Build the dxy matrices for chrom '3R' with 100000 accessible positions per
# window; windows whose output file already exists are skipped.
compute_dxy_distance_matrices('3R', 100000)

building /bucket/dist_haps/3R.00000020.00134416.npy
0 20578
variants: 20578
hamming distance, max: 0.06205656526387404 , min: 0.0
window accessible size: 100000
dxy distance, max: 0.01277 , min: 0.0
done in a minute
building /bucket/dist_haps/3R.00134417.00243509.npy
20578 40624
variants: 20046
hamming distance, max: 0.05652000399082111 , min: 0.0
window accessible size: 100000
dxy distance, max: 0.01133 , min: 0.0
done in a minute
building /bucket/dist_haps/3R.00243510.00352301.npy
40624 64392
variants: 23768
hamming distance, max: 0.04842645573880848 , min: 0.0
window accessible size: 100000
dxy distance, max: 0.01151 , min: 0.0
done in a minute
building /bucket/dist_haps/3R.00352302.00473621.npy
64392 90063
variants: 25671
hamming distance, max: 0.0601067352265202 , min: 0.0
window accessible size: 100000
dxy distance, max: 0.01543 , min: 0.0
done in 2 minutes
building /bucket/dist_haps/3R.00473622.00577607.npy
90063 119286
variants: 29223
hamming distance, max: 0.036957191253464736

errored after 2 minutes


KeyboardInterrupt: 