In [1]:
import pandas as pd
import numpy as np
import random
import allel
import zarr
import itertools
import requests
import os

In [2]:
! pip install -e ..

Obtaining file:///Users/s2341012/Dropbox/DISMaL_chapter/DISMaL
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: dismal
  Attempting uninstall: dismal
    Found existing installation: dismal 0.1
    Uninstalling dismal-0.1:
      Successfully uninstalled dismal-0.1
  Running setup.py develop for dismal
Successfully installed dismal-0.1


In [3]:
from dismal import preprocess

In [4]:
gff3 = "../test_data/brenthis_head1000.gff3"
vcf = "../test_data/brenthis_500K.vcf.gz"
zarrstore = "../test_data/zarrstore/"
callset = preprocess.CallSet(vcf, zarrstore)
samples_map = "../test_data/samples_map.csv"

In [5]:
blocks = preprocess.gff3_to_blocks(gff3, blocklen=100, ld_dist_bp=10_000)

In [6]:
callset = preprocess.CallSet(zarr_path="../test_data/zarrstore/")

In [19]:
snps_idxs = np.where((callset.pos >= 67286) & (callset.pos <= 67386))

In [99]:
test_gt = callset.gt[snps_idxs]

In [21]:
def block_genotype_array(callset, start, end):

    snps_idxs = np.where((callset.pos >= start) & (callset.pos <= end))
    return callset.gt[snps_idxs]


In [160]:
callset_samples = pd.DataFrame(callset.samples)
callset_samples = callset_samples.reset_index()
callset_samples.columns = ["callset_idx", "sample"]

In [161]:
samples.merge(callset_samples, on='sample')

Unnamed: 0,sample,population,callset_idx
0,SE_BI_1495,BI,0
1,FR_BI_1497,BI,1
2,FR_BD_1329,BD,2
3,RS_BI_1496,BI,3
4,RO_BD_956,BD,4
5,ES_BI_375,BI,5
6,UA_BI_1494,BI,6
7,ES_BD_1141,BD,7
8,ES_BI_364,BI,8
9,ES_BD_1489,BD,9


In [37]:
samples = pd.read_csv(samples_map)
samples_from_map = np.array(samples["sample"])

In [180]:
list(samples.set_index("sample").loc[callset.samples, :]["population"])

['BI', 'BI', 'BD', 'BI', 'BD', 'BI', 'BI', 'BD', 'BI', 'BD', 'BD', 'BD', 'BD']

In [187]:
pop_names = ["BI", "BD"]
pop_idx = [np.where(np.array(samples.set_index("sample").loc[callset.samples, :]["population"]) == pop_name) for pop_name in pop_names]

In [188]:
pop_idx

[(array([0, 1, 3, 5, 6, 8]),), (array([ 2,  4,  7,  9, 10, 11, 12]),)]

In [42]:
pops_idxs = [samples[samples["population"] == pop].index for pop in ["BI", "BD"]]

In [47]:
pops_idxs

[Int64Index([0, 1, 3, 5, 6, 8], dtype='int64'),
 Int64Index([2, 4, 7, 9, 10, 11, 12], dtype='int64')]

In [77]:
block_gt_arr = block_genotype_array(callset, 100_000, 100_100)
state = 1
samples = pd.read_csv(samples_map)
pops = list(samples["population"])

def subset_block_for_state(block_gt_arr, state, pops_idxs):
    """Subset a block to give a gt array of population 1 (state 1), population 2 (state 2), or one array of each (state 3)"""
    
    if state == 1 or state == 2:
        return (block_gt_arr[:, pops_idxs[state-1]], )
    else:
        assert state == 3
        return (block_gt_arr[:, pops_idxs[0]], block_gt_arr[:, pops_idxs[1]])


In [108]:
indx_nonmissing = [np.where([(block_gt_arr[:, i] >= 0).all() for i in range(len(block_gt_arr[0]))])][0]

In [109]:
indx_nonmissing

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),)

In [195]:
def index_individuals_wo_missing_sites(block_gt_arr, pops_idxs):
    """Return indices of individuals that have no missing sites in a block"""
    idxs_nonmissing = [np.where([(block_gt_arr[:, i] >= 0).all() 
                                 for i in range(len(block_gt_arr[0]))])]
    return idxs_nonmissing[0][0]
    

In [118]:
random.sample(range(0, 9), 1)

[4]

In [114]:
test_gt[0][0]

array([-1, -1], dtype=int8)

In [128]:
random.sample(range(0, len(block_gt_arr[0])), 1)

[1]

In [138]:
def sample_n_haps(block_gt_arr, n_samples):
    """Sample n haplotypes from block_gt_arr"""
    
    indiv_idx = random.sample(range(0, len(block_gt_arr[0])), n_samples)

    ploidy = len(block_gt_arr[0][0])
    hap_idx = random.sample(range(0, ploidy), n_samples)

    return block_gt_arr[:, [indiv_idx], hap_idx].transpose()


In [267]:
two = test_gt[:, [1, 2], 1]
two

hap1 = two[:, 0]
hap2 = two[:, 1]

hap1, hap2, two

(array([-1, -1,  1], dtype=int8),
 array([-1,  0,  1], dtype=int8),
 array([[-1, -1],
        [-1,  0],
        [ 1,  1]], dtype=int8))

In [152]:
def n_segr_sites(arr):
    return np.sum(arr[:, 0] != arr[:, 1])



In [206]:
index_individuals_wo_missing_sites(test_gt, pop_idx)

array([ 6,  7,  8,  9, 11])

In [207]:
np.intersect1d(pops_idxs[1], index_individuals_wo_missing_sites(test_gt, pop_idx))

array([ 7,  9, 11])

In [289]:
def blockwise_segregating_sites(blocks, callset, samples, sampling_probs):

    pops_names = [samples["population"][i] for i in range(len(samples["population"].unique()))]
    assert len(pop_names) == 2
    pops_idxs = [np.where(np.array(samples.set_index("sample").loc[callset.samples, :]["population"]) == pop_name) for pop_name in pops_names]

    rng = np.random.default_rng()
    
    for block_idx in range(len(blocks)):

        block_state = np.where(rng.multinomial(n=1, pvals=sampling_probs))[0][0] + 1

        block_start, block_end = blocks.loc[block_idx, "start"], blocks.loc[block_idx, "end"]
        block_gt_arr = block_genotype_array(callset, block_start, block_end)

        indivs_wo_missing_sites = index_individuals_wo_missing_sites(block_gt_arr, pops_idxs)
        pop1_nomissing = np.intersect1d(pops_idxs[0], indivs_wo_missing_sites)
        pop2_nomissing = np.intersect1d(pops_idxs[1], indivs_wo_missing_sites)

        if block_state == 1:
            if len(pop1_nomissing) < 2:
                continue
            else:
                samples_idx = rng.choice(pop1_nomissing, 2)
        elif block_state == 2:
            if len(pop2_nomissing) < 2:
                continue
            else:
                samples_idx = rng.choice(pop1_nomissing, 2)
        else:
            assert block_state == 3
            if len(pop1_nomissing) == 0 or len(pop2_nomissing) == 0:
                continue
            else:
                samples_idx = np.array([rng.choice(pop1_nomissing, 2), rng.choice(pop1_nomissing, 2)])

        ploidy = len(block_gt_arr[0][0])
        haplotype_idx = random.choice(range(0, ploidy))
        print(samples_idx, haplotype_idx)
        sample_genotypes = block_gt_arr[:, [samples_idx], haplotype_idx]

        return sample_genotypes


In [290]:
blockwise_segregating_sites(blocks, callset, samples=samples, sampling_probs=[0.5, 0.25, 0.25])[:, :, 0]

[[6 6]
 [6 8]] 0


array([[[0, 0]],

       [[0, 0]],

       [[1, 1]]], dtype=int8)

In [243]:

indivs_wo_missing_sites = index_individuals_wo_missing_sites(block_gt_arr, pops_idxs)
np.intersect1d(pops_idxs[0], indivs_wo_missing_sites)

array([0, 1, 3, 5, 6, 8])

In [250]:
rng.choice(np.intersect1d(pops_idxs[0], indivs_wo_missing_sites), 2)

array([5, 1])