In [76]:
import pandas as pd
import numpy as np
import random
import allel
import zarr
import itertools

In [2]:
gff = pd.read_table("/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3",
               nrows=1000,
               usecols=[0, 2, 3, 4],
               sep="\t",
               comment="#",
               header=None)

gff.columns = ["chr", "feature", "start", "end"]

In [3]:
gff

Unnamed: 0,chr,feature,start,end
0,brenthis_ino.SP_BI_364.chromosome_1,gene,18787,19695
1,brenthis_ino.SP_BI_364.chromosome_1,mRNA,18787,19695
2,brenthis_ino.SP_BI_364.chromosome_1,stop_codon,18787,18789
3,brenthis_ino.SP_BI_364.chromosome_1,CDS,18790,19695
4,brenthis_ino.SP_BI_364.chromosome_1,exon,18790,19695
...,...,...,...,...
995,brenthis_ino.SP_BI_364.chromosome_1,CDS,612082,612215
996,brenthis_ino.SP_BI_364.chromosome_1,exon,612082,612215
997,brenthis_ino.SP_BI_364.chromosome_1,intron,612216,612759
998,brenthis_ino.SP_BI_364.chromosome_1,CDS,612760,612892


In [4]:
def subset_introns(gff):
    return gff[gff.iloc[:,1] == "intron"]

subset_introns(gff)

Unnamed: 0,chr,feature,start,end
17,brenthis_ino.SP_BI_364.chromosome_1,intron,67286,68079
20,brenthis_ino.SP_BI_364.chromosome_1,intron,68221,69049
23,brenthis_ino.SP_BI_364.chromosome_1,intron,69129,69636
26,brenthis_ino.SP_BI_364.chromosome_1,intron,69744,70626
29,brenthis_ino.SP_BI_364.chromosome_1,intron,70787,71771
...,...,...,...,...
980,brenthis_ino.SP_BI_364.chromosome_1,intron,621115,621913
988,brenthis_ino.SP_BI_364.chromosome_1,intron,606552,609941
991,brenthis_ino.SP_BI_364.chromosome_1,intron,610112,611616
994,brenthis_ino.SP_BI_364.chromosome_1,intron,611698,612081


In [5]:
def subset_length(gff, length):
    return gff[gff.iloc[:,3] - gff.iloc[:,2] > length]

subset_length(gff, 100)

Unnamed: 0,chr,feature,start,end
0,brenthis_ino.SP_BI_364.chromosome_1,gene,18787,19695
1,brenthis_ino.SP_BI_364.chromosome_1,mRNA,18787,19695
3,brenthis_ino.SP_BI_364.chromosome_1,CDS,18790,19695
4,brenthis_ino.SP_BI_364.chromosome_1,exon,18790,19695
6,brenthis_ino.SP_BI_364.chromosome_1,gene,23600,23995
...,...,...,...,...
995,brenthis_ino.SP_BI_364.chromosome_1,CDS,612082,612215
996,brenthis_ino.SP_BI_364.chromosome_1,exon,612082,612215
997,brenthis_ino.SP_BI_364.chromosome_1,intron,612216,612759
998,brenthis_ino.SP_BI_364.chromosome_1,CDS,612760,612892


In [6]:
def filter_ld(gff, ld_bp, sort=True):
    #if sort:
     #   gff = gff.sort_values(by=["start", "end"])
    
    filtered = np.where([gff.iloc[row, -2] - gff.iloc[row-1, -1] > ld_bp for row in range(1, len(gff)-1)])

    return gff.iloc[filtered[0], :]

In [7]:
filtered_idx = filter_ld(gff, 1000)

In [8]:
def trim_blocks(gff, length):
    start_indices = [random.randint(gff.iloc[row, -2], (gff.iloc[row, -1]-length)) for row in range(0, len(gff))]
    end_indices = np.array(start_indices) + length+1
    return pd.DataFrame({"chr":gff.iloc[:,0], "start":start_indices, "end":end_indices})

In [9]:
introns = subset_introns(gff)
intronsmin100 = subset_length(introns, 100)
introns_ld = filter_ld(intronsmin100, 1000)
trim_blocks(introns_ld, 100)

Unnamed: 0,chr,start,end
35,brenthis_ino.SP_BI_364.chromosome_1,73856,73957
50,brenthis_ino.SP_BI_364.chromosome_1,131035,131136
59,brenthis_ino.SP_BI_364.chromosome_1,219422,219523
127,brenthis_ino.SP_BI_364.chromosome_1,237763,237864
154,brenthis_ino.SP_BI_364.chromosome_1,266793,266894
163,brenthis_ino.SP_BI_364.chromosome_1,269754,269855
187,brenthis_ino.SP_BI_364.chromosome_1,280285,280386
199,brenthis_ino.SP_BI_364.chromosome_1,305309,305410
359,brenthis_ino.SP_BI_364.chromosome_1,351733,351834
368,brenthis_ino.SP_BI_364.chromosome_1,364889,364990


In [10]:
def make_block_indices(gff, blocklength, ld_bp, nrows=None):

    gff = pd.read_table(gff,
               nrows=nrows,
               usecols=[0, 2, 3, 4],
               sep="\t",
               comment="#",
               header=None)

    gff.columns = ["chr", "feature", "start", "end"]

    introns = subset_introns(gff)
    introns_filtered_len = subset_length(introns, blocklength)
    introns_filtered_ld = filter_ld(introns_filtered_len, ld_bp)
    block_indices = trim_blocks(introns_filtered_ld, blocklength)
    block_indices["block_id"] = block_indices["chr"] + ":" + block_indices["start"].astype(str) + "-" + block_indices["end"].astype(str)

    return block_indices

In [11]:
blocks = make_block_indices("/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino.SP_BI_364.v2_0.sequences.braker.gt.gff3", 100, 10000)

In [12]:
blocks

Unnamed: 0,chr,start,end,block_id
35,brenthis_ino.SP_BI_364.chromosome_1,73681,73782,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
50,brenthis_ino.SP_BI_364.chromosome_1,132089,132190,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
127,brenthis_ino.SP_BI_364.chromosome_1,237671,237772,brenthis_ino.SP_BI_364.chromosome_1:237671-237772
187,brenthis_ino.SP_BI_364.chromosome_1,280529,280630,brenthis_ino.SP_BI_364.chromosome_1:280529-280630
359,brenthis_ino.SP_BI_364.chromosome_1,351315,351416,brenthis_ino.SP_BI_364.chromosome_1:351315-351416
...,...,...,...,...
483342,brenthis_ino.SP_BI_364.scaffold_2,404,505,brenthis_ino.SP_BI_364.scaffold_2:404-505
483447,brenthis_ino.SP_BI_364.scaffold_5,11748,11849,brenthis_ino.SP_BI_364.scaffold_5:11748-11849
483477,brenthis_ino.SP_BI_364.scaffold_5,45589,45690,brenthis_ino.SP_BI_364.scaffold_5:45589-45690
483621,brenthis_ino.SP_BI_364.scaffold_8,40568,40669,brenthis_ino.SP_BI_364.scaffold_8:40568-40669


VCF

In [13]:
vcf_path = "/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/brenthis_ino_daphne.vcf.gz"
zarr_path = "/Users/s2341012/Dropbox/DISMaL_chapter/brenthis_test/brenthis_data/zarrstore"

try:
    allel.vcf_to_zarr(vcf_path, zarr_path,
                          fields=["samples", "calldata/GT", "variants/CHROM", "variants/POS"], overwrite=False)
except Exception: # if store already exists, proceed
    pass 

callset = zarr.open_group(zarr_path, mode='r')
chromosomes = callset["variants/CHROM"][:]
samples = callset["samples"][:]
gt = callset["calldata/GT"][:]
pos = callset["variants/POS"][:]



In [14]:
vcf_positions = pd.DataFrame({"chr":chromosomes, "pos":pos})[0:100000]

In [15]:
vcf_positions

Unnamed: 0,chr,pos
0,brenthis_ino.SP_BI_364.chromosome_1,7731
1,brenthis_ino.SP_BI_364.chromosome_1,7817
2,brenthis_ino.SP_BI_364.chromosome_1,8099
3,brenthis_ino.SP_BI_364.chromosome_1,8157
4,brenthis_ino.SP_BI_364.chromosome_1,8223
...,...,...
99995,brenthis_ino.SP_BI_364.chromosome_1,1424563
99996,brenthis_ino.SP_BI_364.chromosome_1,1424579
99997,brenthis_ino.SP_BI_364.chromosome_1,1424584
99998,brenthis_ino.SP_BI_364.chromosome_1,1424586


In [16]:
vcf_positions["gt_idx"] = vcf_positions.index
vcf_positions

Unnamed: 0,chr,pos,gt_idx
0,brenthis_ino.SP_BI_364.chromosome_1,7731,0
1,brenthis_ino.SP_BI_364.chromosome_1,7817,1
2,brenthis_ino.SP_BI_364.chromosome_1,8099,2
3,brenthis_ino.SP_BI_364.chromosome_1,8157,3
4,brenthis_ino.SP_BI_364.chromosome_1,8223,4
...,...,...,...
99995,brenthis_ino.SP_BI_364.chromosome_1,1424563,99995
99996,brenthis_ino.SP_BI_364.chromosome_1,1424579,99996
99997,brenthis_ino.SP_BI_364.chromosome_1,1424584,99997
99998,brenthis_ino.SP_BI_364.chromosome_1,1424586,99998


In [17]:
block_snps = [vcf_positions[vcf_positions["pos"].between(blocks.iloc[row, 1], blocks.iloc[row, 2])] for row in range(len(blocks))]

In [18]:
blocks_snps_df = pd.concat(block_snps)
blocks_snps_df

Unnamed: 0,chr,pos,gt_idx
1595,brenthis_ino.SP_BI_364.chromosome_1,73696,1595
1596,brenthis_ino.SP_BI_364.chromosome_1,73762,1596
5879,brenthis_ino.SP_BI_364.chromosome_1,132109,5879
5880,brenthis_ino.SP_BI_364.chromosome_1,132118,5880
5881,brenthis_ino.SP_BI_364.chromosome_1,132125,5881
...,...,...,...
15838,brenthis_ino.SP_BI_364.chromosome_1,268252,15838
15839,brenthis_ino.SP_BI_364.chromosome_1,268262,15839
15840,brenthis_ino.SP_BI_364.chromosome_1,268266,15840
368,brenthis_ino.SP_BI_364.chromosome_1,45641,368


In [19]:
blocks_snps_df

Unnamed: 0,chr,pos,gt_idx
1595,brenthis_ino.SP_BI_364.chromosome_1,73696,1595
1596,brenthis_ino.SP_BI_364.chromosome_1,73762,1596
5879,brenthis_ino.SP_BI_364.chromosome_1,132109,5879
5880,brenthis_ino.SP_BI_364.chromosome_1,132118,5880
5881,brenthis_ino.SP_BI_364.chromosome_1,132125,5881
...,...,...,...
15838,brenthis_ino.SP_BI_364.chromosome_1,268252,15838
15839,brenthis_ino.SP_BI_364.chromosome_1,268262,15839
15840,brenthis_ino.SP_BI_364.chromosome_1,268266,15840
368,brenthis_ino.SP_BI_364.chromosome_1,45641,368


In [20]:
# need to make this chr, pos, block

chr_pos_block = []

blocks_snps_df_ = blocks_snps_df.reset_index()

for row in range(len(blocks_snps_df_)):
    position = blocks_snps_df_.iloc[row, 2]
    chr_block = blocks[(blocks["start"].astype(int) <= position) & (blocks["end"].astype(int) >= position)][["chr", "block_id"]]
    chr_pos_block.append((chr_block.iloc[0][0], position, chr_block.iloc[0][1]))


In [21]:
chr_blocks = pd.DataFrame(chr_pos_block)
chr_blocks.columns = ["chr", "pos", "block"]
chr_blocks

Unnamed: 0,chr,pos,block
0,brenthis_ino.SP_BI_364.chromosome_1,73696,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
1,brenthis_ino.SP_BI_364.chromosome_1,73762,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
2,brenthis_ino.SP_BI_364.chromosome_1,132109,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
3,brenthis_ino.SP_BI_364.chromosome_1,132118,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
4,brenthis_ino.SP_BI_364.chromosome_1,132125,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
...,...,...,...
1658,brenthis_ino.SP_BI_364.scaffold_1,268252,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1659,brenthis_ino.SP_BI_364.scaffold_1,268262,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1660,brenthis_ino.SP_BI_364.scaffold_1,268266,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1661,brenthis_ino.SP_BI_364.scaffold_5,45641,brenthis_ino.SP_BI_364.scaffold_5:45589-45690


In [22]:
def get_block_positions(chromosomes, pos, blocks):
    """Get DF [chr, pos, block]"""
    positions = pd.DataFrame({"chr":chromosomes, "pos":pos})[0:100000]
    block_snps = pd.concat([positions[positions["pos"].between(blocks.iloc[row, 1],
                                                                blocks.iloc[row, 2])]
                                                                  for row in range(len(blocks))])
  
    chr_pos_block = []

    blocks_snps_df_ = blocks_snps_df.reset_index()

    for row in range(len(blocks_snps_df_)):
      position = blocks_snps_df_.iloc[row, 2]
      chr_block = blocks[(blocks["start"].astype(int) <= position) 
                         & (blocks["end"].astype(int) >= position)][["chr", "block_id"]]
      chr_pos_block.append((chr_block.iloc[0][0], position, chr_block.iloc[0][1]))

    chr_blocks = pd.DataFrame(chr_pos_block)
    chr_blocks.columns = ["chr", "pos", "block"]
      
    return chr_blocks

In [23]:
get_block_positions(chromosomes, pos, blocks)

Unnamed: 0,chr,pos,block
0,brenthis_ino.SP_BI_364.chromosome_1,73696,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
1,brenthis_ino.SP_BI_364.chromosome_1,73762,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
2,brenthis_ino.SP_BI_364.chromosome_1,132109,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
3,brenthis_ino.SP_BI_364.chromosome_1,132118,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
4,brenthis_ino.SP_BI_364.chromosome_1,132125,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
...,...,...,...
1658,brenthis_ino.SP_BI_364.scaffold_1,268252,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1659,brenthis_ino.SP_BI_364.scaffold_1,268262,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1660,brenthis_ino.SP_BI_364.scaffold_1,268266,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1661,brenthis_ino.SP_BI_364.scaffold_5,45641,brenthis_ino.SP_BI_364.scaffold_5:45589-45690


In [25]:
chr_blocks

Unnamed: 0,chr,pos,block
0,brenthis_ino.SP_BI_364.chromosome_1,73696,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
1,brenthis_ino.SP_BI_364.chromosome_1,73762,brenthis_ino.SP_BI_364.chromosome_1:73681-73782
2,brenthis_ino.SP_BI_364.chromosome_1,132109,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
3,brenthis_ino.SP_BI_364.chromosome_1,132118,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
4,brenthis_ino.SP_BI_364.chromosome_1,132125,brenthis_ino.SP_BI_364.chromosome_1:132089-132190
...,...,...,...
1658,brenthis_ino.SP_BI_364.scaffold_1,268252,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1659,brenthis_ino.SP_BI_364.scaffold_1,268262,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1660,brenthis_ino.SP_BI_364.scaffold_1,268266,brenthis_ino.SP_BI_364.scaffold_1:268193-268294
1661,brenthis_ino.SP_BI_364.scaffold_5,45641,brenthis_ino.SP_BI_364.scaffold_5:45589-45690


In [26]:
blockpos_idx = pd.merge(chr_blocks, vcf_positions, how="left", on=["chr", "pos"]).dropna()
blockpos_idx

Unnamed: 0,chr,pos,block,gt_idx,idx
0,brenthis_ino.SP_BI_364.chromosome_1,73696,brenthis_ino.SP_BI_364.chromosome_1:73681-73782,1595.0,1595.0
1,brenthis_ino.SP_BI_364.chromosome_1,73762,brenthis_ino.SP_BI_364.chromosome_1:73681-73782,1596.0,1596.0
2,brenthis_ino.SP_BI_364.chromosome_1,132109,brenthis_ino.SP_BI_364.chromosome_1:132089-132190,5879.0,5879.0
3,brenthis_ino.SP_BI_364.chromosome_1,132118,brenthis_ino.SP_BI_364.chromosome_1:132089-132190,5880.0,5880.0
4,brenthis_ino.SP_BI_364.chromosome_1,132125,brenthis_ino.SP_BI_364.chromosome_1:132089-132190,5881.0,5881.0
...,...,...,...,...,...
122,brenthis_ino.SP_BI_364.chromosome_1,1405453,brenthis_ino.SP_BI_364.chromosome_1:1405387-14...,98631.0,98631.0
123,brenthis_ino.SP_BI_364.chromosome_1,1405457,brenthis_ino.SP_BI_364.chromosome_1:1405387-14...,98632.0,98632.0
124,brenthis_ino.SP_BI_364.chromosome_1,1405459,brenthis_ino.SP_BI_364.chromosome_1:1405387-14...,98633.0,98633.0
125,brenthis_ino.SP_BI_364.chromosome_1,1405483,brenthis_ino.SP_BI_364.chromosome_1:1405387-14...,98634.0,98634.0


In [54]:
blk = pd.DataFrame(gt[1595], columns=["hap1", "hap2"], index=samples)
blk["sample"] = blk.index
blk_melt = blk.melt(id_vars="sample")
blk_out = pd.DataFrame({"id": blk_melt["sample"] + "_" + blk_melt["variable"], "gt": blk_melt["value"]})
blk_out


Unnamed: 0,id,gt
0,SE_BI_1495_hap1,0
1,FR_BI_1497_hap1,0
2,FR_BD_1329_hap1,0
3,RS_BI_1496_hap1,0
4,RO_BD_956_hap1,0
5,ES_BI_375_hap1,0
6,UA_BI_1494_hap1,0
7,ES_BD_1141_hap1,0
8,ES_BI_364_hap1,0
9,ES_BD_1489_hap1,0


In [70]:
def site_gt(gt_index, gt, samples):
    site = pd.DataFrame(gt[gt_index], columns=["hap1", "hap2"], index=samples)
    site["sample"] = site.index
    site_melted = blk.melt(id_vars="sample")
    return pd.DataFrame({"id": site_melted["sample"] + "_" + site_melted["variable"], "gt": site_melted["value"]})


In [92]:
list(range(1595, 1598))

[1595, 1596, 1597]

In [93]:
def block_callset_df(gt, gt_idxs, samples_list, ploidy):
    haps = pd.DataFrame(list(itertools.product(samples_list, list(range(ploidy)))),
                        columns=["sample", "hap"])
    
    for idx in gt_idxs:
        haps[idx] = gt[idx].flatten()

    return haps

block_callset_df(gt, list(range(1595, 1598)), samples, 2)

Unnamed: 0,sample,hap,1595,1596,1597
0,SE_BI_1495,0,0,0,0
1,SE_BI_1495,1,0,0,0
2,FR_BI_1497,0,0,0,0
3,FR_BI_1497,1,0,0,1
4,FR_BD_1329,0,0,1,1
5,FR_BD_1329,1,0,1,1
6,RS_BI_1496,0,0,0,0
7,RS_BI_1496,1,1,0,1
8,RO_BD_956,0,0,1,1
9,RO_BD_956,1,0,1,1


In [71]:
site_gt(1595, gt, samples)

Unnamed: 0,id,gt
0,SE_BI_1495_hap1,0
1,FR_BI_1497_hap1,0
2,FR_BD_1329_hap1,0
3,RS_BI_1496_hap1,0
4,RO_BD_956_hap1,0
5,ES_BI_375_hap1,0
6,UA_BI_1494_hap1,0
7,ES_BD_1141_hap1,0
8,ES_BI_364_hap1,0
9,ES_BD_1489_hap1,0


In [74]:
gt[1596].flatten()

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1], dtype=int8)

In [27]:
rng = np.random.default_rng()
blocks["sampling_state"] = np.where(rng.multinomial(n=1, pvals=[0.25, 0.25, 0.5], size=len(blocks)) == 1)[1] + 1

In [28]:
blocks

Unnamed: 0,chr,start,end,block_id,sampling_state
35,brenthis_ino.SP_BI_364.chromosome_1,73681,73782,brenthis_ino.SP_BI_364.chromosome_1:73681-73782,3
50,brenthis_ino.SP_BI_364.chromosome_1,132089,132190,brenthis_ino.SP_BI_364.chromosome_1:132089-132190,1
127,brenthis_ino.SP_BI_364.chromosome_1,237671,237772,brenthis_ino.SP_BI_364.chromosome_1:237671-237772,1
187,brenthis_ino.SP_BI_364.chromosome_1,280529,280630,brenthis_ino.SP_BI_364.chromosome_1:280529-280630,3
359,brenthis_ino.SP_BI_364.chromosome_1,351315,351416,brenthis_ino.SP_BI_364.chromosome_1:351315-351416,1
...,...,...,...,...,...
483342,brenthis_ino.SP_BI_364.scaffold_2,404,505,brenthis_ino.SP_BI_364.scaffold_2:404-505,1
483447,brenthis_ino.SP_BI_364.scaffold_5,11748,11849,brenthis_ino.SP_BI_364.scaffold_5:11748-11849,1
483477,brenthis_ino.SP_BI_364.scaffold_5,45589,45690,brenthis_ino.SP_BI_364.scaffold_5:45589-45690,3
483621,brenthis_ino.SP_BI_364.scaffold_8,40568,40669,brenthis_ino.SP_BI_364.scaffold_8:40568-40669,3


In [64]:
samples_df = pd.DataFrame(
    {"sample": samples,
     "population": ["BI", "BI", "BD", "BI", "BD", "BI", "BI", "BD", "BI", "BD", "BD", "BD", "BD"]}
)

samples_df

Unnamed: 0,sample,population
0,SE_BI_1495,BI
1,FR_BI_1497,BI
2,FR_BD_1329,BD
3,RS_BI_1496,BI
4,RO_BD_956,BD
5,ES_BI_375,BI
6,UA_BI_1494,BI
7,ES_BD_1141,BD
8,ES_BI_364,BI
9,ES_BD_1489,BD


In [65]:
block_id = blocks.iloc[0,3]
state = blocks.iloc[0,4]

populations = samples_df["population"].unique()
pop1_df = samples_df["population"] = populations[0]
pop2_df = samples_df["population"] = populations[1]

# select samples
if state == 1:
    


In [66]:
populations

array(['BI', 'BD'], dtype=object)

In [59]:
samples

array(['SE_BI_1495', 'FR_BI_1497', 'FR_BD_1329', 'RS_BI_1496',
       'RO_BD_956', 'ES_BI_375', 'UA_BI_1494', 'ES_BD_1141', 'ES_BI_364',
       'ES_BD_1489', 'ES_BD_1490', 'GR_BD_1491', 'IT_BD_1493'],
      dtype=object)

In [89]:
block_indices = [int(val) for val in blockpos_idx[blockpos_idx["block"] == blockpos_idx["block"][0]]["idx"]]

In [91]:
block0 = gt[block_indices[0]:block_indices[-1]]
block0

array([[[0, 0],
        [0, 1],
        [0, 0],
        [0, 1],
        [0, 0],
        [0, 1],
        [0, 1],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 1]],

       [[0, 0],
        [0, 0],
        [0, 0],
        [0, 1],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]],

       [[0, 1],
        [0, 0],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 1],
        [1, 1],
        [0, 0],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1]],

       [[0, 1],
        [0, 0],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 1],
        [1, 1],
        [0, 0],
        [1, 1],


In [108]:
#block0[site][individual][hap]
site0 = list(zip(samples, block0[0]))

nomissing = block0[0] >= 0

In [116]:
samples_wo_missing = np.where(samples[nomissing[:,0] + nomissing[:,1]])
samples_wo_missing[0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [None]:
def filter_missing_in_block(block_id, gt)