In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from numba import njit
import numpy as np
%run tools.py

In [2]:
@njit()
def scanRight(geno1, geno2, upperBreakpoint, max):
    """Walk right from a start site and return the index where the scan stops.

    The scan stops at the first site where both individuals are homozygous,
    carry different alleles, and neither call contains -1 (presumably the
    missing-call code -- confirm against the genotype source). If no such
    site is found, the scan stops at the last site, index ``max - 1``.

    Parameters
    ----------
    geno1, geno2 : per-site genotype calls of the two individuals; each site
        is read as a pair of allele calls (``[0]`` and ``[1]``).
    upperBreakpoint : int
        Site index to start from. Must be a valid index into ``geno1`` and
        ``geno2``; it is read before any limit check.
    max : int
        Number of sites. The name shadows the builtin inside this function;
        it is kept so existing callers are unaffected.

    Returns
    -------
    int
        Index of the breakpoint site, or ``max - 1`` if the end is reached.
    """
    gn1 = geno1[upperBreakpoint]
    gn2 = geno2[upperBreakpoint]

    # Scan right along genome, as long as two inds are not both homozygous but different
    while not (gn1[0] == gn1[1]) & (gn2[0] == gn2[1]) & ((gn1 != gn2).all()) & (-1 not in gn1 and -1 not in gn2):

        # Check the limit BEFORE stepping. The previous form stepped first and
        # tested `== max-1`, so a scan that started on the last site moved past
        # the end of the array (an unchecked read in nopython mode).
        if upperBreakpoint >= max - 1:
            return(upperBreakpoint)

        upperBreakpoint += 1
        gn1 = geno1[upperBreakpoint]
        gn2 = geno2[upperBreakpoint]

    return(upperBreakpoint)

@njit()
def scanLeft(geno1, geno2, lowerBreakpoint):
    """Walk left from a start site and return the index where the scan stops.

    The scan stops at the first site where both individuals are homozygous,
    carry different alleles, and neither call contains -1 (presumably the
    missing-call code -- confirm against the genotype source). If no such
    site is found, the scan stops at index 0.

    Parameters
    ----------
    geno1, geno2 : per-site genotype calls of the two individuals; each site
        is read as a pair of allele calls (``[0]`` and ``[1]``).
    lowerBreakpoint : int
        Site index to start from. Must be a valid index into ``geno1`` and
        ``geno2``; it is read before any limit check.

    Returns
    -------
    int
        Index of the breakpoint site, or 0 if the start of the contig is reached.
    """
    # Genotypes at the start site seed the loop condition.
    gn1 = geno1[lowerBreakpoint]
    gn2 = geno2[lowerBreakpoint]

    # Scan left along genome
    while not (gn1[0] == gn1[1]) & (gn2[0] == gn2[1]) & ((gn1 != gn2).all()) & (-1 not in gn1 and -1 not in gn2):

        # Check the limit BEFORE stepping. The previous form stepped first and
        # tested `== 0`, so a scan that started at index 0 went to -1 and kept
        # walking with negative indices.
        if lowerBreakpoint <= 0:
            return(lowerBreakpoint)

        lowerBreakpoint -= 1
        gn1 = geno1[lowerBreakpoint]
        gn2 = geno2[lowerBreakpoint]

    return(lowerBreakpoint)


@njit()
def f2scans(dblton_arr, snps, pos):
    """Scan left and right from every doubleton and collect the stop positions.

    Parameters
    ----------
    dblton_arr : 2-D array, one row per doubleton. Column 0 and column 1 are
        used as indices into the second axis of ``snps``; column 2 is the
        doubleton position looked up in ``pos``.
    snps : genotype array indexed as ``snps[:, individual]``, giving the
        per-site calls of one individual.
    pos : sorted 1-D array of site positions, aligned with the first axis of
        ``snps`` (``np.searchsorted`` requires it to be sorted).

    Returns
    -------
    (starts, ends, dbltonpos) : three 1-D arrays, one entry per doubleton:
        the position where the left scan stopped, the position where the right
        scan stopped, and the doubleton position itself.

    Notes
    -----
    ``np.searchsorted`` returns an insertion index. If a doubleton position is
    not present in ``pos``, the scans start from the next site to its right.
    """
    n_sites = len(pos)

    starts = []
    ends = []
    dbltonpos = []

    for idx in range(0, len(dblton_arr)):

        geno1 = snps[:, dblton_arr[idx][0]]
        geno2 = snps[:, dblton_arr[idx][1]]

        # Index of the doubleton site within pos.
        dblton_idx = np.searchsorted(pos, dblton_arr[idx][2])
        # A position beyond the last site yields n_sites, which is not a valid
        # index; clamp it so neither the scans nor pos[...] read past the end.
        if dblton_idx >= n_sites:
            dblton_idx = n_sites - 1

        # Scan right along genome
        upperBreakpoint = scanRight(geno1, geno2, dblton_idx, n_sites)
        # Scan left along genome
        lowerBreakpoint = scanLeft(geno1, geno2, dblton_idx)

        starts.append(pos[lowerBreakpoint])
        ends.append(pos[upperBreakpoint])
        dbltonpos.append(dblton_arr[idx][2])

    return(np.array(starts), np.array(ends), np.array(dbltonpos))

In [3]:
# Tab-separated metadata table. Not referenced again in the visible cells.
metadata = pd.read_csv("../../config/metadata.tsv", sep="\t")
# Tab-separated table of doubleton pairs; later cells use its contig, idx1, idx2 and pos columns.
dblton = pd.read_csv("../../results/f2variantPairs.tsv", sep="\t")

In [4]:

# Contigs processed one at a time by the per-contig scan loop.
contigs = ['2L', '2R', '3L', '3R', 'X']


In [6]:
    # Load Arrays
# Exploratory load of a single contig (X).
contig = 'X'

# loadZarrArrays is not defined in this notebook (presumably provided by `%run tools.py`).
# It is given genotype, position and site-filter paths and returns the (snps, pos) pair used below.
snps, pos = loadZarrArrays(genotypePath=f"../../resources/snp_genotypes/all/1244-VO-GH-YAWSON-VMF00149/{contig}/calldata/GT/", 
                                        positionsPath=f"../../resources/snp_genotypes/all/sites/{contig}/variants/POS/",
                                        siteFilterPath=f"../../resources/site_filters/dt_20200416/gamb_colu/{contig}/variants/filter_pass/")


In [7]:
# Per-site singleton mask derived from the allele counts of the currently loaded snps.
# NOTE(review): scikit-allel's is_singleton normally takes an `allele` argument -- confirm this
# zero-argument call against the array type loadZarrArrays returns.
singletons = snps.count_alleles().is_singleton()

In [10]:
# Evaluate the mask (presumably a lazy/dask array -- confirm), rebinding the same name,
# then select the positions of the flagged sites.
singletons = singletons.compute()
pos_sin = pos[singletons]

In [None]:
# The result is not assigned, so this cell only displays it.
# NOTE(review): the per-contig loop calls compress with axis=0; no axis is given here -- confirm intended.
snps.compress(singletons)

In [11]:
pos_sin

0,1,2,3,4,...,673171,673172,673173,673174,673175
30,53,58,128,165,...,24390291,24390293,24390312,24390481,24392596


In [6]:


# Main pipeline: for each contig, load genotypes, keep segregating sites, scan every
# doubleton for its left/right stop positions, and write one TSV per contig.
for contig in contigs:
    # Placeholders only: both names are rebound by loadZarrArrays just below.
    snps = {}
    pos = {}

    # Load Arrays
    snps, pos = loadZarrArrays(genotypePath=f"../../resources/snp_genotypes/all/1244-VO-GH-YAWSON-VMF00149/{contig}/calldata/GT/", 
                                            positionsPath=f"../../resources/snp_genotypes/all/sites/{contig}/variants/POS/",
                                            siteFilterPath=f"../../resources/site_filters/dt_20200416/gamb_colu/{contig}/variants/filter_pass/")
    

    # Keep only segregating sites; pos is filtered with the same mask so it stays aligned with snps.
    ac = snps.count_alleles()
    seg = ac.is_segregating()
    # .compute().values materialises the filtered genotypes (presumably to a plain NumPy array
    # that the jitted f2scans can accept -- confirm).
    snps = snps.compress(seg, axis=0).compute().values
    pos = pos[seg]
    ### Load doubletons

    # Rows are (idx1, idx2, pos) for the doubletons on this contig.
    dblton_arr = dblton.query("contig == @contig")[['idx1', 'idx2', 'pos']].to_numpy()
    # log is not defined in this notebook (presumably provided by `%run tools.py`).
    log(contig, dblton_arr.shape)

    log(f"Scanning {contig} doubletons")
    starts, ends, dbltonpos = f2scans(dblton_arr, snps, pos)
    log("Done...")
    f2_df = pd.DataFrame({'start':starts, 'end':ends, 'dblton_pos': dbltonpos})
    # to_csv writes the DataFrame index as the first column by default.
    f2_df.to_csv(f"../../results/f2variants/f2HapLengths.{contig}.tsv", sep="\t")

2L (439698, 3)
Scanning 2L doubletons
Done...
2R (576303, 3)
Scanning 2R doubletons
Done...
3L (440880, 3)
Scanning 3L doubletons
Done...
3R (615762, 3)
Scanning 3R doubletons
Done...
X (209303, 3)
Scanning X doubletons
Done...


#### Testing the functions

In [5]:
# Toy check of scanLeft / scanRight on 10 sites for two individuals.
qpos = np.array([0,10,20,30,40,50,60,70,80,90])
dbpos = 50
# One [allele, allele] pair per site. Between the first and last sites, no site has both
# individuals homozygous for different alleles with no -1 call (site 8 has a -1 in geno1).
geno1 = np.array([[0,0], [-1,-1], [0,1], [0,0], [0,0], [0,1], [0,0], [0,0], [-1,-1], [0, 0]])
geno2 = np.array([[1,1], [-1,-1], [0,0], [0,0], [0,0], [0,1], [0,0], [0,0], [0,0], [1, 1]])
# Index of the doubleton position within qpos (5 for dbpos = 50).
#dblton_idx = bisect.bisect_left(pos, dblton_arr[idx][2])
dblton_idx = np.searchsorted(qpos, dbpos)
a = scanLeft(geno1, geno2, dblton_idx)
b = scanRight(geno1, geno2, dblton_idx, 10)
# Both scans run to their limits, so the recorded output is "0 9".
print(a,b)

0 9


In [8]:
# Reload a single contig (2L) for the timing experiments below; same call as in the per-contig loop.
contig = '2L'
# Load Arrays
snps, pos = loadZarrArrays(genotypePath=f"../../resources/snp_genotypes/all/1244-VO-GH-YAWSON-VMF00149/{contig}/calldata/GT/", 
                                        positionsPath=f"../../resources/snp_genotypes/all/sites/{contig}/variants/POS/",
                                        siteFilterPath=f"../../resources/site_filters/dt_20200416/gamb_colu/{contig}/variants/filter_pass/")
        


In [10]:
# Same filtering as the per-contig loop: keep segregating sites and the matching positions.
# NOTE(review): snps and pos are overwritten in place, so this cell cannot be re-run without
# re-running the load cell first.
ac = snps.count_alleles()
seg = ac.is_segregating()
snps = snps.compress(seg, axis=0).compute().values
pos = pos[seg]
### Load doubletons
# Rows are (idx1, idx2, pos) for the doubletons on the current contig.
dblton_arr = dblton.query("contig == @contig")[['idx1', 'idx2', 'pos']].to_numpy()

In [11]:
%%timeit
# Time the jitted f2scans on a 100-doubleton slice.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in the notebook
# namespace -- confirm before relying on starts/ends in later cells.
starts, ends, dbltonpos = f2scans(dblton_arr[250_000:250_100], snps, pos)

218 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
ends-starts

In [162]:
# Unpack the second row (index 1) of dblton_arr, whose columns are idx1, idx2, pos.
idx1 = dblton_arr[1][0]
idx2 = dblton_arr[1][1]
posd = dblton_arr[1][2]

In [12]:
 %%timeit
# Repeat timing of the jitted f2scans on the same 100-doubleton slice.
starts, ends, dbltonpos = f2scans(dblton_arr[250_000:250_100], snps, pos)

200 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Rebinds snps to its .values attribute.
# NOTE(review): order-dependent -- other cells already assign `.values` output to snps, and a
# second `.values` would presumably fail on a plain NumPy array. Confirm which snps this expects.
snps = snps.values

#### Timings

In [21]:

def speedup(fullnj, half, naive):
    """Print the pairwise ratios between three timings.

    Parameters
    ----------
    fullnj, half, naive : numbers
        Timings of the three implementations being compared. Each printed
        ratio divides the slower-named timing by the faster-named one.

    Returns
    -------
    None. Three lines are written to stdout.
    """
    comparisons = (
        ("naive / half speedup", naive, half),
        ("half/full speedup", half, fullnj),
        ("overall speedup", naive, fullnj),
    )

    # Each ratio is computed and printed in turn, in this fixed order.
    for label, slower, faster in comparisons:
        print(f"{label}: {slower / faster}")

In [24]:
# Argument order is (fullnj, half, naive). The values are presumably hand-copied timings
# in seconds -- TODO confirm which runs they came from.
speedup(0.0679, 1.31, 15.4)

naive / half speedup: 11.755725190839694
half/full speedup: 19.293078055964653
overall speedup: 226.8041237113402


In [None]:
# NOTE(review): `zarr` is not imported in any visible cell (presumably made available by
# `%run tools.py` -- confirm). site_filter is not used in the visible cells.
site_filter = zarr.open_array(f"../../resources/site_filters/dt_20200416/gamb_colu/{contig}/variants/filter_pass/")

In [29]:
dblton

Unnamed: 0,contig,idx1,idx2,partner_sample_id,partner_sample_id2,pos,latitude,longitude,latitude2,longitude2
0,2L,3,198,WA-2151,WA-2225,81,6.304,-1.682,6.058,-1.391
1,2L,282,346,WA-2275,WA-2113,82,5.970,-1.697,6.016,-1.813
2,2L,3,282,WA-2151,WA-2275,123,6.304,-1.682,5.970,-1.697
3,2L,324,327,WA-2293,WA-2296,142,5.971,-1.779,5.971,-1.778
4,2L,293,338,WA-2286,WA-2105,145,5.970,-1.697,6.016,-1.812
...,...,...,...,...,...,...,...,...,...,...
715687,2L,295,372,WA-2433,WA-2467,49361656,5.970,-1.697,6.015,-1.813
715688,2L,232,334,WA-2073,WA-2303,49361661,5.991,-1.370,5.970,-1.779
715689,2L,436,442,WA-2336,WA-2481,49361694,6.049,-1.961,6.047,-1.961
715690,2L,155,180,WA-2404,WA-2051,49361932,6.369,-1.464,6.108,-1.449


In [26]:
%%timeit
f2hapSize = {}

for row in dblton[250_000:250_100].itertuples(name=None):
    ## subset to individuals, we need whole chrom atm
    # subset genotypes to these individuals
    geno1 = snps[:, row[2]]
    geno2 = snps[:, row[3]]
    # get boolean of dblton idx
    dblton_idx = np.where(pos == row[6])[0][0]

    upperBreakpoint = scanRight(geno1, geno2, dblton_idx, pos.shape[0])

    # Scan left along genome
    lowerBreakpoint = scanLeft(geno1, geno2, dblton_idx)

    start = pos[lowerBreakpoint]
    end = pos[upperBreakpoint]
    f2hapSize[f"{contig}_{row[6]}"] = end-start  # get size in bp of f2 haplotype

IndexError: index 0 is out of bounds for axis 0 with size 0

In [25]:
dblton


Unnamed: 0,contig,idx1,idx2,partner_sample_id,partner_sample_id2,pos,latitude,longitude,latitude2,longitude2
0,2L,3,198,WA-2151,WA-2225,81,6.304,-1.682,6.058,-1.391
1,2L,282,346,WA-2275,WA-2113,82,5.970,-1.697,6.016,-1.813
2,2L,3,282,WA-2151,WA-2275,123,6.304,-1.682,5.970,-1.697
3,2L,324,327,WA-2293,WA-2296,142,5.971,-1.779,5.971,-1.778
4,2L,293,338,WA-2286,WA-2105,145,5.970,-1.697,6.016,-1.812
...,...,...,...,...,...,...,...,...,...,...
715687,2L,295,372,WA-2433,WA-2467,49361656,5.970,-1.697,6.015,-1.813
715688,2L,232,334,WA-2073,WA-2303,49361661,5.991,-1.370,5.970,-1.779
715689,2L,436,442,WA-2336,WA-2481,49361694,6.049,-1.961,6.047,-1.961
715690,2L,155,180,WA-2404,WA-2051,49361932,6.369,-1.464,6.108,-1.449


In [14]:
# NOTE(review): `allel` is not imported in any visible cell (presumably made available by
# `%run tools.py` -- confirm).
snps = allel.GenotypeArray(snps)
contig = '2L'
# Overwrites dblton with its 2L subset; the other contigs are only recoverable by re-reading the TSV.
dblton = dblton.query("contig == @contig")


In [15]:
%%timeit
f2hapSize = {}

for idx, row in dblton[250_000:250_100].iterrows():

    # take one doubleton at a time, get contig, pos, individual1 and 2
    contig, dbltonpos, idx1, idx2 = row[['contig', 'pos', 'idx1', 'idx2']]
    ## subset to individuals, we need whole chrom atm

    # subset genotypes to these individuals
    geno1 = snps[:, idx1]
    geno2 = snps[:, idx2]
    # get boolean of dblton idx
    dblton_idx = np.where(pos == dbltonpos)[0][0]

    lowerBreakpoint = dblton_idx.copy()
    upperBreakpoint = dblton_idx.copy()

    # Reset position for next loop
    gn1 = geno1[dblton_idx]
    gn2 = geno2[dblton_idx]
    # Scan right along genome, as long as two inds are not both homozygous but different
    while not (gn1[0] == gn1[1]) & (gn2[0] == gn2[1]) & ((gn1 != gn2).all()) & (-1 not in gn1 and -1 not in gn2):

        upperBreakpoint += 1
        if upperBreakpoint == max: # limit the upper breakpoint at end of the contig
            break

        gn1 = geno1[upperBreakpoint]
        gn2 = geno2[upperBreakpoint]

    # Scan left along genome
    # subset genotypes to this position, because we need somethign to start the while loop?
    gn1 = geno1[dblton_idx]
    gn2 = geno2[dblton_idx]

    # Scan left along genome
    while not (gn1[0] == gn1[1]) & (gn2[0] == gn2[1]) & ((gn1 != gn2).all()) & (-1 not in gn1 and -1 not in gn2):

        lowerBreakpoint -= 1
        if lowerBreakpoint == 0: # limit lower breakpoint at zero, start of contig
            break

        gn1 = geno1[lowerBreakpoint]
        gn2 = geno2[lowerBreakpoint]
    

    start = pos[lowerBreakpoint]
    end = pos[upperBreakpoint]
    f2hapSize[f"{contig}_{dbltonpos}"] = end-start  # get size in bp of f2 haplotype

IndexError: index 0 is out of bounds for axis 0 with size 0

### Hap Length Analysis

Where do we find outliers? Let's plot f2 haplotype size against f2 haplotype (start) position.

In [44]:
# Stack the frames in f2Haps into one table with a fresh 0..n-1 index.
# NOTE(review): f2Haps is not defined in any visible cell -- presumably the per-contig
# f2HapLengths tables with contig and size columns added; confirm and add the loading cell.
f2df = pd.concat(f2Haps, axis=0).reset_index(drop=True)

In [46]:
f2df

Unnamed: 0,start,end,dblton_pos,contig,size
0,16,4689,81,2L,4673
1,16,4652,82,2L,4636
2,16,9158,123,2L,9142
3,16,12351,142,2L,12335
4,16,399,145,2L,383
...,...,...,...,...,...
3836743,24393019,24393056,24393039,X,37
3836744,24392903,24393106,24393044,X,203
3836745,24370076,24393107,24393048,X,23031
3836746,24393056,24393107,24393059,X,51


In [79]:
# Split the 'distance' column into 4 equal-width bins.
# NOTE(review): f2haps (lower-case) and its 'distance' column are not defined in any visible
# cell, and the name differs from f2Haps / f2df used above -- confirm the source.
f2haps['dist_bin'] = pd.cut(f2haps['distance'], 4)

In [81]:
# Integer code per row for its distance bin; factorize numbers bins in order of first appearance.
f2haps['dist_bin_factor'] = pd.factorize(f2haps['dist_bin'])[0]