In [1]:
import allel
import phasedibd as ibd 
import malariagen_data
import pandas as pd 
import numpy as np

### IBD detection with phasedIBD

In [2]:
# Notebook-wide configuration: the haplotype `analysis` identifier and the
# sample set(s) that are handed to the IBD helper below.
analysis = "gamb_colu"
sample_sets = ["1244-VO-GH-YAWSON-VMF00149"]

In [3]:
# Shared Ag3 data-access client used by every cell below, configured with
# pre=True and a results cache path relative to this notebook.
ag3 = malariagen_data.Ag3(
    results_cache="../../malariagen_data_cache",
    pre=True,
)

In [4]:
# Every virtual contig exposed by the client, followed by the X chromosome.
chromosomes = [*ag3.virtual_contigs, 'X']

In [7]:
def tpbwt_ibd(region, analysis, sample_sets, sample_query=None, L_m=300, L_f=3, missing_site_threshold=10, segments_out_path=None, verbose=True):
    """Detect IBD segments in one genomic region with phasedibd's TPBWT.

    Phased haplotypes are loaded through the module-level ``ag3`` client and
    handed to ``ibd.TPBWTAnalysis.compute_ibd``. The ``start_bp`` / ``end_bp``
    columns of the returned table are then used as indices into the region's
    ``variant_position`` array and replaced by the values found there, and a
    ``size`` column (``end_bp - start_bp``) is added.

    Parameters
    ----------
    region
        Anything accepted by ``ag3.resolve_region``; the resolved object must
        expose a ``.contig`` attribute.
    analysis, sample_sets, sample_query
        Forwarded unchanged to ``ag3.haplotypes``.
    L_m, L_f, missing_site_threshold, segments_out_path
        Forwarded unchanged to ``compute_ibd``.
    verbose : bool
        Forwarded to ``compute_ibd``; also switches this function's own
        progress messages on or off.

    Returns
    -------
    pandas.DataFrame or None
        The IBD segment table with ``start_bp``, ``end_bp`` and ``size``
        filled in, or ``None`` if ``compute_ibd`` returned nothing.
    """
    # Honour `verbose` for our own messages as well as phasedibd's.
    log = print if verbose else (lambda *args, **kwargs: None)

    log("loading haplotypes")
    region = ag3.resolve_region(region)
    ds_haps = ag3.haplotypes(region=region, analysis=analysis, sample_sets=sample_sets, sample_query=sample_query)
    gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)

    log("computing dask array")
    ht = gt.to_haplotypes().compute()
    pos = ds_haps['variant_position'].values

    log(f"loading haplotypes into tpbwt for {region}")
    # The haplotype matrix is transposed before being given to phasedibd.
    haplotypes = ibd.HaplotypeAlignment(haplotype_array=ht.T, chromosomes=[region.contig])
    tpbwt = ibd.TPBWTAnalysis()

    log("computing ibd segments with templated positional burrows wheeler transform")
    ibd_df = tpbwt.compute_ibd(haplotypes, L_m=L_m, L_f=L_f, missing_site_threshold=missing_site_threshold, verbose=verbose, segments_out_path=segments_out_path)

    if ibd_df is None:
        # NOTE(review): phasedibd is understood to write segments to
        # `segments_out_path` instead of returning them when a path is
        # given - confirm against the installed version. Without this guard
        # the `.assign` below would raise AttributeError on None.
        log("complete")
        return None

    # start_bp / end_bp come back as indices into the site axis here;
    # translate them through `pos` before computing the segment size.
    ibd_df = ibd_df.assign(
        start_bp=pos[ibd_df["start_bp"].to_numpy()],
        end_bp=pos[ibd_df["end_bp"].to_numpy()],
    )
    ibd_df = ibd_df.assign(size=ibd_df.end_bp - ibd_df.start_bp)
    log("complete")

    return ibd_df

In [8]:
# Run the IBD helper once per contig, passing a contig-specific output path.
# `ibd_df` is rebound on every pass, so only the last result stays in scope.
for contig in ag3.contigs:
    segments_path = f"../../results/ibd_segments_{contig}.tsv"
    ibd_df = tpbwt_ibd(
        region=contig,
        analysis="gamb_colu",
        sample_sets=sample_sets,
        L_f=30,
        segments_out_path=segments_path,
    )


loading haplotypes
computing dask array
loading haplotypes into tpbwt for 2L
computing ibd segments with templated positional burrows wheeler transform

IBD compute for chromosome 2L...
Running in-sample TPBWT on 14810165 sites.
Number of haplotypes: 970
Minimum memory required: 0.0075272 Gb
Increasing size of segment vectors...
Increasing size of segment vectors...
Increasing size of segment vectors...
Increasing size of segment vectors...

Done computing IBD segments. Finishing up...
Number of IBD segments found = 121873367
Building final dataframe...


: 

: 

In [15]:
# Display whatever `ibd_df` is currently bound to (the rendered output below
# is abbreviated with an ellipsis row).
ibd_df

Unnamed: 0,chromosome,id1,id2,id1_haplotype,id2_haplotype,start,end,start_cm,end_cm,start_bp,end_bp,size
0,2L,968,969,0,0,0,7785,0.0,7785.0,1249,179152,177903
1,2L,966,969,0,0,0,7788,0.0,7788.0,1249,179158,177909
2,2L,962,969,0,0,0,7794,0.0,7794.0,1249,179172,177923
3,2L,958,969,0,0,0,7793,0.0,7793.0,1249,179171,177922
4,2L,954,969,0,0,0,7793,0.0,7793.0,1249,179171,177922
...,...,...,...,...,...,...,...,...,...,...,...,...
302794,2L,965,969,0,0,0,7793,0.0,7793.0,1249,179171,177922
302795,2L,966,967,0,0,8715,38345,8715.0,38345.0,185468,496807,311339
302796,2L,966,969,0,0,8677,38537,8677.0,38537.0,185343,497476,312133
302797,2L,967,968,0,0,8677,38537,8677.0,38537.0,185343,497476,312133


In [None]:
# Collect one IBD segment table per contig.
ibd_dfs = []

for chrom in ag3.contigs:

    # tpbwt_ibd takes the target as `region`; it has no `contig` keyword, so
    # passing `contig=` raises TypeError on a fresh kernel.
    ibd_df = tpbwt_ibd(
        region=chrom,
        analysis=analysis,
        sample_sets=sample_sets,
        L_f=50_000
    )
    print("complete")
    ibd_dfs.append(ibd_df)

loading haplotypes
computing dask array
loading haplotypes into tpbwt for 2R
computing ibd segments with templated positional burrows wheeler transform

IBD compute for chromosome 2R...
