In [1]:
import pandas as pd
import zarr
import numpy as np
import plotly.express as px

In [2]:
def add_zero_ibd_pairs(dfs_ibd, df_samples, fillna=0, sample_sets=None):
    """Expand a per-pair IBD summary so that pairs sharing no IBD are listed too.

    Every within-sample-set pair of samples is enumerated. Pairs present in
    ``dfs_ibd`` keep their statistics; pairs that are absent get filled
    statistics. Sample metadata is then attached for both members of each
    pair (pandas adds ``_x`` / ``_y`` suffixes to overlapping column names).

    Parameters
    ----------
    dfs_ibd : pandas.DataFrame
        Per-pair IBD summary with ``iid1`` and ``iid2`` columns plus statistic
        columns such as ``max_IBD``, ``sum_IBD>8`` and ``n_IBD>8``. It is not
        modified.
    df_samples : pandas.DataFrame
        Sample metadata containing at least ``sample_id`` and ``sample_set``.
    fillna : scalar, default 0
        Value written to ``max_IBD`` for pairs without any IBD record.
    sample_sets : iterable, optional
        Sample sets whose pairs are enumerated. Defaults to every sample set
        present in ``df_samples``, so the function no longer depends on a
        notebook-level ``sample_sets`` variable.

    Returns
    -------
    pandas.DataFrame
        One row per within-sample-set pair, with IBD statistics and the
        metadata of both samples.
    """
    from itertools import combinations

    if sample_sets is None:
        sample_sets = df_samples['sample_set'].unique()

    # Enumerate every unordered pair of samples inside each sample set.
    pairs = [
        pair
        for s in sample_sets
        for pair in combinations(df_samples.loc[df_samples['sample_set'] == s, 'sample_id'], 2)
    ]
    # Naming the columns here keeps the frame well-formed even with zero pairs.
    comps = pd.DataFrame(pairs, columns=['iid1', 'iid2'])

    def normalize_pairs(df, col1, col2):
        # Order-independent pair key: the smaller id first, the larger second.
        # `assign` returns a new frame, so the caller's data is left untouched.
        swapped = df[col1] > df[col2]
        return df.assign(
            normalized_iid1=df[col1].where(~swapped, df[col2]),
            normalized_iid2=df[col2].where(~swapped, df[col1]),
        )

    pair_key = ['normalized_iid1', 'normalized_iid2']

    # Drop the raw id columns from the IBD side so the join happens on the
    # normalized key only; otherwise a pair stored as (b, a) would never match
    # the enumerated (a, b) and would silently be reported as sharing no IBD.
    ibd_normalized = normalize_pairs(dfs_ibd, 'iid1', 'iid2').drop(columns=['iid1', 'iid2'])
    comps = normalize_pairs(comps, 'iid1', 'iid2')

    df_ibd_summary = comps.merge(ibd_normalized, how='left', on=pair_key)

    # Pairs without IBD: `fillna` for the max column, 0 for every sum/count column
    # (this also covers 'n_IBD>20', which the explicit list used to miss).
    fill_values = {'max_IBD': fillna}
    fill_values.update({
        col: 0
        for col in df_ibd_summary.columns
        if str(col).startswith(('sum_IBD>', 'n_IBD>'))
    })
    df_ibd_summary = df_ibd_summary.fillna(fill_values)

    # Attach metadata for the first, then the second member of each pair.
    df_ibd_summary = df_ibd_summary.merge(df_samples, left_on='iid1', right_on='sample_id').merge(df_samples, left_on='iid2', right_on='sample_id')
    return df_ibd_summary

### Exploring IBD segments in ag3

In [3]:
import glob

In [4]:
# Each per-contig IBD table lives at <ibd root>/<sample set>/ch<contig>.tsv,
# so the sample-set name is the second-to-last path component.
paths_2rl = pd.Series(glob.glob("../../ag3_results/ibd/*/ch2RL.tsv"))
paths_3rl = pd.Series(glob.glob("../../ag3_results/ibd/*/ch3RL.tsv"))

sample_sets1 = paths_2rl.str.split("/").str.get(-2).to_list()
sample_sets2 = paths_3rl.str.split("/").str.get(-2).to_list()

In [5]:
# Keep only the sample sets that have a table for both contigs,
# in the order they were found for 2RL.
sample_sets = list(filter(lambda name: name in sample_sets2, sample_sets1))

In [6]:
from ancIBD.IO.ind_ibd import combine_all_chroms

# Merge the per-contig IBD tables of every sample set into a single ibd.tsv.
for sample_set in sample_sets:
    combine_kwargs = dict(
        chs=('2RL', '3RL'),
        # presumably the contig name and ".tsv" are appended to this prefix — confirm against ancIBD
        folder_base=f'../../ag3_results/ibd/{sample_set}/ch',
        path_save=f'../../ag3_results/ibd/{sample_set}/ibd.tsv',
    )
    combine_all_chroms(**combine_kwargs)

Chromosome 2RL; Loaded 125 IBD
Chromosome 3RL; Loaded 38 IBD
Saved 163 IBD to ../../ag3_results/ibd/AG1000G-CM-C/ibd.tsv.
Chromosome 2RL; Loaded 743 IBD
Chromosome 3RL; Loaded 204 IBD
Saved 947 IBD to ../../ag3_results/ibd/AG1000G-BF-A/ibd.tsv.
Chromosome 2RL; Loaded 104 IBD
Chromosome 3RL; Loaded 35 IBD
Saved 139 IBD to ../../ag3_results/ibd/AG1000G-GN-A/ibd.tsv.
Chromosome 2RL; Loaded 108 IBD
Chromosome 3RL; Loaded 61 IBD
Saved 169 IBD to ../../ag3_results/ibd/AG1000G-CF/ibd.tsv.
Chromosome 2RL; Loaded 66 IBD
Chromosome 3RL; Loaded 96 IBD
Saved 162 IBD to ../../ag3_results/ibd/AG1000G-GM-C/ibd.tsv.
Chromosome 2RL; Loaded 515 IBD
Chromosome 3RL; Loaded 661 IBD
Saved 1176 IBD to ../../ag3_results/ibd/AG1000G-MZ/ibd.tsv.
Chromosome 2RL; Loaded 520 IBD
Chromosome 3RL; Loaded 181 IBD
Saved 701 IBD to ../../ag3_results/ibd/AG1000G-CD/ibd.tsv.
Chromosome 2RL; Loaded 11 IBD
Chromosome 3RL; Loaded 11 IBD
Saved 22 IBD to ../../ag3_results/ibd/AG1000G-GQ/ibd.tsv.
Chromosome 2RL; Loaded 68 IBD
C

  df_ibds = pd.concat(df_ibds)


In [7]:
from ancIBD.IO.ind_ibd import create_ind_ibd_df

# Summarise the IBD segments per pair of individuals for each sample set,
# tagging every row with the sample set it came from.
dfs = []
for sample_set in sample_sets:
    pair_summary = create_ind_ibd_df(
        ibd_data=f'../../ag3_results/ibd/{sample_set}/ibd.tsv',
        min_cms=[8, 12, 16, 20],
        snp_cm=1000,
        min_cm=8,
        sort_col=0,
        savepath=f'../../ag3_results/ibd/{sample_set}/ibd_d220.tsv',
    )
    dfs.append(pair_summary.assign(sample_set=sample_set))

# One table covering all sample sets.
dfs_ibd = pd.concat(dfs)

> 8 cM: 163/163
Of these with suff. SNPs per cM> 1000:               163/163
ch
2RL    125
3RL     38
Name: count, dtype: int64
Saved 107 individual IBD pairs to: ../../ag3_results/ibd/AG1000G-CM-C/ibd_d220.tsv
> 8 cM: 947/947
Of these with suff. SNPs per cM> 1000:               947/947
ch
2RL    743
3RL    204
Name: count, dtype: int64
Saved 904 individual IBD pairs to: ../../ag3_results/ibd/AG1000G-BF-A/ibd_d220.tsv
> 8 cM: 139/139
Of these with suff. SNPs per cM> 1000:               139/139
ch
2RL    104
3RL     35
Name: count, dtype: int64
Saved 124 individual IBD pairs to: ../../ag3_results/ibd/AG1000G-GN-A/ibd_d220.tsv
> 8 cM: 169/169
Of these with suff. SNPs per cM> 1000:               169/169
ch
2RL    108
3RL     61
Name: count, dtype: int64
Saved 159 individual IBD pairs to: ../../ag3_results/ibd/AG1000G-CF/ibd_d220.tsv
> 8 cM: 162/162
Of these with suff. SNPs per cM> 1000:               162/162
ch
3RL    96
2RL    66
Name: count, dtype: int64
Saved 156 individual IBD pairs t

In [8]:
import malariagen_data
import matplotlib.pyplot as plt
# Ag3 data-access client; it is also passed as `self` to the plotting helpers further down.
ag3 = malariagen_data.Ag3()
# Metadata restricted to the sample sets that have IBD results.
df_samples = ag3.sample_metadata(sample_sets=sample_sets)

                                     

Check that we now have the correct number of pairwise comparisons

In [None]:
# Label each sample with "<sample_set>-<taxon>" so pairs can be grouped by cohort.
df_samples = df_samples.assign(set_taxon=lambda x: x.sample_set + "-" + x.taxon)

from itertools import combinations

# Add the pairs that share no IBD (filled with 0) and attach metadata for both samples.
# NOTE(review): `max_IBD` is renamed to `total_ibd_cm`; the column name suggests it is the
# longest segment rather than a total — confirm that a `sum_IBD>…` column is not the intended one.
df_ibd_summary = add_zero_ibd_pairs(dfs_ibd, df_samples, fillna=0).rename(columns={'max_IBD':'total_ibd_cm'})

# Check the pair counts BEFORE any filtering: every sample set with n samples must
# contribute exactly n * (n - 1) / 2 rows. (Running this after the small-group filter
# below would fail as soon as a rare taxon is dropped.)
# presumably `sample_set` here is the column contributed by the last metadata merge — verify.
n_samples_per_set = df_samples.groupby('sample_set').size()
expected_pairs = n_samples_per_set * (n_samples_per_set - 1) // 2
observed_pairs = (
    df_ibd_summary.groupby('sample_set').size()
    .reindex(expected_pairs.index, fill_value=0)
)
assert (observed_pairs == expected_pairs).all(), "unexpected number of pairwise comparisons"

# Pairs whose two samples carry the same sample-set/taxon label keep that label;
# every other pair is labelled 'between-species'. Vectorised, so it does not depend
# on the frame having a 0..n-1 index and avoids one label lookup per row.
same_set_taxon = df_ibd_summary['set_taxon_x'] == df_ibd_summary['set_taxon_y']
df_ibd_summary = df_ibd_summary.assign(
    set_taxon=df_ibd_summary['set_taxon_y'].where(same_set_taxon, 'between-species')
)

# Remove combos with few observations.
df_ibd_summary = df_ibd_summary.groupby('set_taxon').filter(lambda g: len(g) >= 100)

In [None]:
print("plotting")

# One box per sample-set/taxon group, coloured by the country of the first sample in the pair.
box_data = df_ibd_summary.sort_values('set_taxon')
fig = px.box(
    box_data,
    x='set_taxon',
    y='total_ibd_cm',
    color='country_x',
    title='total shared IBD within sample sets / taxon combo',
    template='simple_white',
    width=1000,
    height=700,
)
fig

In [None]:
# Per group: summed `total_ibd_cm` divided by the number of pairs in the group.
pairs_by_group = df_ibd_summary.groupby('set_taxon')
ibd_rate = pairs_by_group.agg({'total_ibd_cm':'sum'})
ibd_rate['ibd_rate'] = ibd_rate['total_ibd_cm'] / pairs_by_group.size()
ibd_rate = ibd_rate.reset_index()

bar_data = ibd_rate.sort_values('set_taxon')
fig = px.bar(
    bar_data,
    x='set_taxon',
    y='ibd_rate',
    title='Distribution of total shared IBD between individuals in ag3',
    template='simple_white',
    width=800,
    height=500,
)
fig

### Plotting with bokeh

In [None]:
def plot_ibd_segments_contig(
    self,
    df,
    contig, 
    filter_cm=None,
    show=False,
    width=None,
    height=400
):
    """Draw the IBD segments of one contig as horizontal bars, one row per pair.

    Parameters
    ----------
    self : object
        Data-access object providing ``genome_sequence(contig)``; the length of
        the returned sequence is used as the contig length.
    df : pandas.DataFrame
        IBD segments with columns ``ch``, ``iid1``, ``iid2``, ``StartBP``,
        ``EndBP`` and ``lengthM`` (plus ``cm`` when ``filter_cm`` is used).
    contig : str
        Contig to plot; rows with ``ch == contig`` are kept.
    filter_cm : number, optional
        When truthy, only rows with ``cm > filter_cm`` are plotted.
    show : bool, default False
        Display the figure with ``bokeh.plotting.show``.
    width : int, optional
        Figure width. Defaults to the contig length divided by 200000.
    height : int, default 400
        Figure height.

    Returns
    -------
    bokeh figure
    """
    import bokeh.plotting as bkplt
    import bokeh.models as bkmod

    df = df.query("ch == @contig")

    if filter_cm:
        # NOTE(review): this filters on a `cm` column while the tooltip below reads
        # `lengthM` — confirm that `df` really carries both columns.
        df = df.query("cm > @filter_cm")

    print("finding levels")
    # Give every distinct pair its own vertical position, evenly spread over [0, 1].
    # `assign` builds a new frame instead of writing into the de-duplicated slice.
    pair_levels = df[['iid1', 'iid2']].drop_duplicates()
    pair_levels = pair_levels.assign(level=np.linspace(0, 1, pair_levels.shape[0]))
    df = df.merge(pair_levels, on=['iid1', 'iid2'], how='left')

    # Colouring by `ibd_type` (half-ibd grey / full-ibd blue) is currently disabled.

    source = bkmod.ColumnDataSource(data={
        'sample_id1': df['iid1'].to_numpy(),
        'sample_id2': df['iid2'].to_numpy(),
        'chromosome': df['ch'].to_numpy(),
        'start': df['StartBP'].to_numpy(),
        'end': df['EndBP'].to_numpy(),
        # NOTE(review): labelled `cm_length` but read from `lengthM` — confirm the units.
        'cm_length': df['lengthM'].to_numpy(),
        'bottoms': df['level'].to_numpy(),
        # Bars are given a tiny thickness; the visible stroke comes from the line width.
        'tops': df['level'].to_numpy()+0.0001,
    })

    hover = bkmod.HoverTool(tooltips=[
             ("sample_id1", '@sample_id1'),
             ("sample_id2", '@sample_id2'),
             ('cm_length', '@cm_length'),
            ("segment span", "@start{,} - @end{,}"),
        ])

    print("making figure")
    # Fetch the contig length once; it is needed for the default width and the x range.
    contig_length = self.genome_sequence(contig).shape[0]
    if not width:
        width = int(contig_length/200000)
    fig1 = bkplt.figure(title=contig,
                        width=width,
                        # Honour the `height` argument (it used to be ignored in favour of 500).
                        height=height,
                        tools="tap,box_zoom,xpan,xzoom_in,xzoom_out,xwheel_zoom,reset".split(",") + [hover],
                        toolbar_location='above', active_drag='xpan', active_scroll='xwheel_zoom',
                        output_backend="webgl")

    glyph = bkmod.Quad(left='start', right='end', bottom='bottoms', top='tops', line_color="grey", line_alpha=.8, line_width=1)
    fig1.add_glyph(source, glyph)

    fig1.x_range = bkmod.Range1d(0, contig_length, bounds='auto')
    fig1.y_range = bkmod.Range1d(0, 1, bounds='auto')
    fig1.x_range.max_interval = contig_length
    fig1.yaxis.visible = False
    fig1.ygrid.visible = False
    _bokeh_style_genome_xaxis(fig1, contig)

    if show:
        bkplt.show(fig1)

    return fig1

def plot_ibd_segments(
        self,
        df, 
        out_dir,
        cohort_id,
        contigs=('2RL', '3RL', 'X'),
        filter_cm=None,
        show=True,
        title=None,
        height=1000
    ):
    """Plot the IBD segments of several contigs side by side in one bokeh grid.

    Parameters
    ----------
    self : object
        Data-access object forwarded to ``plot_ibd_segments_contig``.
    df : pandas.DataFrame
        IBD segments, forwarded to ``plot_ibd_segments_contig``.
    out_dir : str or None
        When truthy, the grid is saved to ``<out_dir>/<cohort_id>_segments.html``.
    cohort_id : str
        Used to build the output file name.
    contigs : sequence of str
        Contigs to plot, one grid column per contig.
    filter_cm : number, optional
        Forwarded to ``plot_ibd_segments_contig``.
    show : bool, default True
        Display the grid with ``bokeh.plotting.show``.
    title : str, optional
        Title passed to ``bokeh.plotting.output_file``.
    height : int, default 1000
        Height passed to ``bokeh.layouts.gridplot``.

    Returns
    -------
    The bokeh grid layout.
    """
    import os
    import bokeh.layouts as bklay
    import bokeh.plotting as bkplt
    from tqdm.notebook import tqdm

    figs = [
            plot_ibd_segments_contig(
                self=self,
                df=df,
                contig=contig,
                filter_cm=filter_cm,
                ) 
            for contig in tqdm(contigs)
            ]

    if out_dir:
        # Join path components properly so `out_dir` works with or without a trailing separator.
        out_path = os.path.join(out_dir, cohort_id + "_segments.html")
        bkplt.output_file(filename=out_path, title=title)

    fig = bklay.gridplot(
        figs,
        ncols=len(contigs),
        toolbar_location="above",
        merge_tools=True,
        height=height
    ) 

    if out_dir:
        bkplt.save(fig)

    if show:
        bkplt.show(fig)

    return fig
    
def _bokeh_style_genome_xaxis(fig, contig):
    import bokeh.models as bkmod
    """Standard styling for X axis of genome plots."""
    fig.xaxis.axis_label = f"Contig {contig} position (bp)"
    fig.xaxis.ticker = bkmod.AdaptiveTicker(min_interval=1)
    fig.xaxis.minor_tick_line_color = None
    fig.xaxis[0].formatter = bkmod.NumeralTickFormatter(format="0,0")

In [None]:
# for cr in df_crosses.cross.unique()[:1]:
#     df = df_crosses.query("cross == @cr").dropna()
#     father, mother = df.iloc[0, [2,3]]
#     sibs = df.sample_id
    
#     df2  = df_ibd.query("iid1 in @sibs and iid2 in @sibs")

# NOTE(review): `df_ibd` (with a `relationship` column) is not defined in any cell
# visible here — confirm where it is created before running on a fresh kernel.
segments_sorted = df_ibd.sort_values('relationship')

# Interactive view of segments on 2RL and 3RL; nothing is written to disk (out_dir=None).
fig = plot_ibd_segments(
    self=ag3,
    df=segments_sorted,
    out_dir=None,
    cohort_id='AG1000G-X',
    contigs=('2RL', '3RL'),
    filter_cm=10,
    show=True,
    height=1200,
)