In [5]:
import os
import collections as col
import importlib
import importlib.util

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.lines as lines

# Load the plotting helper module from an explicit file path and execute it,
# so that its functions are available under the name ``plot_aux``.
plot_aux_module = '/home/local/work/code/github/project-diploid-assembly/notebooks/aux_mods/plot_aux.py'
plot_aux_spec = importlib.util.spec_from_file_location("plot_aux", plot_aux_module)
plot_aux = importlib.util.module_from_spec(plot_aux_spec)
plot_aux_spec.loader.exec_module(plot_aux)

# Shared annotation tables and the plot-property getter used by the code below.
samples = plot_aux.load_sample_table()
hexcodes, rgbcodes, popmap = plot_aux.load_population_annotation()
prop = plot_aux.get_plot_property
pipeline_version = 'v12'

# ``stats_path`` names the TSV table itself. Output and cache files must be
# placed next to it: joining further components onto a file path yields a
# location that can never exist, so the cache check would always fail.
stats_path = '/home/local/work/data/hgsvc/figSX_panels/hapreg_jaccard/hap_regs_jaccard.tsv'
stats_dir = os.path.dirname(stats_path)
out_path = os.path.join(stats_dir, pipeline_version, 'figSX_panel_hapreg_jaccard')
cache_file = os.path.join(stats_dir, 'cache_{}.h5'.format(pipeline_version))


def load_jaccard_stats(file_path):
    """Read a tab-separated Jaccard table and add plotting annotations.

    The table must provide the columns ``sample_a``/``sample_b``,
    ``platform_a``/``platform_b`` and ``haplotype_a``/``haplotype_b``.
    Family and super population are looked up per sample in the module-level
    ``samples`` table.

    Added columns:

    * ``family_a``, ``family_b``, ``super_a``, ``super_b``
    * ``comparison`` - integer code, assigned in this order (later rules
      overwrite earlier ones):
      1 = same sample, different platform;
      2 = different sample, same family;
      4 = same sample, different haplotype;
      3 = every row not matched by any rule above.
    * ``marker`` - ``prop('CLR_marker')`` / ``prop('HiFi_marker')`` if both
      sides use that platform, otherwise ``'s'``.
    * ``color`` - ``hexcodes[super_a]`` if both sides share the super
      population, otherwise ``'#000000'``.

    Returns the annotated DataFrame.
    """
    table = pd.read_csv(file_path, sep='\t', index_col=False)

    def annotate(sample_column, key):
        # per-sample lookup in the global sample table
        return table[sample_column].map(lambda name: samples[name][key])

    # defaults; refined below
    table['comparison'] = 0
    table['marker'] = 's'
    table['color'] = '#000000'
    table['family_a'] = annotate('sample_a', 'family')
    table['family_b'] = annotate('sample_b', 'family')
    table['super_a'] = annotate('sample_a', 'super_population')
    table['super_b'] = annotate('sample_b', 'super_population')

    same_sample = table['sample_a'] == table['sample_b']
    same_family = table['family_a'] == table['family_b']
    other_platform = table['platform_a'] != table['platform_b']
    other_haplotype = table['haplotype_a'] != table['haplotype_b']

    # The order of these assignments matters: code 4 is applied after
    # code 1 and therefore wins for rows matching both.
    table.loc[same_sample & other_platform, 'comparison'] = 1   # HiFi vs CLR
    table.loc[~same_sample & same_family, 'comparison'] = 2     # parent vs child
    table.loc[same_sample & other_haplotype, 'comparison'] = 4  # hap1 vs hap2
    table.loc[table['comparison'] == 0, 'comparison'] = 3       # any other

    # platform-specific marker only if both sides agree on the platform
    for platform in ('CLR', 'HiFi'):
        both_sides = (table['platform_a'] == platform) & (table['platform_b'] == platform)
        table.loc[both_sides, 'marker'] = prop('{}_marker'.format(platform))

    # population color only if both samples come from the same super pop
    same_super = table['super_a'] == table['super_b']
    table.loc[same_super, 'color'] = \
        table.loc[same_super, 'super_a'].map(lambda pop: hexcodes[pop])

    return table


def extract_assembly_type(file_name):
    """Derive the assembly type and haplotype number from a file name.

    The checks run in the order listed and the first match wins:

    * ``'_nhr-'`` in the name -> ``('NHR', 0)``
    * ``'h1-un'`` in the name -> ``('HAP', 1)``
    * ``'h2-un'`` in the name -> ``('HAP', 2)``

    Returns:
        A 2-tuple ``(assembly_type, haplotype)``.

    Raises:
        ValueError: if the name contains none of the markers above.
    """
    if '_nhr-' in file_name:
        return 'NHR', 0
    if 'h1-un' in file_name:
        return 'HAP', 1
    if 'h2-un' in file_name:
        return 'HAP', 2
    raise ValueError('ASSMT: {}'.format(file_name))


# Build the HDF5 cache only if it does not exist yet; the table is always
# (re-)loaded from the cache file afterwards.
if not os.path.isfile(cache_file):
    # NOTE(review): stats_path is assigned a '.tsv' file path near the top of
    # this file, but is passed here together with a '.txt' extension as if it
    # were a directory to search - confirm what load_plot_data_files expects.
    stat_files = plot_aux.load_plot_data_files(
        stats_path,
        '.txt',
        pipeline_version)
    # NOTE(review): load_assembly_stats is not defined in the visible part of
    # this file (only load_jaccard_stats is) - confirm it exists elsewhere.
    plot_data = [load_assembly_stats(fp) for fp in stat_files]

    # Sample/platform and assembly type/haplotype are parsed from the base
    # name of each stats file.
    sample_platform = [plot_aux.extract_sample_platform(os.path.basename(f)) for f in stat_files]
    assembly_types = [extract_assembly_type(os.path.basename(f)) for f in stat_files]
    # One index tuple per stats file, in the same order as plot_data.
    row_index = []
    for (sample, platform), (assmt, hap) in zip(sample_platform, assembly_types):
        super_pop = samples[sample]['super_population']
        pop = samples[sample]['population']
        row_index.append((sample, super_pop, pop, platform, assmt, hap))

    # Columns are taken from the keys of the first record, in sorted order
    # (presumably each record is a dict of statistics - TODO confirm).
    df = pd.DataFrame(
        plot_data,
        columns=sorted(plot_data[0].keys()),
        index=row_index
    )

    # Turn the tuple index into a named MultiIndex so that levels can be
    # addressed by name later on.
    df.index = pd.MultiIndex.from_tuples(
        df.index.values,
        names=['sample', 'super_pop', 'pop', 'platform', 'assembly', 'hap']
    )
    df.to_hdf(cache_file, key='cache', mode='w', format='fixed')
    
df = pd.read_hdf(cache_file, 'cache')
# Sort rows by all index levels, in place.
df.sort_index(axis=0, inplace=True, level=['sample', 'super_pop', 'pop', 'platform', 'assembly', 'hap'])        

    
def plot_assembly_contiguity(sample_stats):
    """Scatter haploid contig N50 against squashed contig N50.

    ``sample_stats`` must carry a six-level index in the order
    (sample, super_pop, pop, platform, assembly, hap), with a level named
    ``'assembly'``, and an ``N50`` column. For every ``'NHR'`` row, the
    matching ``('HAP', 1)`` and ``('HAP', 2)`` rows of the same sample and
    platform are looked up; all N50 values are divided by 1e6 and rounded to
    two decimals before plotting.

    Returns:
        A tuple ``(fig, extra_artists)`` where ``extra_artists`` is an
        empty list.
    """

    def n50_as_mbp(value):
        # divide by 1e6, keep two decimals
        return round(value / 1e6, 2)

    squashed_rows = sample_stats.xs('NHR', level='assembly', drop_level=False)

    squashed_n50 = col.defaultdict(list)   # platform -> x values
    haploid_n50 = col.defaultdict(list)    # (platform, 'HAP1'/'HAP2') -> y values
    point_colors = col.defaultdict(list)   # platform -> marker colors

    for row_key, record in squashed_rows.iterrows():
        sample, super_pop, population, platform = row_key[:4]

        squashed_n50[platform].append(n50_as_mbp(record['N50']))

        for hap_number in (1, 2):
            hap_key = (sample, super_pop, population, platform, 'HAP', hap_number)
            haploid_n50[(platform, 'HAP{}'.format(hap_number))].append(
                n50_as_mbp(sample_stats.loc[hap_key, 'N50'])
            )

        point_colors[platform].append(rgbcodes[population])

    fig, axis = plt.subplots(figsize=(8, 8))

    # one scatter call per platform and haplotype; both haplotypes share
    # the x values and colors of their platform
    for platform in plot_aux.get_sequencing_platforms():
        for hap_label in ('HAP1', 'HAP2'):
            axis.scatter(
                squashed_n50[platform],
                haploid_n50[(platform, hap_label)],
                s=prop('plot_marker_size'),
                c=point_colors[platform],
                marker=prop('{}_marker'.format(platform))
            )

    axis.set_xlabel('Squashed assembly contig N50 (Mbp)', fontsize=prop('fontsize_legend'))
    axis.set_ylabel('Haploid assembly contig N50 (Mbp)', fontsize=prop('fontsize_legend'))

    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

    axis.tick_params(axis='both', which='major', labelsize=prop('fontsize_axis_ticks'))

    # build custom legend: one black marker per sequencing technology
    legend_handles = [
        lines.Line2D(
            [0], [0],
            color='black',
            markersize=prop('legend_marker_size'),
            marker=prop('{}_marker'.format(technology)),
            ls='None',
            label=technology
        )
        for technology in ('CLR', 'HiFi')
    ]
    axis.legend(
        handles=legend_handles,
        loc='lower right',
        prop={'size': prop('fontsize_legend')}
    )

    # NOTE(review): 'missing' is neither a parameter nor assigned in this
    # function; it is resolved as a global at call time - confirm that it is
    # defined elsewhere, otherwise this line raises NameError.
    if missing:
        plot_aux.add_incomplete_stamp(axis, 0.5, 0.85)

    return fig, []
        
# Create the figure from the cached statistics table loaded above.
fig, exart = plot_assembly_contiguity(df)
    
# Saving the figure is currently disabled (calls left commented out).
#fig.savefig(out_path + '.png', dpi=600, bbox_inches='tight', extra_artists=exart)
#fig.savefig(out_path + '.svg', bbox_inches='tight', extra_artists=exart)


(0.0, 0.0, 0.0)
(1.0, 1.0, 1.0)
(0.5333333333333333, 0.5333333333333333, 0.5333333333333333)
  assembly_a assembly_b  haplotype_a  haplotype_b  intersect_bp  \
0        HAP        HAP            1            2       2609354   
1        HAP        HAP            1            2       2191934   
2        HAP        HAP            1            1       2244686   
3        HAP        HAP            1            1       2645605   
4        HAP        HAP            1            2         14658   

   intersect_count   jaccard platform_a platform_b sample_a sample_b  \
0             2271  0.092653        CLR        CLR  HG00096  HG00171   
1             1930  0.080305        CLR       HiFi  HG00096  HG00512   
2             2059  0.081590        CLR       HiFi  HG00096  HG00512   
3             2286  0.095327        CLR        CLR  HG00096  HG00171   
4               34  0.000474        CLR        CLR  HG00096  HG00096   

   union_bp  comparison marker    color family_a family_b super_a super

RuntimeError: No active exception to reraise