In [5]:
import os
import collections as col
import importlib
import re

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.lines as lines

# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded; import it explicitly so the attribute lookups below
# cannot fail in environments that do not happen to preload it.
import importlib.util

# Load the plotting helper module directly from its source file
plot_aux_module = '/home/local/work/code/github/project-diploid-assembly/notebooks/aux_mods/plot_aux.py'
plot_aux_spec = importlib.util.spec_from_file_location("plot_aux", plot_aux_module)
plot_aux = importlib.util.module_from_spec(plot_aux_spec)
plot_aux_spec.loader.exec_module(plot_aux)

# Annotation objects provided by plot_aux; `samples` is indexed by sample
# name further down, `prop` is called with a property name.
samples = plot_aux.load_sample_table()
hexcodes, rgbcodes, popmap = plot_aux.load_population_annotation()
prop = plot_aux.get_plot_property

# Switches: write figure files / print the summary statistics
save_plots = False
print_stats = True

pipeline_version = 'v12'

# Input folder, figure output prefix (no file extension) and HDF cache file
stats_path = '/home/local/work/data/hgsvc/figSX_panels/contig_stats'
out_path = os.path.join(stats_path, pipeline_version, 'figSX_contig_aln_stats')
cache_file = os.path.join(stats_path, 'cache_{}.h5'.format(pipeline_version))


def split_cluster_alignment_info(record, index):
    """
    Parse one 'top<index>_alignment' field into a flat dictionary.

    A string record is split on '|' and its fields are cast, in order, to
    str, int, float and float. A missing value (as judged by pd.isna) is
    replaced by the placeholder fields 'no_align', 0, 0, 0.

    Args:
        record: '|'-separated string, or a missing value (e.g. NaN).
        index: number inserted into the key names ('top{index}_...').

    Returns:
        dict with the keys 'top{index}_cluster_name',
        'top{index}_cluster_length', 'top{index}_cluster_align_pct' and
        'top{index}_ref_align_pct'. Fields and keys are paired by position;
        surplus fields are ignored and absent fields yield no key.

    Raises:
        ValueError: if record is neither a string nor a missing value, or
            if a field cannot be cast to its target type.
    """
    names = [
        'top{}_cluster_name',
        'top{}_cluster_length',
        'top{}_cluster_align_pct',
        'top{}_ref_align_pct'
    ]
    keys = [n.format(index) for n in names]
    cast_types = [str, int, float, float]
    if isinstance(record, str):
        values = record.split('|')
    elif pd.isna(record):
        values = ['no_align', '0', '0', '0']
    else:
        # explicit raise instead of assert: assertions are removed under
        # "python -O", which would let invalid records pass as "no_align"
        raise ValueError('Error: {}'.format(record))
    return {k: t(v) for k, v, t in zip(keys, values, cast_types)}


def extract_assembly_haplotype(filename):
    """
    Derive the (assembly, haplotype) labels from tags in a file name.

    The first matching tag wins:
    'h1-un' -> ('HAP', 10), 'h2-un' -> ('HAP', 20), 'h1' -> ('HAP', 1),
    'h2' -> ('HAP', 2), 'nhr' -> ('NHR', 0), and finally
    'scV<pipeline version without "v">' -> ('NHR', 0).

    Raises:
        ValueError: if none of the tags occurs in the file name.
    """
    # Order matters: the '-un' tags contain the plain 'h1'/'h2' tags
    ordered_tags = (
        ('h1-un', ('HAP', 10)),
        ('h2-un', ('HAP', 20)),
        ('h1', ('HAP', 1)),
        ('h2', ('HAP', 2)),
        ('nhr', ('NHR', 0)),
    )
    for tag, labels in ordered_tags:
        if tag in filename:
            return labels

    # The version dependent tag is only built when nothing else matched
    version_tag = 'scV{}'.format(pipeline_version.strip('v'))
    if version_tag in filename:
        return 'NHR', 0

    raise ValueError('cannot extract assembly {}'.format(filename))


def load_contig_align_stats(path):
    """
    Load all '*mapq60.stats' tables of a folder into a single DataFrame.

    Each file is read as a tab-separated table ('#' is stripped from the
    column names). The columns 'top1_alignment' .. 'top3_alignment' are
    expanded with split_cluster_alignment_info, and only 'ref_length' plus
    the expanded 'top{1,2,3}_*' columns are kept as data columns.

    Sample and platform come from plot_aux.extract_sample_platform,
    assembly and haplotype from extract_assembly_haplotype, and the
    population labels from the module-level `samples` table.

    Args:
        path: folder containing the stats files.

    Returns:
        DataFrame with a MultiIndex of (chrom, sample, population,
        super_pop, platform, assembly, hap); 'chrom' is the 'ref_seq'
        column of the input. Files are processed in sorted name order.

    Raises:
        ValueError: if the folder holds no file ending in 'mapq60.stats'.
    """
    index_infos = [
        'chrom',
        'sample',
        'population',
        'super_pop',
        'platform',
        'assembly',
        'hap'
    ]

    per_sample = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result (and of the cache built from it) reproducible
    for stats_file in sorted(os.listdir(path)):
        if not stats_file.endswith('mapq60.stats'):
            continue
        sample, platform = plot_aux.extract_sample_platform(
            stats_file,
            mapped_readset=True,
            long_read_pos=1
        )
        assm, hap = extract_assembly_haplotype(stats_file)
        file_path = os.path.join(path, stats_file)

        df = pd.read_csv(file_path, sep='\t')
        df.columns = [c.strip('#') for c in df.columns]

        # one dict per table row, filled with the fields of all three
        # top alignments (key order: top1_*, top2_*, top3_*)
        top_alignments = [dict() for _ in range(df.shape[0])]
        for i in range(1, 4):
            top_aln = df['top{}_alignment'.format(i)].apply(split_cluster_alignment_info, args=(i,))
            for merged, current in zip(top_alignments, top_aln):
                merged.update(current)

        top_alignments = pd.DataFrame.from_records(top_alignments)
        # copy: columns are added below, avoid writing into a slice of df
        df = df[['ref_seq', 'ref_length']].copy()
        df = pd.concat([df, top_alignments], axis=1)
        df['sample'] = sample
        df['platform'] = platform
        df['super_pop'] = samples[sample]['super_population']
        df['population'] = samples[sample]['population']
        df['assembly'] = assm
        df['hap'] = hap
        df.rename(columns={'ref_seq': 'chrom'}, inplace=True)

        # move the annotation columns into a MultiIndex
        df = df.set_index(index_infos)
        per_sample.append(df)

    if not per_sample:
        raise ValueError(
            'no files ending in "mapq60.stats" found in {}'.format(path)
        )

    return pd.concat(per_sample, axis=0)


# Build the HDF cache on first use; the table is always read back from disk
if not os.path.isfile(cache_file):
    df = load_contig_align_stats(os.path.join(stats_path, 'clustered'))
    df.to_hdf(cache_file, key='cache', mode='w', format='fixed')

df = pd.read_hdf(cache_file, key='cache')

# Restrict the table to chr1-chr22 and chrX
main_chroms = ['chr{}'.format(number) for number in range(1, 23)] + ['chrX']

df = df[df.index.isin(main_chroms, level='chrom')]

def _mean_band_coverage(sample_stats, platform):
    """Mean 'coverage' per (chrom, start, name) for one platform, sorted by chromosome and start."""
    # xs keys are passed as a tuple; list keys are rejected by recent pandas
    subset = sample_stats.xs(
        (platform, 'contig', 'cytoband', 'fraction'),
        level=['platform', 'input_type', 'region_type', 'stat_type'],
        drop_level=True
    )
    band_cov = pd.DataFrame(subset.groupby(['chrom', 'start', 'name'])['coverage'].mean())
    # numeric chromosome rank so that chr2 sorts before chr10; chrX goes last
    band_cov['sort_order'] = band_cov.index.get_level_values('chrom').map(
        lambda x: 23 if x == 'chrX' else int(x.strip('chr'))
    )
    band_cov.sort_values(['sort_order', 'start'], ascending=True, inplace=True)
    return band_cov


def plot_contig_coverage(sample_stats):
    """
    Plot the mean coverage per band for the CLR and HiFi platforms.

    Args:
        sample_stats: DataFrame with a 'coverage' column whose index has the
            levels 'platform', 'input_type', 'region_type' and 'stat_type';
            'chrom', 'start' and 'name' must be available for grouping.
            Rows with ('contig', 'cytoband', 'fraction') are selected per
            platform and averaged per (chrom, start, name). Band names must
            match '<chrom><p|q><segment>', e.g. 'chr1p36.33'.

    Returns:
        (fig, extra_artists): the matplotlib figure and a list holding the
        legend.

    Raises:
        ValueError: if a band name does not match the expected pattern.
    """
    clr_cov = _mean_band_coverage(sample_stats, 'CLR')
    hifi_cov = _mean_band_coverage(sample_stats, 'HiFi')

    fig, axis = plt.subplots(figsize=(16, 8))

    # CLR on even, HiFi on odd x positions. float64 on purpose: float16
    # cannot represent the "+ 0.5" offsets used below for positions >= 1024.
    x_vals_clr = np.arange(0, clr_cov.shape[0] * 2, 2, dtype=np.float64)
    x_vals_hifi = x_vals_clr + 1

    axis.errorbar(
        x_vals_clr,
        clr_cov['coverage'].values,
        yerr=None,
        marker=prop('CLR_marker'),
        color='black',
        linestyle='',
        markersize=4
    )

    axis.errorbar(
        x_vals_hifi,
        hifi_cov['coverage'].values,
        yerr=None,
        marker=prop('HiFi_marker'),
        color='red',
        linestyle='',
        markersize=4,
        alpha=0.75
    )

    x_ticks = []
    x_ticklabels = []
    vlines_boundary = []
    vlines_cen = []
    cytogenic_bands = clr_cov.index.get_level_values('name')
    last_chrom = cytogenic_bands[0].split('p')[0]
    last_arm = 'p'
    # raw string: '\.' is not a valid escape sequence in a normal literal
    extract = re.compile(r'(?P<chrom>chr[0-9X]+)(?P<arm>(p|q))(?P<segment>[0-9\.]+)')
    for x_val, band_id in zip(x_vals_hifi, cytogenic_bands):
        mobj = extract.match(band_id)
        if mobj is None:
            raise ValueError(band_id)
        chrom = mobj.group('chrom')
        arm = mobj.group('arm')
        if chrom != last_chrom:
            # first band of a new chromosome: mark the boundary, reset the arm
            vlines_boundary.append(x_val + 0.5)
            last_chrom = chrom
            last_arm = 'p'
        elif arm != last_arm:
            # arm switch within a chromosome: dotted line and tick label here
            x_ticks.append(x_val + 0.5)
            x_ticklabels.append(chrom)
            vlines_cen.append(x_val + 0.5)
            last_arm = arm

    axis.vlines(vlines_boundary, 90, 105, color='black', linestyle='solid', zorder=0)
    axis.vlines(vlines_cen, 0, 100, color='dimgrey', linestyle='dotted', zorder=0)

    axis.set_xticks(x_ticks)
    axis.set_xticklabels(x_ticklabels, rotation=90)

    axis.set_xlabel(
        'Genomic location',
        fontsize=prop('fontsize_legend')
    )
    axis.set_ylabel(
        'Phased assembly contig coverage\n(average % bp)',
        fontsize=prop('fontsize_legend')
    )
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)

    axis.tick_params(
        axis='both',
        which='major',
        labelsize=prop('fontsize_axis_ticks')
    )

    # build custom legend
    custom_lines = [
        lines.Line2D(
            [0], [0],
            color='black',
            markersize=prop('legend_marker_size'),
            marker=prop('CLR_marker'),
            ls='None',
            label='CLR',
        ),
        lines.Line2D(
            [0], [0],
            color='red',
            markersize=prop('legend_marker_size'),
            marker=prop('HiFi_marker'),
            ls='None',
            label='HiFi',
        )
    ]

    lgd = axis.legend(
        handles=custom_lines,
        prop={'size': prop('fontsize_legend')},
        bbox_to_anchor=(0.95, 0.75)
    )

    # NOTE(review): `missing` is not assigned in this function; it is
    # presumably a module-level flag set elsewhere - confirm it exists,
    # otherwise this line raises NameError.
    if missing:
        plot_aux.add_incomplete_stamp(axis, 0.5, 0.85)

    extra_artists = [lgd]

    return fig, extra_artists


if save_plots:
    # NOTE(review): plot_contig_coverage selects the index levels
    # 'input_type', 'region_type' and 'stat_type', which
    # load_contig_align_stats does not create - confirm `df` is the
    # intended input before enabling save_plots.
    fig, exart = plot_contig_coverage(df)
    # savefig takes additional artists for the tight bounding box via
    # `bbox_extra_artists`; `extra_artists` is not a savefig keyword
    fig.savefig(out_path + '.png', dpi=600, bbox_inches='tight', bbox_extra_artists=exart)
    fig.savefig(out_path + '.svg', bbox_inches='tight', bbox_extra_artists=exart)

if print_stats:
    # summary of the top1 cluster alignment percentage: all rows, then per platform
    print('=== STATS SUMMARY ===')
    print('= ALL')
    print(df['top1_cluster_align_pct'].describe())
    print('---')
    subset_clr = df.xs('CLR', level='platform')
    print('= CLR')
    print(subset_clr['top1_cluster_align_pct'].describe())
    print('---')
    subset_hifi = df.xs('HiFi', level='platform')
    print('= HiFi')
    print(subset_hifi['top1_cluster_align_pct'].describe())
    



=== STATS SUMMARY ===
= ALL
count    966.000000
mean      98.433230
std        4.568985
min       36.250000
25%       98.710000
50%       99.410000
75%       99.760000
max      100.550000
Name: top1_cluster_align_pct, dtype: float64
---
= CLR
count    690.000000
mean      98.972464
std        1.982397
min       75.690000
25%       98.982500
50%       99.490000
75%       99.830000
max      100.550000
Name: top1_cluster_align_pct, dtype: float64
---
= HiFi
count    276.000000
mean      97.085145
std        7.800947
min       36.250000
25%       97.757500
50%       99.095000
75%       99.562500
max       99.960000
Name: top1_cluster_align_pct, dtype: float64
