In [16]:
import os
import collections as col
import importlib

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.lines as lines

plot_aux_module = '/home/local/work/code/github/project-diploid-assembly/notebooks/aux_mods/plot_aux.py'
plot_aux_spec = importlib.util.spec_from_file_location("plot_aux", plot_aux_module)
plot_aux = importlib.util.module_from_spec(plot_aux_spec)
plot_aux_spec.loader.exec_module(plot_aux)

samples = plot_aux.load_sample_table()
hexcodes, rgbcodes, popmap = plot_aux.load_population_annotation()

save_plot = False
print_stats = False
dump_minimal_summary = False
dump_tableS9_columns = True
make_test = False
dump_length_stats = False
rel_hap_n50 = False

prop = plot_aux.get_plot_property

pipeline_version = 'v12'

# color_version
# - full
# - super_pop
# - red_black
color_version = 'red_black'

if color_version == 'full' or color_version == 'super_pop':
    rgbcodes['HiFi'] = tuple(map(lambda x: round(x/255, 2), (35, 35, 35)))
    rgbcodes['CLR'] = tuple(map(lambda x: round(x/255, 2), (35, 35, 35)))
elif color_version == 'red_black':
    rgbcodes['HiFi'] = plot_aux.get_platform_color('HiFi')
    rgbcodes['CLR'] = plot_aux.get_platform_color('CLR')
else:
    raise

stats_path = '/home/local/work/data/hgsvc/fig1_panels/busco_stats'
out_path = os.path.join(stats_path, 'quast_busco_table.tsv')
cache_file = os.path.join(stats_path, 'cache_{}.h5'.format(pipeline_version))

# value reported by QUAST (in "quast.log")
# features of type "gene" in GENCODE v31 basic
total_gencode_genes = 60603

keep_infos = {
    4: 'num_contigs_all',
    10: 'total_length_bp',
    16: 'num_contigs_geq_3kb',
    17: 'largest_contig_bp',
    18: 'total_length_geq3kb_bp',
    20: 'GC_pct',
    22: 'N50',
    23: 'NG50',
    24: 'N75',
    25: 'NG75',
    30: 'num_misassemblies',
    40: 'ref_genome_fraction_pct',
    41: 'dup_ratio',
    42: 'Nper100kbp_ratio',
    45: 'gencode_genes',
    46: 'busco_comp_pct',
    47: 'busco_part_pct',
    48: 'largest_alignment_bp',
    49: 'total_aligned_length_bp'
}

def load_assembly_stats(file_path):
    assm_stats = dict()
    with open(file_path, 'r') as table:
        for ln, line in enumerate(table, start=1):
            if ln not in keep_infos:
                continue
            key = keep_infos[ln]
            parts = line.strip().split()
            if key == 'gencode_genes':
                key_complete = key + '_comp_num'
                key_partial = key + '_part_num'
                complete_genes = int(parts[-4])
                partial_genes = int(parts[-2])
                assm_stats[key_complete] = complete_genes
                assm_stats[key_partial] = partial_genes
                cmp_genes_pct = round(complete_genes / total_gencode_genes * 100, 2)
                part_genes_pct = round(partial_genes / total_gencode_genes * 100, 2)
                assm_stats[key_complete.replace('_num', '_pct')] = cmp_genes_pct
                assm_stats[key_partial.replace('_num', '_pct')] = part_genes_pct
                continue
            to_num = int
            if any(key.endswith(x) for x in ['pct', 'ratio']):
                to_num = float
            try:
                assm_stats[key] = to_num(parts[-1])
            except ValueError:
                raise ValueError('Cannot convert number: {} / {} / {}'.format(ln, line.strip(), file_path))
    return assm_stats


def extract_assembly_type(file_name):
    assmt = None
    if '_nhr-' in file_name:
        assmt = 'NHR', 0
    elif 'h1-un' in file_name:
        assmt = 'HAP', 10
    elif 'h2-un' in file_name:
        assmt = 'HAP', 20
    else:
        raise ValueError('ASSMT: {}'.format(file_name))
    if assmt is None:
        raise ValueError('ASSMT: {}'.format(file_name))
    return assmt


if not os.path.isfile(cache_file):
    stat_files = plot_aux.load_plot_data_files(
        stats_path,
        '.txt',
        pipeline_version)
    plot_data = [load_assembly_stats(fp) for fp in stat_files]

    sample_platform = [plot_aux.extract_sample_platform(os.path.basename(f)) for f in stat_files]
    assembly_types = [extract_assembly_type(os.path.basename(f)) for f in stat_files]
    row_index = []
    for (sample, platform), (assmt, hap) in zip(sample_platform, assembly_types):
        super_pop = samples[sample]['super_population']
        pop = samples[sample]['population']
        row_index.append((sample, super_pop, pop, platform, assmt, hap))

    df = pd.DataFrame(
        plot_data,
        columns=sorted(plot_data[0].keys()),
        index=row_index
    )

    df.index = pd.MultiIndex.from_tuples(
        df.index.values,
        names=['sample', 'super_pop', 'pop', 'platform', 'assembly', 'hap']
    )
    df.to_hdf(cache_file, key='cache', mode='w', format='fixed')
    
df = pd.read_hdf(cache_file, 'cache')
df.sort_index(axis=0, inplace=True, level=['sample', 'super_pop', 'pop', 'platform', 'assembly', 'hap'])        

errors, missing = plot_aux.check_cache_consistency(df)
if errors:
    raise ValueErrors('Skip sample(s) in cache: {}'.format(errors))
if missing:
    print(sorted(missing))
    print('---------------')

if print_stats:
    print(df['gencode_genes_comp_pct'].describe())
    print(df['gencode_genes_part_pct'].describe())
    print(df['busco_comp_pct'].describe())
    print(df['busco_part_pct'].describe())
    print(df['Nper100kbp_ratio'].describe())

columns = [
    'busco_comp_pct',
    'busco_part_pct',
    'gencode_genes_comp_pct',
    'gencode_genes_part_pct',
    'Nper100kbp_ratio',
    'GC_pct',
]
summary_header = [
    'Technology',
    'BUSCO (complete, %)',
    'BUSCO (partial, %)',
    'GENCODEv31 genes (complete, %)',
    'GENCODEv31 genes (partial, %)',
    'N per 100 kbp',
    'GC content (%)',
]

    
if dump_minimal_summary:

    mini_summary = []
    mini_summary.append('\t'.join(summary_header))
    for selector in ['CLR', 'HiFi']:
        sub = df.xs(selector, level='platform', axis=0)
        sub = sub[columns]
        stat_line = [selector]
        for avg, med in zip(sub.mean(axis=0), sub.median(axis=0)):
            avg = round(avg, 1)
            med = round(med, 1)
            stat_line.append('{} / {}'.format(avg, med))
        mini_summary.append('\t'.join(stat_line))
    with open('/home/ebertp/mini_summary.tsv', 'w') as dump:
        dump.write('\n'.join(mini_summary) + '\n')
        
if dump_tableS9_columns:
    sub = df[columns]
    sub.sort_index(level=['platform', 'sample', 'hap'], ascending=[True, True, True], inplace=True)
    out_tsv = '/home/ebertp/tableS9_ext.tsv'
    sub.to_csv(
        out_tsv,
        sep='\t',
        index=True,
        header=True
    )
    

raise

raise RuntimeError('Code beyond this point expects data on the '
                   'non-haplotype resolved (NHR) assemblies, which is '
                   'not available for the QUAST/BUSCO evaluation runs.')

def plot_assembly_contiguity(sample_stats):
        
    nhr_assm = sample_stats.xs('NHR', level='assembly', drop_level=False)
    
    x_vals = col.defaultdict(list)
    y_vals = col.defaultdict(list)
    colors = col.defaultdict(list)
    
    for idx, row in nhr_assm.iterrows():
        platform = idx[3]
        population = idx[2]
        super_pop = idx[1]
        
        sqa_n50 = round(row['N50'] / 1e6, 2)
        x_vals[platform].append(sqa_n50)
        
        hap1_index = (idx[0], idx[1], idx[2], idx[3], 'HAP', 10)
        hap1_n50 = round(sample_stats.loc[hap1_index, 'N50'] / 1e6, 2)
        if rel_hap_n50:
            hap1_n50 = round(hap1_n50 / sqa_n50, 2)

        y_vals[(platform, 'HAP1')].append(hap1_n50)
        
        hap2_index = (idx[0], idx[1], idx[2], idx[3], 'HAP', 20)
        hap2_n50 = round(sample_stats.loc[hap2_index, 'N50'] / 1e6, 2)
        if rel_hap_n50:
            hap2_n50 = round(hap2_n50 / sqa_n50, 2)

        y_vals[(platform, 'HAP2')].append(hap2_n50)
        
        if color_version == 'full':
            colors[platform].append(rgbcodes[population])
        elif color_version == 'super_pop':
            colors[platform].append(rgbcodes[super_pop])
        elif color_version == 'red_black':
            colors[platform].append(rgbcodes[platform])
        else:
            raise
        
    
    fig, axis = plt.subplots(figsize=(8,8))
    
    for platform in plot_aux.get_sequencing_platforms():
        for hap in ['HAP1', 'HAP2']:
            nhr_n50 = x_vals[platform]
            hap_n50 = y_vals[(platform, hap)]
            marker_colors = colors[platform]

            axis.scatter(
                nhr_n50,
                hap_n50,
                s=prop('plot_marker_size'),
                c=colors[platform],
                marker=prop('{}_marker'.format(platform))
            )
    
    axis.set_xlabel(
        'Squashed assembly contig N50 (Mbp)',
        fontsize=prop('fontsize_legend')
    )
    if rel_hap_n50:
        axis.set_ylabel(
            'Haploid assembly contig N50 (relative units)',
            fontsize=prop('fontsize_legend')
        )
    else:
        axis.set_ylabel(
            'Haploid assembly contig N50 (Mbp)',
            fontsize=prop('fontsize_legend')
        )
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)
    
    axis.tick_params(
        axis='both',
        which='major',
        labelsize=prop('fontsize_axis_ticks')
    )
    
    # build custom legend
    custom_lines = [
        lines.Line2D(
            [0], [0],
            color=rgbcodes['CLR'],
            markersize=prop('legend_marker_size'),
            marker=prop('CLR_marker'),
            ls='None',
            label='CLR'
        ),
        lines.Line2D(
            [0], [0],
            color=rgbcodes['HiFi'],
            markersize=prop('legend_marker_size'),
            marker=prop('HiFi_marker'),
            ls='None',
            label='HiFi'
        )   
    ]
    
    axis.legend(
        handles=custom_lines,
        loc='upper left',
        prop={'size': prop('fontsize_legend')}
    )
    
    axis.plot(
        [0, 1],
        [0, 1],
        color='darkgrey',
        linestyle='dotted',
        lw='2',
        transform=axis.transAxes
    )
    axis.set_xlim(10, 40)
    axis.set_ylim(10, 40)

    if missing:
        plot_aux.add_incomplete_stamp(axis, 0.5, 0.85)
    
    extra_artists = []
    
    return fig, extra_artists
        
fig, exart = plot_assembly_contiguity(df)


if save_plot:
    if color_version == 'red_black':
        out_path += '_rb'
    if color_version == 'super_pop':
        out_path += '_spop'
    if rel_hap_n50:
        output += '_rel'
    fig.savefig(out_path + '.png', dpi=600, bbox_inches='tight', extra_artists=exart)
    fig.savefig(out_path + '_lowres.png', dpi=150, bbox_inches='tight', extra_artists=exart)
    fig.savefig(out_path + '.svg', bbox_inches='tight', extra_artists=exart)

if dump_length_stats:
    length_stats = os.path.join(stats_path, 'assembly_lengths.tsv')
    df.sort_index(axis=0, level=['platform', 'sample', 'super_pop', 'pop', 'hap'], inplace=True)
    df.to_csv(
        length_stats,
        sep='\t',
        header=True,
        index=True,
        columns=['total_length_bp'],
        mode='w'
    )
    
if print_stats:
    print('=== STATS SUMMARY ===')
    nhr_all = df.xs('NHR', level='assembly', drop_level=False)
    print('NHR ALL stats')
    print(nhr_all['N50'].describe())
    print('==========')
    nhr_clr = df.xs(['CLR', 'NHR'], level=['platform', 'assembly'], drop_level=False)
    print('NHR CLR stats')
    print(nhr_clr['N50'].describe())
    print('% > 35: ', round((nhr_clr['N50'] > 35e6).sum() / nhr_clr.shape[0] * 100, 1))
    print('% > 30: ', round((nhr_clr['N50'] > 30e6).sum() / nhr_clr.shape[0] * 100, 1))
    print('==========')
    nhr_hifi = df.xs(['HiFi', 'NHR'], level=['platform', 'assembly'], drop_level=False)
    print('NHR HiFi stats')
    print(nhr_hifi['N50'].describe())
    print('% > 35: ', round((nhr_hifi['N50'] > 35e6).sum() / nhr_hifi.shape[0] * 100, 1))
    print('% > 30: ', round((nhr_hifi['N50'] > 30e6).sum() / nhr_hifi.shape[0] * 100, 1))
    print('==========')

    print('==========\n=========')
    hap_all = df.xs('HAP', level='assembly', drop_level=False)
    print('HAP ALL stats')
    print(hap_all['N50'].describe())
    print('==========')
    hap_clr = df.xs(['CLR', 'HAP'], level=['platform', 'assembly'], drop_level=False)
    print('================')
    print('HAP CLR stats')
    print(hap_clr['N50'].describe())
    print('% > 35: ', (hap_clr['N50'] > 35e6).sum() / hap_clr.shape[0] * 100)
    print('% > 33: ', (hap_clr['N50'] > 33e6).sum() / hap_clr.shape[0] * 100)
    print('% > 32: ', (hap_clr['N50'] > 32e6).sum() / hap_clr.shape[0] * 100)
    print('% > 30: ', (hap_clr['N50'] > 30e6).sum() / hap_clr.shape[0] * 100)
    print('# above 35 Mbp: ', (hap_clr['N50'] > 35e6).sum())
    print('# above 33 Mbp: ', (hap_clr['N50'] > 33e6).sum())
    print('# above 32 Mbp: ', (hap_clr['N50'] > 32e6).sum())
    print('# above 30 Mbp: ', (hap_clr['N50'] > 30e6).sum())
    print('==========')
    hap_hifi = df.xs(['HiFi', 'HAP'], level=['platform', 'assembly'], drop_level=False)
    print('================')
    print('HAP HiFi stats')
    print(hap_hifi['N50'].describe())
    print('% > 35: ', (hap_hifi['N50'] > 35e6).sum() / hap_hifi.shape[0] * 100)
    print('% > 33: ', (hap_hifi['N50'] > 33e6).sum() / hap_hifi.shape[0] * 100)
    print('% > 32: ', (hap_hifi['N50'] > 32e6).sum() / hap_hifi.shape[0] * 100)
    print('% > 30: ', (hap_hifi['N50'] > 30e6).sum() / hap_hifi.shape[0] * 100)
    print('==========')
    print('HAP HiFi non-AFR')
    non_afr_index = hap_hifi.index.get_level_values('super_pop').isin(['AMR', 'EAS', 'EUR', 'SAS'])
    hap_hifi_non_afr = hap_hifi.loc[non_afr_index, :].copy()
    print(hap_hifi_non_afr['N50'].describe())
    
if make_test:
    from scipy.stats import ttest_ind as ttest
    from scipy.stats import bartlett
    
    hifi_hap = df.xs(['HAP', 'HiFi'], level=['assembly', 'platform'])
    print(hifi_hap.shape)
    clr_hap = df.xs(['HAP', 'CLR'], level=['assembly', 'platform'])
    print(clr_hap.shape)
    
    hifi_n50 = hifi_hap['N50']
    clr_n50 = clr_hap['N50']
    
    print(bartlett(hifi_n50, clr_n50))
    
    print(hifi_n50.var())
    print(clr_n50.var())
    
    t_statistic, t_pv = ttest(
        hifi_n50,
        clr_n50,
        equal_var=False
    )
    print('N HiFi: ', hifi_hap.shape[0])
    print('N CLR: ', clr_hap.shape[0])
    print(round(t_statistic, 3))
    print(t_pv)
    print(t_pv < 10e-9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


RuntimeError: No active exception to reraise