In [5]:
import pandas as pd
import os as os
import io
import collections as col
import numpy
import operator as op
import numpy as np
import numpy.ma as ma
import importlib

"""
What does this do?
Read the output of a "bedtools intersect" operation for all haploid assemblies
(MAPQ60) and the ROI file (e.g., HLA, 3q29), see "recomp_cov.py" script lines ~66ff
Extract assembly infos and compute statistics:
- number of contigs
- size/number of gaps
- coverage (bp, pct)
etc.
Results are used for creating Supp. Table(s); produces also output for condensed
coverage plots (so far not used in manuscript), see below
"""

"""
Input / Output

Input:
/home/local/work/data/hgsvc/roi/MAPQ-ALL_roi_assm_overlaps.tsv

Output:
/home/local/work/data/hgsvc/roi/roi_HLA_3q29_blocks.tsv
/home/local/work/data/hgsvc/roi/roi_assembly_summary_{ROI}.tsv
"""

# Load the project-local plotting helper module directly from its file path.
# NOTE: "import importlib" alone does not guarantee that the "importlib.util"
# submodule is loaded (it only works if some other import pulled it in), so
# import the submodule explicitly before using it.
import importlib.util

plot_aux_module = '/home/local/work/code/github/project-diploid-assembly/notebooks/aux_mods/plot_aux.py'
plot_aux_spec = importlib.util.spec_from_file_location("plot_aux", plot_aux_module)
plot_aux = importlib.util.module_from_spec(plot_aux_spec)
plot_aux_spec.loader.exec_module(plot_aux)

# Annotation tables: sample table and population color table
ann_path = '/home/local/work/code/github/project-diploid-assembly/annotation'
st = os.path.join(ann_path, 'sample_table.tsv')
pt = os.path.join(ann_path, '1kg_hgsvc_colors.csv')
# "samples" is later indexed as samples[sample]['super_population'] and
# "rgbcodes" is indexed by that super population to get the block color
samples = plot_aux.load_sample_table(st)
hexcodes, rgbcodes, popmap = plot_aux.load_population_annotation(pt)

# shorthand for the helper module's plot property lookup
prop = plot_aux.get_plot_property

# bedtools intersect output (assembly alignments vs. ROI file), tab-separated
tsv_file = "/home/local/work/data/hgsvc/roi/MAPQ-ALL_roi_assm_overlaps.tsv"

# platform key as it appears in the assembly file name -> label used in tables
tech_map = {'pbsq2-clr': 'CLR', 'pbsq2-ccs': 'HiFi'}

# haplotype key as it appears in the assembly file name -> label used in tables
hap_map = {'h1-un': 'H1', 'h2-un': 'H2'}


def extract_assembly_info(assembly):
    """Parse sample, platform and haplotype from an assembly file path.

    The base name of the path is split on underscores: field 0 is the
    sample, field 2 is the platform key (translated via ``tech_map``), and
    the second dot-separated token of field 3 is the haplotype key
    (translated via ``hap_map``).

    Returns:
        Tuple ``(sample, platform_label, haplotype_label)``.

    Raises:
        IndexError: if the file name has too few fields.
        KeyError: if the platform or haplotype key is not in the lookup maps.
    """
    fields = os.path.basename(assembly).split('_')
    platform_label = tech_map[fields[2]]
    haplotype_label = hap_map[fields[3].split('.')[1]]
    return fields[0], platform_label, haplotype_label
    

# Column layout of the header-less input table (one row per overlap record)
names = [
    'roi_chrom',
    'roi_start',
    'roi_end',
    'roi',
    'assembly',
    'qry_chrom',
    'qry_start',
    'qry_end',
    'contig_name',
    'mapq',
    'strand',
    'overlap'
]

df = pd.read_csv(tsv_file, sep='\t', header=None, names=names)
# keep only records with mapping quality 60
df = df.loc[df['mapq'] == 60, :].copy()
df.reset_index(drop=True, inplace=True)
df['roi_length'] = df['roi_end'] - df['roi_start']

# derive sample / platform / haplotype from the assembly file name and
# attach them as columns (row alignment is guaranteed by the shared index)
assm_info = df['assembly'].apply(extract_assembly_info)
df2 = pd.DataFrame.from_records(
    assm_info,
    columns=['sample', 'platform', 'haplotype'],
    index=df.index
)

df = pd.concat([df, df2], ignore_index=False, axis=1)
df.drop(['assembly'], axis=1, inplace=True)

# Move the grouping columns into a MultiIndex. Selecting them by name
# (instead of by positional tuple offsets) keeps this correct even if the
# column order of the table changes; set_index also drops the columns.
group_keys = ['sample', 'platform', 'haplotype', 'roi']
df.set_index(group_keys, inplace=True)

# per assembly and ROI: total overlap in bp and number of distinct contigs
summed_ovl = df.groupby(group_keys)['overlap'].sum()
num_contigs = df.groupby(group_keys)['contig_name'].nunique()

joined = pd.concat([summed_ovl, num_contigs], axis=1, ignore_index=False)
joined.rename({'overlap': 'overlap_bp', 'contig_name': 'contig_count'}, axis=1, inplace=True)

# broadcast the per-group aggregates back onto the individual records
joined = joined.join(df, how='outer')

# For every (sample, platform, haplotype, ROI) group, project all alignment
# records onto the ROI interval and collect
#   - mrg_aligns / mrg_index: one row of coverage and gap statistics per group
#   - color_bars: one record per contiguous block of uncovered, singly covered
#     or multiply covered positions
mrg_aligns = []
mrg_index = []

color_bars = []
grey = (160/255, 160/255, 160/255)
black = (0, 0, 0)

for (sample, platform, hap, roi), aligns in joined.groupby(['sample', 'platform', 'haplotype', 'roi']):
    # the first record of the group supplies the ROI coordinates and contig count
    roi_start = aligns['roi_start'].values[0]
    roi_end = aligns['roi_end'].values[0]
    roi_length = roi_end - roi_start
    roi_coords = np.arange(roi_start, roi_end)
    contig_count = aligns['contig_count'].values[0]

    # The "np.bool" alias was removed from NumPy (1.24); use the builtin type.
    roi_mask = np.zeros(roi_length, dtype=bool)
    # int32 instead of int8: an 8-bit counter silently wraps around once more
    # than 127 alignments stack up on the same position.
    cov_mask = np.zeros(roi_length, dtype=np.int32)

    for qry_start, qry_end in zip(aligns['qry_start'].values, aligns['qry_end'].values):
        # clip the alignment to the ROI and convert to ROI-relative offsets
        start = max(roi_start, qry_start) - roi_start
        end = min(roi_end, qry_end) - roi_start
        if end <= start:
            # No overlap with the ROI. Without this guard a negative "end"
            # would be interpreted as an offset from the end of the array.
            continue
        roi_mask[start:end] = True
        cov_mask[start:end] += 1

    # number of ROI positions covered by at least one alignment
    coverage = roi_mask.sum()

    # count gaps: masking the covered positions leaves the uncovered runs
    # as the "unmasked" clumps
    gaps = np.array([
        s.stop - s.start for s in
        ma.clump_unmasked(ma.masked_array(roi_coords, mask=roi_mask))
    ], dtype=np.int32)
    num_gaps = gaps.size
    if num_gaps > 0:
        min_gap = gaps.min()
        max_gap = gaps.max()
        avg_gap = int(gaps.mean())
    else:
        min_gap = 0
        max_gap = 0
        avg_gap = 0

    sample_color = rgbcodes[samples[sample]['super_population']]
    # block categories, emitted in this order:
    #   depth == 0 -> gap (black)
    #   depth == 1 -> single alignment (super population color)
    #   depth >= 2 -> multiple alignments (grey)
    block_specs = [
        (cov_mask == 0, black),
        (cov_mask == 1, sample_color),
        (cov_mask >= 2, grey),
    ]
    for select_mask, color in block_specs:
        # contiguous runs of selected positions show up as masked clumps
        slices = ma.clump_masked(ma.masked_array(roi_coords, mask=select_mask))
        for s in slices:
            color_bars.append(
                (
                    sample,
                    platform,
                    hap,
                    roi,
                    coverage,
                    round(coverage / roi_length, 3),
                    contig_count,
                    roi_coords[s.start],
                    s.stop - s.start,
                    color
                )
            )

    mrg_aligns.append(
        (
            coverage,
            contig_count,
            roi_length - coverage,
            (coverage / roi_length * 100).round(2),
            num_gaps,
            min_gap,
            max_gap,
            avg_gap
        )
    )
    mrg_index.append((sample, platform, hap, roi))

# Table of contiguous coverage blocks (one row per block)
block_columns = [
    'sample', 'platform', 'hap', 'roi',
    'coverage', 'cov_pct', 'contigs',
    'block_start', 'block_length', 'block_color',
]
aln_colors = pd.DataFrame(color_bars, columns=block_columns)

# this output file can be used to create a condensed representation
# of alignment coverage in the respective locus (colored by super pop)
blocks_table = '/home/local/work/data/hgsvc/roi/roi_HLA_3q29_blocks.tsv'
aln_colors.to_csv(blocks_table, sep='\t', header=True, index=False)


# Summary statistics table, indexed by (sample, platform, haplotype, roi);
# this replaces the record-level "joined" table built earlier
summary_columns = [
    'coverage_bp', 'contigs_count', 'delta', 'coverage_pct',
    'num_gaps', 'min_gap', 'max_gap', 'avg_gap',
]
summary_index = pd.MultiIndex.from_tuples(
    mrg_index,
    names=['sample', 'platform', 'haplotype', 'roi'],
)
joined = pd.DataFrame(mrg_aligns, columns=summary_columns, index=summary_index)

# Per ROI: sort the summary table, add a percentile rank for "delta",
# print descriptive statistics and write the table to disk.
banner = '################'
# (sort key, ascending?) pairs; the last three keys are index levels
sort_spec = [
    ('delta', True),
    ('coverage_pct', False),
    ('sample', True),
    ('platform', True),
    ('haplotype', True),
]

for roi in ['3q29', 'HLA']:
    print(banner)
    print('====', roi)
    print(banner)

    # keep the "roi" level in the index so it is written to the output table
    subset = joined.xs(roi, level='roi', drop_level=False).copy()
    subset = subset.sort_values(
        by=[key for key, _ in sort_spec],
        ascending=[is_ascending for _, is_ascending in sort_spec],
    )

    delta_rank = subset['delta'].rank(ascending=False, method='max', pct=True)
    subset['delta_percentile'] = (delta_rank * 100).round(2)

    for column in ('delta', 'coverage_pct', 'contigs_count', 'num_gaps'):
        print(subset[column].describe())

    out_table = '/home/local/work/data/hgsvc/roi/roi_assembly_summary_{}.tsv'.format(roi)
    subset.to_csv(out_table, sep='\t', header=True, index=True)

################
==== 3q29
################
count        88.000000
mean      55982.204545
std       79679.831758
min           0.000000
25%           0.000000
50%       21316.500000
75%       61824.000000
max      267120.000000
Name: delta, dtype: float64
count     88.000000
mean      86.665795
std       18.977665
min       36.380000
25%       85.272500
50%       94.925000
75%      100.000000
max      100.000000
Name: coverage_pct, dtype: float64
count    88.000000
mean      3.272727
std       1.785814
min       1.000000
25%       2.000000
50%       3.000000
75%       4.000000
max      12.000000
Name: contigs_count, dtype: float64
count    88.000000
mean      1.272727
std       1.058380
min       0.000000
25%       0.000000
50%       1.000000
75%       2.000000
max       4.000000
Name: num_gaps, dtype: float64
################
==== HLA
################
count        88.000000
mean      71476.113636
std       39075.355364
min           0.000000
25%       44552.000000
50%       81367.0000