In [6]:
import os
import collections as col

import pandas as pd

"""
What does this do?
Compute simple concordance metric between Bionano and phased assemblies
for locus 3q29. Requires aligning identified segments back to hg38 / 3q29
(see recomp_cov.py script, lines ~154ff).

Output so far is just printing statistics
"""

bng_table = pd.read_csv(
    '/home/local/work/data/hgsvc/roi/20201109_3q29_bng_segments.flat.tsv',
    sep='\t',
    header=0
)
bed_path = '/home/local/work/data/hgsvc/roi/dotplots_chr3/chr3_alignments/contigs'

# select_sample = bng_table['sample'] == 'HG02011'
# select_tech = bng_table['platform'] == 'CLR'
# select_hap = bng_table['haplotype'] == 'H2'
# select_all = select_sample & select_tech & select_hap

# print(bng_table.loc[bng_table['segment_color']=='black', :])
# raise

# assm_contigs = bng_table.loc[bng_table['sample'] != 'hg38', ['sample', 'platform', 'haplotype', 'contig_id']].copy()

# assm_contigs.to_csv(
#     '/home/local/work/data/hgsvc/roi/20201109_3q29_bng_segments.contigs.tsv',
#     sep='\t',
#     header=True,
#     index=False
# )


seq_resolved = """
HG00096,CLR,H2
HG00512,CLR,H2
HG00514,CLR,H2
HG00733,CLR,H1
HG00733,CLR,H2
HG00864,CLR,H2
HG02492,CLR,H1
HG03065,CLR,H1
HG03065,CLR,H2
HG03371,CLR,H2
NA19650,CLR,H2
NA20509,CLR,H1
HG00096,CLR,H1
HG01505,CLR,H2
NA19239,CLR,H2
HG00514,HiFi,H1
HG02011,CLR,H2
HG03009,CLR,H1
NA19238,CLR,H2
NA19239,HiFi,H2
NA19983,CLR,H2
HG00731,HiFi,H2
HG02011,CLR,H1
HG03125,HiFi,H1
HG03486,HiFi,H2
NA19238,CLR,H1
"""

seq_resolved = set([tuple(x.split(',')) for x in seq_resolved.strip('"').split()])


def get_ovl_aln(start, end, alignments):
    
    select_start = alignments['align_start'] < end
    select_end = alignments['align_end'] > start
    
    check_set = alignments.loc[select_start & select_end, :]
    if check_set.empty:
        return 0, -1, 0
    
    orient_map = {
        '-': -1,
        '+': 1
    }
    
    max_ovl = 0
    ovl_mapq = -1
    orientation = 0
    for idx, row in check_set.iterrows():
        current_ovl = min(end, row['align_end']) - max(start, row['align_start'])
        if current_ovl > max_ovl:
            max_ovl = current_ovl
            ovl_mapq = row['mapq']
            orientation = orient_map[row['strand']]
            
    return max_ovl, ovl_mapq, orientation
    

def check_bng_consistency(bng_segments, segment_alignments, seglengths):
    """
    build list:
    color -> support, bng_orient, aln_orient, match_type, mapq
    """

    consistency = []
    
    orient_map = {
        'inverted': -1,
        'direct': 1
    }
    
    for idx, row in bng_segments.iterrows():
        contig_id = row['contig_id']
        predicted_color = row['segment_color']
        if predicted_color in ['no_value', 'no_resolution', 'single_label', 'black', 'unknown']:
            continue
        support = row['segment_support']
        if support == -1:
            # unclear case, skip
            continue
        try:
            seg_orientation = orient_map[row['orientation']]
        except KeyError:
            raise ValueError(row)
        select_aln = segment_alignments['contig_id'] == contig_id
        select_color = segment_alignments['aligned_color'] == predicted_color
        select_all = select_aln & select_color
        if not select_all.any():
            consistency.append((predicted_color, support, seg_orientation, 0, -1, -1))
            continue
        aln = segment_alignments.loc[select_all, :]
        seg_start = min(row['segment_start'], row['segment_end'])
        seg_end = max(row['segment_start'], row['segment_end'])
        
        ovl_bp, ovl_mq, ovl_strand = get_ovl_aln(seg_start, seg_end, aln)
        if ovl_mq > -1:
            consistency.append((predicted_color, support, seg_orientation, ovl_strand, 1, ovl_mq))
            continue
        shift_size = (seg_end - seg_start) // 2
        
        left_ovl_bp, left_ovl_mq, left_ovl_strand = get_ovl_aln(
            seg_start - shift_size,
            seg_end - shift_size,
            aln
        )
        
        right_ovl_bp, right_ovl_mq, right_ovl_strand = get_ovl_aln(
            seg_start + shift_size,
            seg_end + shift_size,
            aln
        )
        
        if left_ovl_bp == 0 and right_ovl_bp == 0:
            consistency.append((predicted_color, support, seg_orientation, 0, -1, -1))
            continue
        elif left_ovl_bp > right_ovl_bp:
            consistency.append((predicted_color, support, seg_orientation, left_ovl_strand, 0, left_ovl_mq))
            continue
        else:
            consistency.append((predicted_color, support, seg_orientation, right_ovl_strand, 0, right_ovl_mq))
    
    return consistency
        

def load_alignments(bed_folder):
    
    bed_columns = [
        'contig_id',
        'align_start',
        'align_end',
        'aligned_segment',
        'mapq',
        'strand'
    ]
    
    alignments = []
    for bed_file in os.listdir(bed_folder):
        if not bed_file.endswith('.bed'):
            continue
        sample, hap, tech = bed_file.split('.')[0].split('_')
        bed_path = os.path.join(bed_folder, bed_file)
        df = pd.read_csv(bed_path, sep='\t', header=None, names=bed_columns)
        df['aligned_color'] = df['aligned_segment'].apply(lambda x: x.split('_')[1])
        df['sample'] = sample
        df['hap'] = hap
        df['tech'] = tech
        alignments.append(df)
    alignments = pd.concat(alignments, axis=0, ignore_index=False)
    alignments.set_index(['sample', 'tech', 'hap'], drop=True, inplace=True)
    return alignments


def determine_segment_ref_length(bng_table):
    
    select_ref = bng_table['contig_id'] == 'reference'
    
    segment_lengths = dict()
    for position, (idx, row) in enumerate(bng_table.loc[select_ref, :].iterrows(), start=1):
        start = row['segment_start']
        end = row['segment_end']
        name = row['segment_color']
        name += '_{}'.format(position)
        assert start < end, 'ref segment flipped'
        length = end - start
        segment_lengths[name] = length
    return segment_lengths

seglen = determine_segment_ref_length(bng_table)
alignments = load_alignments(bed_path)

concordance = []
for (sample, tech, hap), segments in bng_table.groupby(['sample', 'platform', 'haplotype']):
    if sample == 'hg38':
        continue
    if tech == 'CCS':
        fully_res = (sample, 'HiFi', hap) in seq_resolved
    else:
        fully_res = (sample, tech, hap) in seq_resolved
    sub = alignments.xs([sample, tech, hap], level=['sample', 'tech', 'hap'])
    consistency = check_bng_consistency(segments, sub, seglen)
    consistency = pd.DataFrame(
        consistency,
        columns=[
            'segment_color',
            'segment_support',
            'segment_orientation',
            'aligned_orientation',
            'aligned_support',
            'aligned_mapq'
        ]
    )
    consistency['sample'] = sample
    consistency['tech'] = tech
    consistency['hap'] = hap
    consistency.set_index(['sample', 'tech', 'hap'], drop=True, inplace=True)
    concordance.append(consistency)
    
concordance = pd.concat(concordance, axis=0, ignore_index=False)

# concordance.to_csv(
#     '/home/local/work/data/hgsvc/roi/20201106_3q29_bng_segments.aln-con.tsv',
#     sep='\t',
#     header=True,
#     index=True
# )

print(concordance.shape[0])
print(concordance['aligned_support'].value_counts())

for (sample, tech, hap), stats in concordance.groupby(['sample', 'tech', 'hap']):
    fully_res = (sample, tech, hap) in seq_resolved
    if fully_res:
        if (stats['aligned_support'] > -1).all():
            print('=================')
            print(sample, tech, hap)
            print(stats)
raise

for (bng_support, aligned_support), matches in concordance.groupby(['segment_support', 'aligned_support']):
    print('============')
    print(bng_support, aligned_support)
    print(matches['aligned_mapq'].describe())
    print(matches['aligned_mapq'].value_counts())
    print('------------')
    


499
 1    346
-1    141
 0     12
Name: aligned_support, dtype: int64
HG00514 CLR H2
                 segment_color segment_support segment_orientation  \
sample  tech hap                                                     
HG00514 CLR  H2        magenta               2                   1   
             H2           blue               2                   1   
             H2         yellow               2                   1   
             H2            red               2                   1   
             H2         orange               2                   1   
             H2       darkblue               2                   1   
             H2         purple               2                   1   
             H2         yellow               2                   1   

                 aligned_orientation aligned_support aligned_mapq  
sample  tech hap                                                   
HG00514 CLR  H2                    1               1           60  
          

RuntimeError: No active exception to reraise