# In [11]:
import os
import pandas as pd

"""
What does this do?
Processes a TSV dump of Feyza's segment annotation file for 3q29
and produces a flattened table with one segment per row.

This flat table is used downstream to evaluate the concordance
between Bionano and the phased assemblies.

Because the input table is generated manually, this script contains
several hard-coded fixes that may or may not break with updated
versions of the table.
"""


# Directory that holds both the input table and the generated output.
path = '/home/local/work/data/hgsvc/roi'
# Output file: the flattened table written at the end of the script.
output = '20201109_3q29_bng_segments.flat.tsv'
# Input file: TSV dump of the segment annotation table.
table = '20201109_3q29_PhasedAssembliesSegmentAnnotation_V1.tsv'

# Locus boundaries; their difference is assigned as the component length
# of the hg38 reference row.
locus_start, locus_end = 195607154, 196027006

def _parse_segment(part):
    """Parse one 'color:coords' entry into (color, start, end, support).

    Support codes: 0 = single label, 1 = partial segment, 2 = otherwise.
    Raises ValueError if the entry does not follow the expected layout.
    """
    token = part.strip()
    # Two entries of the manually curated table lack the '-' between
    # their coordinates; they are fixed by hand here.
    if token == 'purple:195933670195969451':
        color, coords = 'purple', '195933670-195969451'
    elif token == 'yellow:22658292260051':
        color, coords = 'yellow', '2265829-2260051'
    else:
        color, coords = part.split(':')
        color = color.lower().strip().replace(' ', '')
        # coordinates may contain thousands separators
        coords = coords.replace(',', '')

    single_label = 'singlelabel' in color or 'single label' in color
    if single_label:
        # drop the "(singlelabel)" suffix, keep the color name only
        color = color.split('(')[0]
    partial = 'partial' in color
    if partial:
        color = color.replace('partial', '').strip()

    # A single-label annotation always has support 0; previously this value
    # was overwritten with 2 whenever the color was not marked as partial.
    if single_label:
        support = 0
    elif partial:
        support = 1
    else:
        support = 2

    try:
        start, end = coords.split('-')
    except ValueError:
        # Only one coordinate given: single colored label. The end is set
        # to start + 6; int() raising here is reported as a parse error
        # by the caller.
        start = int(coords)
        end = start + 6
        support = 0
    return color, int(start), int(end), support


def segment_splitter(table_row):
    """Split the segment annotation of one table row into single segments.

    Parameters:
        table_row: row of the annotation table (pandas Series); the column
            "Segmentsintheregion(5'to3')" holds ';'-separated entries of
            the form 'color:start-end'.

    Returns:
        A list of tuples (row_index, color, start, end, support), one per
        segment. Rows without usable segments give a single placeholder
        tuple with color 'single_label', 'no_resolution' or 'no_value'
        and -1 for all numeric fields. Entries that cannot be parsed are
        printed and recorded with color 'parse_error'.
    """
    segment_string = table_row["Segmentsintheregion(5'to3')"]
    row_index = table_row.name
    if segment_string == 'single label' or segment_string == 'singlelabel':
        return [(row_index, 'single_label', -1, -1, -1)]
    if segment_string == 'notenoughresolution':
        return [(row_index, 'no_resolution', -1, -1, -1)]
    if not isinstance(segment_string, str):
        # NaN value (empty cell) or any other non-text content
        return [(row_index, 'no_value', -1, -1, -1)]

    segment_chain = []
    for p in segment_string.split(';'):
        try:
            color, start, end, support = _parse_segment(p)
        except (ValueError, IndexError):
            print('Cannot parse {} / {} / {}'.format(row_index, p, table_row))
            segment_chain.append((row_index, 'parse_error', -1, -1, -1))
        else:
            segment_chain.append((row_index, color, start, end, support))

    return segment_chain
    

def sv_splitter(table_row):
    """Parse the structural variant annotation of one table row.

    The column "SVs(5'to3',inorderofappearance)" is expected to hold a
    string of the form 'TYPE(seg1+seg2+...):start-end'.

    Returns:
        A tuple (row_index, variant_type, segment_chain, start, end) where
        segment_chain is the '|'-joined list of segment names. If the cell
        is empty (NaN), a placeholder tuple with 'no_variant',
        'no_segment_chain' and -1 coordinates is returned.

    Raises:
        ValueError: if the string does not split into exactly the
            expected parts or the coordinates are not integers.
    """
    index_of_row = table_row.name
    annotation = table_row["SVs(5'to3',inorderofappearance)"]
    if pd.isna(annotation):
        return (index_of_row, 'no_variant', 'no_segment_chain', -1, -1)

    description, interval = annotation.split(':')
    variant_kind, chain_text = description.split('(')
    # remove the enclosing parenthesis, then normalize 'a + b' to 'a|b'
    joined_chain = '|'.join(
        member.strip() for member in chain_text.strip('()').split('+')
    )
    first, last = interval.split('-')
    return (index_of_row, variant_kind, joined_chain, int(first), int(last))


def _int_or_none(value):
    """Return int(value), or None if the conversion raises ValueError."""
    try:
        return int(value)
    except ValueError:
        return None


def normalize_sample_annotation(table_row):
    """Collect the normalized sample/assembly metadata of one table row.

    Returns:
        A tuple (row_index, sample, population, platform, 'phased',
        haplotype, contig ID, start, end, orientation, component length,
        cluster ID). Start and end are -1 if they cannot be converted to
        an integer; if only the end is not convertible, it is set to
        start + 6.

    Raises:
        ValueError: if the population is missing for any sample other
            than NA24385, or if 'CompntLength' is not an integer.
    """
    population = table_row['Population']
    if pd.isna(population):
        # the only sample accepted without a population label
        if table_row['SampleID'] != 'NA24385':
            raise ValueError(table_row)
        population = 'ASK'

    component_start = _int_or_none(table_row['Start'])
    if component_start is None:
        component_start = -1

    component_end = _int_or_none(table_row['End'])
    if component_end is None:
        # single label case: derive the end from the start if available
        component_end = component_start + 6 if component_start != -1 else -1

    contig_label = table_row['Haplotype-ContigID']
    hap = contig_label.split('-')[0].upper()
    platform = table_row['AssemblyType'].split('-')[0]

    # harmonize sample names: 'GM' is rewritten to 'NA', and NA00864 is
    # renamed to HG00864
    sample_name = table_row['SampleID']
    if sample_name.startswith('GM'):
        sample_name = sample_name.replace('GM', 'NA')
    if sample_name == 'NA00864':
        sample_name = 'HG00864'

    return (
        table_row.name,
        sample_name,
        population,
        platform,
        'phased',
        hap,
        contig_label,
        component_start,
        component_end,
        table_row['Orientation'],
        int(table_row['CompntLength']),
        table_row['ClusterID'],
    )
        

# Load the annotation table: tab-separated, first line holds the column names.
df = pd.read_csv(os.path.join(path, table), sep='\t', header=0)

# Rows whose SampleID is 'hg38' get fixed values for the metadata columns
# that the per-row functions below read.
select_ref = df['SampleID'] == 'hg38'
for _column, _value in (
    ('AssemblyType', 'reference'),
    ('Population', 'ALL'),
    ('Haplotype-ContigID', 'reference'),
    ('Orientation', 'reference'),
    ('ClusterID', 'reference'),
    ('CompntLength', locus_end - locus_start),
):
    df.loc[select_ref, _column] = _value

# keep the original row index as an explicit column
df['bng_row_index'] = df.index

# One list of segment tuples per table row ...
segments = df.apply(segment_splitter, axis=1, raw=False)
# ... flattened into a single list with one segment tuple per entry.
flattened_segments = [
    segment for chain in segments for segment in chain
]

segments = pd.DataFrame(
    flattened_segments,
    columns=[
        'bng_row_index',
        'segment_color',
        'segment_start',
        'segment_end',
        'segment_support',
    ],
).astype({'segment_start': 'int64', 'segment_end': 'int64'})

# One structural variant record per table row.
struct_vars = pd.DataFrame.from_records(
    df.apply(sv_splitter, axis=1, raw=False),
    columns=[
        'bng_row_index',
        'variant_type',
        'variant_segment_chain',
        'variant_start',
        'variant_end',
    ],
)

# One normalized sample/assembly record per table row.
assembly_infos = pd.DataFrame.from_records(
    df.apply(normalize_sample_annotation, axis=1, raw=False),
    columns=[
        'bng_row_index',
        'sample',
        'population',
        'platform',
        'assembly_type',
        'haplotype',
        'bng_contig_id',
        'component_start',
        'component_end',
        'orientation',
        'component_length',
        'contig_id',
    ],
)

# Join the three per-row tables on the original row index; the segment
# table may contribute several rows per index.
clean_table = (
    assembly_infos
    .merge(struct_vars, on='bng_row_index', how='outer')
    .merge(segments, on='bng_row_index', how='outer')
)

# Write the flat table as TSV with a header line and without the index.
clean_table.to_csv(os.path.join(path, output), sep='\t', header=True, index=False)
