# In [2]:
import os
import pandas as pd

# Directory containing the GRCh38.p13 issue tables and the file names to load.
path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/issues'
names = [
    'grch38_p13_gaps.tsv',
    'grch38_p13_unknown.tsv',
    'grch38_p13_variation.tsv',
]

# Column names assigned to each issue table when it is read.
header = [
    'Issue_ID', 'Issue_Type', 'Issue_Location', 'Issue_TotalPlacements',
    'Issue_Status', 'Issue_FixVersion', 'Issue_GenomeBrowsers', 'Issue_Summary',
]

def parse_coordinates(entry):
    """Split a 'chrom:start-end' location string into its components.

    Commas inside the two positions (thousands separators such as
    '1,234,567') are removed before the integer conversion. The
    positions are returned exactly as written; no coordinate shift is
    applied here.

    Args:
        entry: Location string with exactly one ':' separating the
            sequence name from the range, and exactly one '-' inside
            the range.

    Returns:
        Tuple ``(chrom, start, end)`` with ``start`` and ``end`` as int.

    Raises:
        ValueError: If the string does not split into the expected
            number of parts or a position is not an integer.
    """
    chrom, interval = entry.split(':')
    lower, upper = interval.split('-')
    lower, upper = (int(pos.replace(',', '')) for pos in (lower, upper))
    return chrom, lower, upper

# Collects one cleaned-up table per input file; concatenated after the loop.
all_issues = []

for n in names:
    file_path = os.path.join(path, n)
    df = pd.read_csv(file_path, sep='\t', names=header)
    df = df.drop(columns='Issue_GenomeBrowsers')

    # Keep only issues that have a location (i.e. are located in hg38) and
    # for which no assembly version with a fix is indicated yet.
    is_located = df['Issue_Location'].notna()
    is_unfixed = df['Issue_FixVersion'].isna()
    df = df.loc[is_located & is_unfixed, :].reset_index(drop=True)

    # After the filter above the fix version is always missing, so it is
    # replaced by a constant placeholder.
    df['Issue_FixVersion'] = 'not_indicated'
    df['Issue_Status'] = df['Issue_Status'].str.replace(' ', '_')
    df['Issue_Summary'] = '[' + df['Issue_Summary'] + ']'

    # Expand each location string into separate chrom / start / end columns.
    # Both frames carry a fresh 0..n-1 index, so the column-wise concat
    # below lines the rows up one-to-one.
    coords = pd.DataFrame.from_records(
        df['Issue_Location'].map(parse_coordinates),
        columns=['chrom', 'start', 'end']
    ).reset_index(drop=True)

    df = pd.concat([df, coords], axis=1, join='outer')
    df = df.drop(columns='Issue_Location')
    df['Issue_TotalPlacements'] = df['Issue_TotalPlacements'].astype('int64')
    df['start'] = df['start'].astype('int64')
    # Coordinate adjustment for BED output: the end is shifted by +1 while
    # the start is left untouched.
    # NOTE(review): BED intervals are 0-based, half-open. end + 1 is the
    # right conversion only if the input locations are 0-based inclusive;
    # for 1-based inclusive input the usual conversion is start - 1 with
    # the end unchanged. Confirm the convention of the input tables.
    df['end'] = df['end'].astype('int64') + 1
    df = df.assign(
        name=df['Issue_Type'] + '_' + df['Issue_ID'],
        score=1000,
        strand='.',
    )
    all_issues.append(df)


# Stack the per-file tables and order the records by position.
all_issues = pd.concat(all_issues, axis=0)
all_issues = all_issues.sort_values(['chrom', 'start', 'end'])

# Output column order: the six leading BED-style columns first, followed by
# the remaining issue annotation columns.
sort_order = [
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    'Issue_ID', 'Issue_Type', 'Issue_Status',
    'Issue_FixVersion', 'Issue_TotalPlacements', 'Issue_Summary',
]
all_issues = all_issues[sort_order]

out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38'
outfile = os.path.join(out_path, '20200723_GRCh38_p13_unresolved-issues.bed')

# Dump as BED file for intersect operations. A '#' is written first so that
# the column header line produced by to_csv starts with '#'.
with open(outfile, 'w') as dump:
    _ = dump.write('#')
    all_issues.to_csv(dump, sep='\t', index=False, header=True)

