In [2]:
import os
import pandas as pd

path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions'
out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38'
names = ['Modeled_regions_for_GRCh38.tsv', 'GRCh38_p13_chromXY_PAR.tsv', 'ucsc_cytoband.bed']

header = [
    'name',
    'chrom',
    'start',
    'end',
    'length'
]

ucsc_header = [
    'chrom',
    'start',
    'end',
    'name',
    'gieStain'
]

sort_order = [
    'chrom',
    'start',
    'end',
    'name',
    'score',
]


chrom_file = 'hg38_hgsvc2_noalt.chroms.tsv'
chroms = pd.read_csv(os.path.join(path, chrom_file), sep='\t', names=['chrom', 'length'])

all_regions = []

for n, hd in zip(names, [header, header, ucsc_header]):
    print(n)
    file_path = os.path.join(path, n)
    df = pd.read_csv(file_path, sep='\t', names=hd, header=None, comment='#')
    if not 'ucsc' in n:
        # for BED output, convert to 0-based
        df['end'] += 1
        df['chrom'] = 'chr' + df['chrom']
    if 'ucsc' in n:
        df.loc[~df['name'].isna(), 'name'] = df['chrom'] + df['name']
        df.loc[df['name'].isna(), 'name'] = 'ctg'
        df = df.loc[df['chrom'].isin(chroms['chrom']), :].copy()
    else:
        df['name'] = df['name'].str.replace('#', '')
    df['strand'] = '.'
    df['score'] = 1000
    if 'ucsc' not in n:
        par_regions = df['name'].str.startswith('PAR')
        df.loc[df['name'].str.startswith('HET'), 'score'] = 750
        df.loc[par_regions, 'score'] = 500
        df.loc[par_regions, 'name'] = df.loc[par_regions, 'name'] + df.loc[par_regions, 'chrom'].str.strip('chr')
    if 'gieStain' in df.columns:
        df['gieStain_value'] = df['gieStain'].str.extract('([0-9]+)', expand=False)
        df.loc[df['gieStain_value'].isna(), 'gieStain_value'] = '0'
        df['gieStain_value'] = df['gieStain_value'].astype('int16')
        df['gieStain_value'] *= 10
        df['score'] = df['gieStain_value']

    df['score'] = df['score'].astype('int16')
    if 'ucsc' in n:
        out_cyto = os.path.join(out_path, 'GRCh38_cytobands.bed')
        df = df[sort_order]
        with open(out_cyto, 'w') as dump:
            _ = dump.write('#')
            df.to_csv(dump, sep='\t', header=True, index=False)
    else:
        all_regions.append(df)

all_regions = pd.concat(all_regions, axis=0)
all_regions.sort_values(['chrom', 'start', 'end'], inplace=True)
sort_order = [
    'chrom',
    'start',
    'end',
    'name',
    'score',
]

all_regions = all_regions[sort_order]
out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38'

# dump as BED file for intersect operations
outfile = os.path.join(out_path, '20200723_GRCh38_p13_regions.bed')
with open(outfile, 'w') as dump:
    _ = dump.write('#')
    all_regions.to_csv(dump, sep='\t', index=False, header=True)



Modeled_regions_for_GRCh38.tsv
     name  chrom      start        end   length strand  score
0    CEN1   chr1  122026460  125184588  3158128      .   1000
1    CEN2   chr2   92188146   94090558  1902412      .   1000
2    CEN3   chr3   90772459   93655575  2883116      .   1000
3    CEN4   chr4   49708101   51743952  2035851      .   1000
4    CEN5   chr5   46485901   50059808  3573907      .   1000
5    CEN6   chr6   58553889   59829935  1276046      .   1000
6    CEN7   chr7   58169654   60828235  2658581      .   1000
7    HET7   chr7   61377789   61528021   150232      .    750
8    CEN8   chr8   44033745   45877266  1843521      .   1000
9    CEN9   chr9   43236168   45518559  2282391      .   1000
10  CEN10  chr10   39686683   41593522  1906839      .   1000
11  CEN11  chr11   51078349   54425075  3346726      .   1000
12  CEN12  chr12   34769408   37185253  2415845      .   1000
13  CEN13  chr13   16000001   18051249  2051248      .   1000
14  CEN14  chr14   16000001   18173524 