In [27]:
import pandas as pd

"""
What does this do?
Compute "size" statistics for various subsets of GRCh38 regions
Not immediately used for the paper, but could serve to update the
Supp. Table with coverage statistics.
"""

# FASTA index of the reference; only the first two columns are read and
# labelled as sequence name ('chrom') and sequence length ('size').
fai = '/home/local/work/data/hgsvc/noalt_ref/GRCh38_HGSVC2_noalt.fasta.fai'

df = pd.read_csv(fai, sep='\t', usecols=[0, 1], names=['chrom', 'size'])

# Names of all sequences in the reference; the annotation tables processed
# further down are filtered against this set.
grch38_chroms = set(df['chrom'])

# Regular expressions that split the sequence names into three categories.
# Series.str.match() only anchors at the start of the name, hence the
# explicit `$` in the last pattern. The patterns are raw strings so that
# `\w` reaches the regex engine as written instead of being an invalid
# string escape (which triggers a warning on current Python versions).
unplaced = r'chrUn'
unlocalized = r'chr[0-9XYM]+_\w+'
other = r'chr[0-9XYM]+$'

counts = {
    'total_n': df.shape[0],
    'total_bp': df['size'].sum()
}

# Number of sequences and summed length for each category.
categories = (
    ('unplaced', unplaced),
    ('unlocalized', unlocalized),
    ('other', other),
)
for label, select in categories:
    sub = df.loc[df['chrom'].str.match(select), :]
    counts['{}_n'.format(label)] = sub.shape[0]
    counts['{}_bp'.format(label)] = sub['size'].sum()

# Print every statistic and, on the way, add up the per-category base pairs.
components = 0
for k, v in counts.items():
    print(k, v)
    if not k.startswith('total') and k.endswith('_bp'):
        components += v

# Sanity check: the per-category base pairs have to add up to the total,
# otherwise the three patterns do not account for the whole reference.
assert components - counts['total_bp'] == 0, 'fail'

print('===== gaps')

# Gap annotation table (tab-separated, first line is the header).
ngaps_file = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/ucsc_Ngaps.tsv'

ngaps = pd.read_csv(ngaps_file, sep='\t', header=0)

# Every record must carry 'N' in column 'n'; anything else aborts the cell.
is_n_record = ngaps['n'] == 'N'
assert is_n_record.all(), 'unknown gap size'

# Sequences that appear in the gap table but not in the reference index.
# Only names ending in 'alt' or 'fix' may be missing; those get dropped.
ngaps_chroms = set(ngaps['#chrom'])
drop_chroms = ngaps_chroms - grch38_chroms
assert all(name.endswith(('alt', 'fix')) for name in drop_chroms), 'unmatched chrom'

on_reference = ~ngaps['#chrom'].isin(drop_chroms)
ngaps = ngaps.loc[on_reference, :].copy()

# Overall count / summed size first, then the same per gap type.
print(len(ngaps), ' - ', ngaps['size'].sum())
for gap_type in ngaps['type'].unique():
    of_type = ngaps[ngaps['type'] == gap_type]
    print(gap_type, ' - ', len(of_type), ' - ', of_type['size'].sum())

print('===== bands / Giemsa staining')

# Cytoband annotation table (tab-separated, first line is the header).
bands_file = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions/ucsc_cytoband.bed'

bands = pd.read_csv(bands_file, sep='\t', header=0)
# Length of each band, computed as end minus start coordinate.
bands['size'] = bands['chromEnd'] - bands['chromStart']

# Sequences that appear in the band table but not in the reference index.
# Only names ending in 'alt' or 'fix' may be missing; those get dropped.
band_chroms = set(bands['#chrom'])
drop_chroms = {name for name in band_chroms if name not in grch38_chroms}
assert all(name.endswith(('alt', 'fix')) for name in drop_chroms), 'unmatched chrom'

on_reference = ~bands['#chrom'].isin(drop_chroms)
bands = bands.loc[on_reference, :].copy()

# Number of bands and summed size per staining value.
for staining in bands['gieStain'].unique():
    stained = bands[bands['gieStain'] == staining]
    print(staining, ' - ', len(stained), ' - ', stained['size'].sum())

total_n 194
total_bp 3099750718
unplaced_n 127
unplaced_bp 4485509
unlocalized_n 42
unlocalized_bp 6978808
other_n 25
other_bp 3088286401
===== gaps
695  -  151102971
telomere  -  48  -  480000
contig  -  285  -  10220364
scaffold  -  346  -  3505607
heterochromatin  -  11  -  72427000
short_arm  -  5  -  64470000
===== bands / Giemsa staining
gneg  -  584  -  1315291132
gpos25  -  87  -  214232171
gpos50  -  121  -  410000000
gpos75  -  89  -  411000000
gpos100  -  81  -  495600000
acen  -  48  -  92900000
gvar  -  17  -  136327415
stalk  -  5  -  24400000
