In [26]:
import pandas as pd
import os

"""
What does this do?
Compute basic statistics for region sets representing "accessible regions" or "coverage masks".
Baseline is 1000GP Illumina masks, stats have been "cached" manually in this notebook to speed
up computations.
This notebook only prints stats and does not produce the Supp. Figure
"""


def get_bed_size(file_path):
    df = pd.read_csv(
        file_path,
        sep='\t',
        header=0,
        names=['chrom', 'start', 'end'],
        usecols=[0, 1, 2]
    )
    total = (df['end'] - df['start']).sum()
    return total

def compute_region_stats(df):
    
    chroms_in_df = set(df['chrom'].values)
    
    chrom_set = set()
    
    for c in chroms_in_df:
        if c == 'chrX':
            chrom_set.add('X')
        elif c == 'chrY':
            chrom_set.add('Y')
        else:
            try:
                n = int(c.strip('chr'))
            except ValueError:
                raise
            else:
                if n in list(range(1, 23)):
                    chrom_set.add('1-22')
                else:
                    raise
    infos = dict()
    chrom_set = ''.join(sorted(chrom_set))
    infos['chroms'] = chrom_set
    df['length'] = df['end'] - df['start']
    infos['region_count'] = int(df.shape[0])
    
    len_stats = df['length'].describe()
    infos['median_length'] = int(len_stats['50%'])
    infos['mean_length'] = int(len_stats['mean'])
    n50_region_size = None
    total = df['length'].sum()
    infos['total_length'] = int(total)
    df.sort_values('length', ascending=False, inplace=True)
    covered = 0
    for idx, row in df.iterrows():
        covered += row['length']
        if (covered / total) >= 0.5:
            n50_region_size = int(row['length'])
            break
    infos['n50_length'] = n50_region_size
    return infos
            
        
mapq_based = 'ctgcov_32smp_122XY_anyQ60.bed'
scaffold_based = 'scfcov_32smp_122XY_anyMedHigh.bed'
ours = os.path.join('/home/local/work/data/hgsvc/accessible_regions', scaffold_based)

df = pd.read_csv(ours, sep='\t', usecols=[0, 1, 2], names=['chrom', 'start', 'end'], header=0)
our_infos = compute_region_stats(df)
total = our_infos['total_length']
print(os.path.basename(ours))
print(our_infos)
print('ours ', total)

theirs_pilot = '/home/local/work/data/hgsvc/1kg_masks/20160622.allChr.pilot_mask.bed'
#pilot_df = pd.read_csv(theirs_pilot, sep='\t', usecols=[0, 1, 2], names=['chrom', 'start', 'end'])
#pilot_infos = compute_region_stats(pilot_df)
pilot_infos = {
    'chroms': '1-22XY',
    'region_count': 562233,
    'median_length': 240,
    'mean_length': 4888,
    'total_length': 2748585120,
    'n50_length': 20126
}
print('========== Illumina pilot')
print(os.path.basename(theirs_pilot))
print(pilot_infos)
total2 = pilot_infos['total_length']
print('theirs pilot ', total2)

theirs_strict = '/home/local/work/data/hgsvc/1kg_masks/20160622.allChr.strict_mask.bed'
#strict_df = pd.read_csv(theirs_strict, sep='\t', usecols=[0, 1, 2], names=['chrom', 'start', 'end'])
#strict_infos = compute_region_stats(strict_df)
strict_infos = {
    'chroms': '1-22XY',
    'region_count': 4155228,
    'median_length': 79,
    'mean_length': 550,
    'total_length': 2288053789,
    'n50_length': 2158
}
total3 = strict_infos['total_length']
print('========== Illumina strict')
print(os.path.basename(theirs_strict))
print(strict_infos)
print('theirs strict ', total3)

theirs_combined = '/home/local/work/data/hgsvc/1kg_masks/pilot_strict_combined.allChr.mask.bed'
#comb_df = pd.read_csv(theirs_combined, sep='\t', usecols=[0, 1, 2], names=['chrom', 'start', 'end'])
#comb_infos = compute_region_stats(comb_df)
comb_infos = {
    'chroms': '1-22XY',
    'region_count': 8678465,
    'median_length': 58,
    'mean_length': 316,
    'total_length': 2748585120,
    'n50_length': 1695
}
print('========== Illumina combined')
print(os.path.basename(theirs_combined))
print(comb_infos)
total4 = comb_infos['total_length']
print('theirs combined ', total4)
raise

print(round(total / total2, 2))
print(round(total / total3, 2))
print(round(total / total4, 2))

print('==================')
grch38_path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38'
grch38_all = get_bed_size(
    os.path.join(grch38_path, 'GRCh38_HGSVC2_noalt.nogap.regions')
)
grch38_primary = get_bed_size(
    os.path.join(grch38_path, 'GRCh38_HGSVC2_noalt.nogap.122X.regions')
)

centromeres = get_bed_size(
    os.path.join(grch38_path, '20200723_GRCh38_p13_centromeres.122X.bed')
)

# approx
grch38_primary_nocen = grch38_primary - centromeres

print('All ', grch38_all)
print('Primary ', grch38_primary)
print('Primary no cen ', grch38_primary_nocen)
print('centromeres ', centromeres)
uncharted_ours = grch38_primary - total
uncharted_pilot = grch38_primary - total2
uncharted_strict = grch38_primary - total3
print('Primary - ours ', uncharted_ours)
print('Primary - pilot ', uncharted_pilot)
print('Primary - strict ', uncharted_strict)
print('Rough calculation only - centromeres have N gaps, but not to 100%')
print('remain ours (no cen) ', uncharted_ours - centromeres)
print('remain pilot (no cen)', uncharted_pilot - centromeres)
print('remain strict (no cen)', uncharted_strict - centromeres)

illumina_mask = get_bed_size(
    os.path.join('/home/local/work/data/hgsvc/1kg_masks', 'illumina_masked.giemsa.nogap.122X.bed')
)
print(illumina_mask)
print(2395484832 - illumina_mask)

scfcov_32smp_122XY_anyMedHigh.bed
{'chroms': '1-22XY', 'region_count': 243, 'median_length': 479740, 'mean_length': 11808854, 'total_length': 2869551650, 'n50_length': 60993192}
ours  2869551650
20160622.allChr.pilot_mask.bed
{'chroms': '1-22XY', 'region_count': 562233, 'median_length': 240, 'mean_length': 4888, 'total_length': 2748585120, 'n50_length': 20126}
theirs pilot  2748585120
20160622.allChr.strict_mask.bed
{'chroms': '1-22XY', 'region_count': 4155228, 'median_length': 79, 'mean_length': 550, 'total_length': 2288053789, 'n50_length': 2158}
theirs strict  2288053789
pilot_strict_combined.allChr.mask.bed
{'chroms': '1-22XY', 'region_count': 8678465, 'median_length': 58, 'mean_length': 316, 'total_length': 2748585120, 'n50_length': 1695}
theirs combined  2748585120


RuntimeError: No active exception to reraise