In [19]:
import os
import pandas as pd

# Regular expression for the chromosome names to keep: "chr" followed by one
# or more characters out of 0-9, X, Y, up to the end of the name. It is applied
# with Series.str.match (which anchors at the start of the string) by the
# loader functions in this cell, so names such as "chr1_random" or "chrUn_..."
# do not pass the filter.
chrom_match = 'chr[0-9XY]+$'

# Overlap between freeze3 ("freeze3.sv.sym.vcf.gz";
# total 61514 for chr1-22, X, Y)
# and
# - contig coverage / any MAPQ60:
#   59999 (non-overlapping: 1515; 1515 - (50 + 87 + 156) = 1222; because of excluded samples)
# - scaffold coverage / any medium or high confidence:
#   61378 (non-overlapping: 136; 136 - (20 + 3) = 113)

def load_bed_coverage(file_path, header=None, chrom_pattern=None):
    """Summarize the interval lengths of a tab-separated, BED-like file.

    Only the first three columns (chrom, start, end) are read. Rows are kept
    when the chromosome name matches ``chrom_pattern``; the match is anchored
    at the start of the name (``Series.str.match`` semantics).

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.
        header: Forwarded to ``pandas.read_csv``. ``None`` for files without
            a header line; ``0`` if the first line is a header that should be
            replaced by the fixed column names used here.
        chrom_pattern: Regular expression selecting the chromosome names to
            keep. Defaults to the module-level ``chrom_match``.

    Returns:
        Tuple ``(count, total_kbp, median_length, mean_length)``:
        number of retained intervals, summed length divided by 1e3 and
        rounded to two decimals, and the median / mean interval length
        rounded to whole numbers. All values are plain Python numbers;
        median and mean are NaN when no interval is retained.
    """
    pattern = chrom_match if chrom_pattern is None else chrom_pattern

    df = pd.read_csv(
        file_path,
        sep='\t',
        usecols=[0, 1, 2],
        names=['chrom', 'start', 'end'],
        header=header,
        # Force text so that purely numeric names ("1", "2", ...) are not
        # parsed as integers, which would break the .str accessor below.
        dtype={'chrom': str},
    )
    # na=False: rows with a missing chromosome name are dropped instead of
    # producing a NaN entry in the boolean mask.
    keep = df['chrom'].str.match(pattern, na=False)
    lengths = df.loc[keep, 'end'] - df.loc[keep, 'start']

    total_kbp = float(round(lengths.sum() / 1e3, 2))
    # For an empty selection pandas returns a plain Python float NaN, which
    # has no .round() method; float() + built-in round() handles both that
    # case and the usual NumPy scalar.
    median_length = round(float(lengths.median()), 0)
    average_length = round(float(lengths.mean()), 0)
    return len(lengths), total_kbp, median_length, average_length


def load_bed_distance(file_path, chrom_pattern=None):
    """Summarize interval lengths and a distance column of a BED-like file.

    Reads columns 0, 1, 2 (chrom, start, end) and column 10 (0-based), which
    is treated as a numeric distance, from a header-less tab-separated file.
    Rows are kept when the chromosome name matches ``chrom_pattern``; the
    match is anchored at the start of the name (``Series.str.match``
    semantics).

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.
        chrom_pattern: Regular expression selecting the chromosome names to
            keep. Defaults to the module-level ``chrom_match``.

    Returns:
        Tuple ``(count, total_length, median_dist, mean_dist)``: number of
        retained intervals, their summed length (not scaled), and the
        median / mean of the distance column rounded to whole numbers.
        All values are plain Python numbers; median and mean are NaN when
        no interval is retained.
    """
    pattern = chrom_match if chrom_pattern is None else chrom_pattern

    df = pd.read_csv(
        file_path,
        sep='\t',
        usecols=[0, 1, 2, 10],
        names=['chrom', 'start', 'end', 'dist'],
        header=None,
        # Force text so that purely numeric names ("1", "2", ...) are not
        # parsed as integers, which would break the .str accessor below.
        dtype={'chrom': str},
    )
    # na=False: rows with a missing chromosome name are dropped instead of
    # producing a NaN entry in the boolean mask.
    df = df.loc[df['chrom'].str.match(pattern, na=False), :]

    total_length = int((df['end'] - df['start']).sum())
    # For an empty selection pandas returns a plain Python float NaN, which
    # has no .round() method; float() + built-in round() handles both that
    # case and the usual NumPy scalar.
    median_dist = round(float(df['dist'].median()), 0)
    average_dist = round(float(df['dist'].mean()), 0)
    return df.shape[0], total_length, median_dist, average_dist
    

# --- Illumina masks: strict and pilot ("lenient") BED files -----------------
ill_strict = '/home/local/work/data/hgsvc/1kg_masks/20160622.allChr.strict_mask.bed'
ill_lenient = '/home/local/work/data/hgsvc/1kg_masks/20160622.allChr.pilot_mask.bed'

for _label, _bed in (
    ('Illumina strict: ', ill_strict),
    ('Illumina lenient: ', ill_lenient),
):
    print(_label, load_bed_coverage(_bed))

# All remaining BED files are located in this directory.
path = '/home/local/work/data/hgsvc/accessible_regions'

# --- Assembly coverage (these two files start with a header line) -----------
ctg_cov = os.path.join(path, 'ctgcov_32smp_122XY_anyQ60.bed')
scf_cov = os.path.join(path, 'scfcov_32smp_122XY_anyMedHigh.bed')

for _label, _bed in (
    ('ctg cov Q60 ', ctg_cov),
    ('scf cov med/high ', scf_cov),
):
    print(_label, load_bed_coverage(_bed, header=0))

print('==== Illumina only ===')

# Strict mask minus assembly coverage, then additionally "minus_cen", then
# the same regions with a distance column ("sd-dist" files).
ill_no_ctg60 = os.path.join(path, 'illumina_strict_minus_ctgQ60.bed')
ill_no_scf = os.path.join(path, 'illumina_strict_minus_scf.bed')
ill_no_ctg60_cen = os.path.join(path, 'illumina_strict_minus_ctgQ60_minus_cen.bed')
ill_no_scf_cen = os.path.join(path, 'illumina_strict_minus_scf_minus_cen.bed')
ill_no_ctg60_cen_sdist = os.path.join(path, 'illumina_strict_minus_ctgQ60_minus_cen_sd-dist.bed')
ill_no_scf_cen_sdist = os.path.join(path, 'illumina_strict_minus_scf_minus_cen_sd-dist.bed')

for _label, _bed in (
    ('strict - Q60 ', ill_no_ctg60),
    ('strict - SCF ', ill_no_scf),
    ('strict - Q60 / no cen', ill_no_ctg60_cen),
    ('strict - SCF / no cen ', ill_no_scf_cen),
):
    print(_label, load_bed_coverage(_bed))

for _label, _bed in (
    ('strict - Q60 / no cen / SD ', ill_no_ctg60_cen_sdist),
    ('strict - SCF / no cen / SD ', ill_no_scf_cen_sdist),
):
    print(_label, load_bed_distance(_bed))

print('====== assembly only ====')

# Assembly coverage minus the strict / pilot Illumina masks.
ctg_no_strict = os.path.join(path, 'ctgcov_32smp_122XY_anyQ60_minus_strict.bed')
ctg_no_pilot = os.path.join(path, 'ctgcov_32smp_122XY_anyQ60_minus_pilot.bed')
scf_no_strict = os.path.join(path, 'scfcov_32smp_122XY_anyMedHigh_minus_strict.bed')
scf_no_pilot = os.path.join(path, 'scfcov_32smp_122XY_anyMedHigh_minus_pilot.bed')

for _label, _bed in (
    ('Q60 - strict ', ctg_no_strict),
    ('Q60 - pilot ', ctg_no_pilot),
    ('SCF - strict ', scf_no_strict),
    ('SCF - pilot ', scf_no_pilot),
):
    print(_label, load_bed_coverage(_bed))

Illumina strict:  (4155228, 2288053.79, 79.0, 551.0)
Illumina lenient:  (562233, 2748585.12, 240.0, 4889.0)
ctg cov Q60  (250, 2869392.54, 396946.0, 11477570.0)
scf cov med/high  (243, 2869551.65, 479740.0, 11808855.0)
==== Illumina only ===
strict - Q60  (4479, 848.11, 56.0, 189.0)
strict - SCF  (4164, 121.02, 14.0, 29.0)
strict - Q60 / no cen (4446, 847.77, 58.0, 191.0)
strict - SCF / no cen  (4071, 119.82, 15.0, 29.0)
strict - Q60 / no cen / SD  (4446, 847770, 40052.0, 53545.0)
strict - SCF / no cen / SD  (4071, 119824, 0.0, 5864.0)
Q60 - strict  (4150999, 582186.86, 53.0, 140.0)
Q60 - pilot  (552863, 122879.15, 76.0, 222.0)
SCF - strict  (4151307, 581618.88, 53.0, 140.0)
SCF - pilot  (544537, 123798.57, 74.0, 227.0)
