In [1]:
from pathlib import Path
import pysam
from vesper.processors.variants import *
from vesper.processors.reads import *
from vesper.processors.annotations import *
from vesper.models.variants import *


In [None]:
# Input files, given as relative paths.
vcf_path = Path("../tests/files/hg38/894.duplomap.vcf.gz")
bam_path = Path("../tests/files/hg38/894.duplomap.bam")
centromere_path = Path("../annotations/hg38/GRCH38_centromeres.bed")
repeatmasker_path = Path("../annotations/hg38/GRCH38_repeatmasker.bed")

# Start from an empty list so `variants` is still defined if loading raises.
variants = []

# TODO: consider lazy loading of reads and annotations, and skipping variants
# with an excessive number of support reads at a locus (250k reads is not normal).
# The read processor is opened first and handed to the VCF processor; both are
# closed when the block exits.
with ReadProcessor(bam_path) as read_proc, VCFProcessor(vcf_path, read_proc) as vcf_proc:
    variants = list(vcf_proc.instantiate_variants(test_mode=True))

print(f"Loaded {len(variants)} variants")


In [None]:
# Display the first loaded variant (raises IndexError if `variants` is empty).
variants[0]

In [None]:
# Annotate only SVType.INS variants, using the BED file at `repeatmasker_path`.
with BEDProcessor(repeatmasker_path) as ann_proc:
    for variant in variants:
        # Every other SV type is left without annotations from this file.
        if variant.variant.sv_type != SVType.INS:
            continue
        variant.add_annotations(ann_proc)

In [None]:
# Tally support + non-support reads over all variants and remember the
# variant that carries the most reads.
reads_processed = 0
max_reads_per_variant = 0
max_reads_variant = None
for v in variants:
    read_total = len(v.support_reads) + len(v.nonsupport_reads)
    reads_processed += read_total
    # Strict comparison: the first variant to reach the maximum is kept, and
    # max_reads_variant stays None when no variant has any reads.
    if read_total > max_reads_per_variant:
        max_reads_per_variant, max_reads_variant = read_total, v

print(f"Processed {reads_processed} reads")
print(f"Max reads per variant: {max_reads_per_variant}")
print(f"Variant with max reads: {max_reads_variant}")

In [None]:
# Number of variants that have at least one overlapping feature.
sum(len(v.overlapping_features) > 0 for v in variants)