In [1]:
from pathlib import Path
import pysam
from vesper.processors.variants import *
from vesper.processors.reads import *
from vesper.processors.annotations import *
from vesper.models.variants import *


In [2]:
# Input files. Relative paths are resolved against the kernel's working directory.
vcf_path = Path("../tests/files/hg38/894.duplomap.vcf.gz")
bam_path = Path("../tests/files/hg38/894.duplomap.bam")
centromere_path = Path("../annotations/hg38/GRCH38_centromeres.bed")
repeatmasker_path = Path("../annotations/hg38/GRCH38_repeatmasker.bed")

# Set to True to hand a ReadProcessor (backed by the BAM) to the VCFProcessor
# while instantiating variants. The read-count summary cell further down is
# only meant to be run in that mode.
USE_BAM_READS = False

# TODO: consider lazy loading of reads + annotations, and skipping variants with
# excess support reads at a locus (250k reads is not normal).

# Reset first so a failed load cannot leave results from an earlier run of this
# cell behind in the kernel.
variants = []
if USE_BAM_READS:
    with ReadProcessor(bam_path) as read_proc, VCFProcessor(vcf_path, read_proc) as vcf_proc:
        variants = list(vcf_proc.instantiate_variants(test_mode=False))
else:
    with VCFProcessor(vcf_path) as vcf_proc:
        variants = list(vcf_proc.instantiate_variants(test_mode=False))

print(f"Loaded {len(variants)} variants")


Loaded 1867 variants


In [3]:
# Bare last expression: the notebook displays the repr of the first entry in `variants`.
variants[0]


Variant(contig=chr1, pos=1184927, type=INS, len=108, DR=29, DV=2),
confidence=0.0,
support_reads=0,
nonsupport_reads=0,
overlapping_features=0,
proximal_features=0

In [4]:
# Attach RepeatMasker annotations to insertion (INS) variants only; every other
# SV type is left untouched.
# NOTE(review): add_annotations is called on the shared variant objects, so
# re-running this cell may add the same features again -- confirm.
with BEDProcessor(repeatmasker_path) as ann_proc:
    for variant in variants:
        if variant.variant.sv_type != SVType.INS:
            continue
        variant.add_annotations(ann_proc)

In [None]:
# Only run this cell when reads were pulled from the BAM with ReadProcessor.
# It tallies support + non-support reads over all variants and remembers the
# first variant holding the largest per-variant total.

reads_processed = 0
max_reads_per_variant = 0
max_reads_variant = None
for v in variants:
    read_count = len(v.support_reads) + len(v.nonsupport_reads)
    reads_processed += read_count
    if read_count <= max_reads_per_variant:
        continue
    # Strictly larger than anything seen so far: new maximum.
    max_reads_per_variant, max_reads_variant = read_count, v

print(f"Processed {reads_processed} reads")
print(f"Max reads per variant: {max_reads_per_variant}")
print(f"Variant with max reads: {max_reads_variant}")

In [5]:
# Per-variant annotation count = overlapping features + proximal features.
annotation_totals = [
    len(entry.overlapping_features) + len(entry.proximal_features)
    for entry in variants
]

annotations_processed = sum(annotation_totals)
max_annotations_per_variant = max(annotation_totals, default=0)
# Report the first variant that reaches the maximum; stays None when no variant
# carries any annotation (or when `variants` is empty).
max_annotations_variant = (
    variants[annotation_totals.index(max_annotations_per_variant)]
    if max_annotations_per_variant > 0
    else None
)

print(f"Processed {annotations_processed} annotations")
print(f"Max annotations per variant: {max_annotations_per_variant}")
print(f"Variant with max annotations: {max_annotations_variant}")

Processed 3784 annotations
Max annotations per variant: 13
Variant with max annotations: 
Variant(contig=chr11, pos=120328859, type=INS, len=188, DR=48, DV=1),
confidence=0.0,
support_reads=0,
nonsupport_reads=0,
overlapping_features=1,
proximal_features=12
