In [1]:
from pathlib import Path
import pysam
from vesper.processors.variants import *
from vesper.processors.reads import *
from vesper.processors.annotations import *
from vesper.models.variants import *

import numpy as np
import time

In [2]:
# Annotation tracks (BED), given relative to the notebook's directory.
repeatmasker_path = Path("../annotations/hg38/GRCH38_repeatmasker.bed")
centromere_path = Path("../annotations/hg38/GRCH38_centromeres.bed")

# Test inputs: variant calls and the matching alignment file.
bam_path = Path("../tests/files/hg38/894.duplomap.bam")
vcf_path = Path("../tests/files/hg38/894.duplomap.vcf.gz")

# Start from an empty list so `variants` is bound even if loading fails,
# then materialise everything the processor yields while it is still open.
variants = []
with VCFProcessor(vcf_path) as vcf_reader:
    variants = [*vcf_reader.instantiate_variants(test_mode=False)]

print(f"Loaded {len(variants)} variants")

Loaded 1867 variants


In [5]:
# Benchmark: vary the number of workers. Expectation under test: processing
# time should decrease as the number of workers increases.
#
# A fresh BEDProcessor is opened for every worker count, and the timed region
# wraps the whole `with` block, so each reported figure includes processor
# setup and teardown in addition to the annotate_variants call itself.
#
# Timing uses time.perf_counter() rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas wall-clock time
# can jump if the system clock is adjusted during a run.

for n_workers in range(16, 33, 8):  # 16, 24, 32
    start_time = time.perf_counter()
    with BEDProcessor(repeatmasker_path) as ann_proc:
        ann_proc.annotate_variants(
            variants,
            proximal_span=500,
            n_workers=n_workers
        )
    end_time = time.perf_counter()
    print(f"Thread Pool with {n_workers} workers: Annotated in {end_time - start_time:.2f} seconds")

Thread Pool with 16 workers: Annotated in 2.40 seconds
Thread Pool with 24 workers: Annotated in 2.27 seconds
Thread Pool with 32 workers: Annotated in 2.30 seconds


In [None]:
# Benchmark: vary the proximal span. Expectation under test: processing time
# should stay about the same while the number of annotations per variant
# increases.
#
# One BEDProcessor is reused for every span, and only the annotate_variants
# call sits inside the timed region.
# NOTE(review): the per-variant counts are read back from the same `variants`
# objects after every run, so the statistics assume annotate_variants replaces
# (rather than accumulates onto) results from earlier runs -- TODO confirm.

with BEDProcessor(repeatmasker_path) as ann_proc:
    n_workers = 32
    print(f"\nWith {n_workers} workers:")
    for span in range(100, 501, 100):  # 100, 200, 300, 400, 500
        print(f"\nTesting proximal span {span}bp:")

        # perf_counter() is a monotonic, high-resolution interval clock;
        # time.time() is wall-clock and can jump if the system clock changes.
        start_time = time.perf_counter()
        ann_proc.annotate_variants(
            variants,
            proximal_span=span,
            n_workers=n_workers
        )
        end_time = time.perf_counter()

        # Annotation count per variant = overlapping features + proximal features
        annotations_per_variant = [
            len(v.overlapping_features) + len(v.proximal_features)
            for v in variants
        ]
        mean_annotations = np.mean(annotations_per_variant)
        median_annotations = np.median(annotations_per_variant)
        std_annotations = np.std(annotations_per_variant)

        print(f"Mean annotations per variant: {mean_annotations:.1f}")
        print(f"Median annotations per variant: {median_annotations:.1f}")
        print(f"Std dev annotations per variant: {std_annotations:.1f}")
        print(f"Processing time: {end_time - start_time:.2f} seconds")


With 32 workers:

Testing proximal span 100bp:
Mean annotations per variant: 22.3
Median annotations per variant: 16.0
Std dev annotations per variant: 19.0
Processing time: 2.33 seconds

Testing proximal span 200bp:
Mean annotations per variant: 23.4
Median annotations per variant: 17.0
Std dev annotations per variant: 20.0
Processing time: 2.29 seconds

Testing proximal span 300bp:
Mean annotations per variant: 24.9
Median annotations per variant: 18.0
Std dev annotations per variant: 21.2
Processing time: 2.33 seconds

Testing proximal span 400bp:
Mean annotations per variant: 26.6
Median annotations per variant: 19.0
Std dev annotations per variant: 22.7
Processing time: 2.36 seconds

Testing proximal span 500bp:
Mean annotations per variant: 28.6
Median annotations per variant: 20.0
Std dev annotations per variant: 24.3
Processing time: 2.28 seconds
