
**I/O handling**
- [x] Input: BAM containing no duplicate-flagged alignments
- [x] Output: VCF v4.2 (check whether the output VCF already exists)

**Prepare PC coordinates based on reference SD map from WGAC**
- [ ] Compute fragment size distribution
- [ ] Pick multi-aligned regions (within target regions) and return as BED
- [ ] 

**Misc**
- [ ] Thread usage: 4 (non-WGS) or 8 (WGS)



In [2]:
import pysam
from utils import *
import os
import pandas as pd
import numpy as np

In [3]:
# Work from the SDrecall project directory.
os.chdir("/home/louisshe/work/SDrecall/")

# Absolute path of the input BAM used by the cells below.
bamfp = "/home/louisshe/SDrecall/data/test_bqsr_bam/HG002.bqsr.bam"

In [4]:
# Abort if the BAM still contains alignments flagged as duplicates.
# The context manager closes the BAM handle even when the assertion fires,
# which a trailing bamf.close() would not do.
with pysam.AlignmentFile(bamfp, "rb") as bamf:
    for alignment in bamf:
        assert not alignment.is_duplicate, "Duplicate alignments found in BAM file. Abort."

# Assign output VCF name: BAM basename up to its first dot, plus a fixed suffix
vcf_out_fn = os.path.basename(bamfp).split(".")[0] + ".homo_region.filtered.vcf"

In [1]:
import pybedtools
# Point pybedtools at the bedtools 2.30.0 binaries.
pybedtools.set_bedtools_path("/software/bedtools/2.30.0/bin")

# Sort and merge the BED file, then expose the result as an interval file.
bed_obj = (
    pybedtools.BedTool("/paedyl01/disk1/louisshe/ref/dmd_modifiers_coord.bed")
    .sort()
    .merge()
    .as_intervalfile()
)

In [45]:
def read_evaluator(read, filter_tags, filter_logic, min_mapq, mate_length=150):
    '''
    Return the inferred span of a usable read as a tuple, or None.

    Parameters:
    read: alignment object exposing the pysam AlignedSegment attributes used
          below (flags, mapping_quality, reference_* / next_reference_*,
          query_name, has_tag).
    filter_tags: list of tag names to test with read.has_tag(); an empty
                 list disables tag filtering.
    filter_logic: i. and: the read must carry all filter tags
                 ii.  or: the read must carry at least one filter tag
                iii. not: the read must carry none of the filter tags
                 Any other value logs a warning and the tag filter is skipped.
    min_mapq: reads with mapping_quality below this value are rejected.
    mate_length: assumed aligned length of the mate, used to approximate the
                 mate's end coordinate (default 150, the previously
                 hard-coded value).

    Returns:
    (chrom, start, end, query_name) for an accepted read, otherwise None.
    For a properly paired read1 whose mate lies on the same contig the span
    covers both mates; read2 of such a pair yields None so that every pair is
    reported once.

    Raises:
    AssertionError if filter_tags is not a list.
    '''
    assert isinstance(filter_tags, list), f"Filter tags must be a list of strings. ({filter_tags})"

    # With no filter imposed every read passes the tag filter
    filter_pass = True

    # Retrieve alignments with filter e.g. XA
    if filter_tags:
        tag_hits = (read.has_tag(tag) for tag in filter_tags)
        if filter_logic == "or":
            filter_pass = any(tag_hits)
        elif filter_logic == "and":
            filter_pass = all(tag_hits)
        elif filter_logic == "not":
            filter_pass = not any(tag_hits)
        else:
            logger.warning(f"Unsupported filter logic ({filter_logic}). Omitting filter ... ")

    if not filter_pass:
        return None

    # Select good reads: drop flagged alignments and anything mapped with a
    # quality BELOW the minimum (the comparison used to be inverted, which
    # discarded exactly the reads that met the threshold).
    unusable = (
        read.is_duplicate
        or read.is_unmapped
        or read.is_secondary
        or read.is_supplementary
        or read.is_qcfail
        or read.mapping_quality < min_mapq
    )
    if unusable:
        return None

    chrom, start, end = read.reference_name, read.reference_start, read.reference_end
    own_span = (chrom, start, end, read.query_name)

    if not read.mate_is_unmapped and read.is_proper_pair:
        # Process each pair once only: read2 falls through and yields None
        if read.is_read1:
            # mate is present on the same contig
            if read.next_reference_name == chrom:
                # Per pysam API, next_reference_end is not available, so the
                # mate's end is approximated as its start plus mate_length
                return (
                    chrom,
                    min(start, read.next_reference_start),
                    max(end, read.next_reference_start + mate_length),
                    read.query_name,
                )
            return own_span
    elif not read.is_proper_pair and read.is_paired:
        return own_span
    elif not read.is_paired:
        return own_span

    return None
    

def calculate_inferred_coverage(bam_file, min_mapq = 10, 
                               filter_tags = None, filter_logic = "and",
                               genome_file = "ucsc.hg19.contigsize.genome",
                               target_region = ""):
    '''
    This function is intended for identification of candidate realignment regions where template length >> 300bp.

    Every read accepted by read_evaluator() contributes its inferred span;
    the spans are piled up with bedtools genomecov and expanded to one row
    per covered base.

    Some expected inputs: 
    bam_file: path of the BAM to scan (fetch() is used, so it is presumably
              expected to be indexed - confirm)
    min_mapq: mapping-quality threshold forwarded to read_evaluator()
    filter_tags: a list of tags to search (None or [] disables the filter)
    filter_logic: i. and: match all filter tags
                 ii.  or: match any one filter tags
                iii. not: not match any filter tags
    genome_file: provided, do not change
    target_region: BED file path of regions of interest; when it is not an
                   existing file the whole BAM is scanned

    Returns:
    pandas DataFrame with integer column labels: 0 = contig, 1 = base
    position, 2 = the value read from interval.name of each bedGraph record
    (the depth column of `genomecov -bg`). The frame is empty when no read
    qualifies.

    Raises:
    AssertionError if genome_file is not an existing file.
    '''
    import pybedtools as pb
    import pysam

    assert os.path.isfile(genome_file), "Invalid genome file provided. "

    if filter_tags is None:
        filter_tags = []

    # A set removes spans reported more than once (e.g. a read fetched twice)
    candidate_regions = set()

    # The context manager releases the BAM handle even if evaluation fails
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        if os.path.isfile(target_region):
            # Case: Valid regions provided. Plain iteration copes with an
            # empty BED file, where a bare next() would raise StopIteration
            targets = pb.BedTool(target_region).sort().merge()
            reads = (
                read
                for interval in targets
                for read in bam.fetch(interval.chrom, interval.start, interval.end)
            )
        else:
            # Case: No region specified
            reads = bam.fetch()

        for read in reads:
            candidate_region = read_evaluator(read, filter_tags, filter_logic, min_mapq)
            if candidate_region is not None:
                candidate_regions.add(candidate_region)

    # Nothing qualified: return an empty frame instead of letting
    # pd.concat() fail on an empty list
    if not candidate_regions:
        return pd.DataFrame(columns=[0, 1, 2])

    # Acquire base coverage statistics
    genome_coverage = pb.BedTool(list(candidate_regions)).sort().genome_coverage(bg=True, g = genome_file)
    base_coverages = []
    for interval in genome_coverage:
        # bedGraph intervals are half-open [start, end); including `end`
        # would emit the boundary base twice with the neighbouring depth
        bases = [(interval.chrom, pos, interval.name) for pos in range(interval.start, interval.end)]
        base_coverages.append(pd.DataFrame.from_records(bases))

    if not base_coverages:
        return pd.DataFrame(columns=[0, 1, 2])
    return pd.concat(base_coverages)
    
# Inferred per-base coverage from reads that do not carry an XA tag.
ret = calculate_inferred_coverage(
    bamfp,
    filter_tags=["XA"],
    filter_logic="not",
    genome_file="work/SDrecall/ucsc.hg19.contigsize.genome",
)

In [9]:
# Star-import the helpers from read_properties; get_bam_frag_size is
# presumably defined there - confirm, since utils is star-imported as well.
from read_properties import *

# NOTE(review): the output recorded below this cell shows the call failing
# with UnboundLocalError on 'avg_insert_size'; the helper needs fixing before
# this result can be used.
get_bam_frag_size(bamfp)

UnboundLocalError: cannot access local variable 'avg_insert_size' where it is not associated with a value

In [58]:
# Scratch check: how str() renders a list.
test = list(range(1, 4))
str(test)

'[1, 2, 3]'