From 702e77f980c8b80aa3491f67c0e35d1d45344a56 Mon Sep 17 00:00:00 2001 From: kbergin Date: Thu, 7 Jul 2016 14:25:01 -0400 Subject: [PATCH] Starting New filter to keep pairs of interval together Responding to some review comments Java 8-ified test Updating tests and decided on what to do about non primary alignments Responding to review comments round 2 Removing an unused import Forgot to delete out a commented block of code. Ah Responding to Yossi's review comments --- .../htsjdk/samtools/filter/IntervalFilter.java | 5 +- .../samtools/filter/IntervalKeepPairFilter.java | 108 ++++++++++++++++++ .../filter/IntervalKeepPairFilterTest.java | 123 +++++++++++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 src/main/java/htsjdk/samtools/filter/IntervalKeepPairFilter.java create mode 100644 src/test/java/htsjdk/samtools/filter/IntervalKeepPairFilterTest.java diff --git a/src/main/java/htsjdk/samtools/filter/IntervalFilter.java b/src/main/java/htsjdk/samtools/filter/IntervalFilter.java index ee3de6d6c..ff3620ae9 100644 --- a/src/main/java/htsjdk/samtools/filter/IntervalFilter.java +++ b/src/main/java/htsjdk/samtools/filter/IntervalFilter.java @@ -94,6 +94,9 @@ private void advanceInterval() { * @return true if the SAMRecords matches the filter, otherwise false */ public boolean filterOut(final SAMRecord first, final SAMRecord second) { - throw new UnsupportedOperationException("Paired IntervalFilter filter not implemented!"); + // This can never be implemented because if the bam is coordinate sorted, + // which it has to be for this filter, it will never get both the first and second reads together + // and the filterOut method goes in order of the intervals in coordinate order so it will miss reads. + throw new UnsupportedOperationException("Paired IntervalFilter filter cannot be implemented, use IntervalKeepPairFilter."); } } diff --git a/src/main/java/htsjdk/samtools/filter/IntervalKeepPairFilter.java b/src/main/java/htsjdk/samtools/filter/IntervalKeepPairFilter.java new file mode 100644 index 000000000..5a7961bbb --- /dev/null +++ b/src/main/java/htsjdk/samtools/filter/IntervalKeepPairFilter.java @@ -0,0 +1,108 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package htsjdk.samtools.filter; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMUtils; +import htsjdk.samtools.util.Interval; +import htsjdk.samtools.util.OverlapDetector; + +import java.util.Collection; +import java.util.List; + +/** + * Filter out SAMRecords where neither record of a pair overlaps a given set of + * intervals. If one record of a pair overlaps the interval list, than both are + * kept. It is required that the SAMRecords are passed in coordinate order, have + * non-null SAMFileHeaders, and that Mate Cigar (MC) is present. + * + * @author kbergin@broadinstitute.org + */ +public class IntervalKeepPairFilter implements SamRecordFilter { + private final OverlapDetector intervalOverlapDetector; + + /** + * Prepare to filter out SAMRecords that do not overlap the given list of + * intervals + * @param intervals + */ + public IntervalKeepPairFilter(final List intervals) { + this.intervalOverlapDetector = new OverlapDetector<>(0, 0); + this.intervalOverlapDetector.addAll(intervals, intervals); + } + + /** + * Determines whether a SAMRecord matches this filter. Takes record, finds + * the location of its mate using the MC tag. Checks if either record + * overlaps the current interval using overlap detector. If yes, return + * false -> don't filter it out. + * + * If a read is secondary or supplementary, filter read out. Use + * {@link IntervalFilter} if you want to keep these reads, but NOTE: the + * resulting bam may not be valid. + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches the filter, otherwise false + */ + public boolean filterOut(final SAMRecord record) { + if (record.isSecondaryOrSupplementary()) { + return true; + } + + if (!record.getReadUnmappedFlag() + && hasOverlaps(record.getReferenceName(), record.getStart(), record.getEnd())) { + return false; + } + + return record.getMateUnmappedFlag() || !hasOverlaps(record.getMateReferenceName(), + record.getMateAlignmentStart(), SAMUtils.getMateAlignmentEnd(record)); + } + + /** + * Returns true if the record overlaps any intervals in list, false otherwise. + * + * @param refSequence Reference contig name where record maps + * @param start Record alignment start + * @param end Record alignment end + * @return true if SAMRecord overlaps any intervals in list + */ + private boolean hasOverlaps(final String refSequence, final int start, final int end) { + final Interval readInterval = new Interval(refSequence, start, end); + final Collection overlapsRead = intervalOverlapDetector.getOverlaps(readInterval); + + return !overlapsRead.isEmpty(); + } + + /** + * Determines whether a pair of SAMRecord matches this filter + * + * @param first the first SAMRecord to evaluate + * @param second the second SAMRecord to evaluate + * + * @return true if both SAMRecords do not overlap the interval list + */ + public boolean filterOut(final SAMRecord first, final SAMRecord second) { + return filterOut(first) && filterOut(second); + } +} diff --git a/src/test/java/htsjdk/samtools/filter/IntervalKeepPairFilterTest.java b/src/test/java/htsjdk/samtools/filter/IntervalKeepPairFilterTest.java new file mode 100644 index 000000000..3d30255f5 --- /dev/null +++ b/src/test/java/htsjdk/samtools/filter/IntervalKeepPairFilterTest.java @@ -0,0 +1,123 @@ +package htsjdk.samtools.filter; + +import htsjdk.samtools.SAMRecordSetBuilder; +import htsjdk.samtools.util.CollectionUtil; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import htsjdk.samtools.util.Interval; +import java.util.List; +import java.util.ArrayList; +import java.util.stream.StreamSupport; + +public class IntervalKeepPairFilterTest { + private static final int READ_LENGTH = 151; + private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(); + + @BeforeTest + public void setUp() { + builder.setReadLength(READ_LENGTH); + // Will be kept when an interval overlaps chromosome 1 in the first 151 + // bases. + builder.addPair("mapped_pair_chr1", 0, 1, 151); + // Will be kept when an interval overlaps chromsome 2 in the first 151 + // bases. + builder.addPair("mapped_pair_chr2", 1, 1, 151); + // The first read should pass and second should not, but both will + // be kept in first test. + builder.addPair("one_of_pair", 0, 1, 1000); + // The second read is unmapped, but both should be kept in an + // interval test where the interval includes chromosome four, where + // read one will overlap. + builder.addPair("second_mate_unmapped", 3, -1, 1, 1000, false, true, + "151M", null, false, false, false, false, -1); + // The first read is unmapped but both should be kept in an + // interval test where the interval includes chromosome four, where + // read two will overlap. + builder.addPair("first_mate_unmapped", -1, 3, 1000, 1, true, false, + null, "151M", false, false, false, false, -1); + // This pair will overlap any interval that includes chromosome 1:1000 + builder.addPair("prove_one_of_pair", 0, 1000, 1000); + // These reads are unmapped and will not map to any intervals, so they + // are never kept. This is tested below. + builder.addPair("both_unmapped", -1, -1, 1, 1, true, true, null, null, + false, false, false, false, -1); + // Secondary alignments are never kept by the interval filter. + builder.addFrag("mapped_pair_chr1", 0, 1, false, false, "151M", null, -1, true, false); + // Supplementary alignment are never kept by the interval filter. + builder.addFrag("mapped_pair_chr1", 0, 1, false, false, "151M", null, -1, false, true); + } + + @Test(dataProvider = "testData") + public void testIntervalPairFilter(final List intervals, final long expectedPassingRecords) { + final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervals); + + long actualPassingRecords = StreamSupport.stream(builder.spliterator(), false) + .filter(rec -> !filter.filterOut(rec)) + .count(); + + Assert.assertEquals(actualPassingRecords, expectedPassingRecords); + } + + @Test + public void testUnmappedPair() { + final List intervalList = new ArrayList<>(); + + final Interval interval1 = new Interval("chr1", 1, 999); + final Interval interval2 = new Interval("chr3", 1, 2); + final Interval interval3 = new Interval("chr2", 1, 2); + final Interval interval4 = new Interval("chr4", 1, 2); + + intervalList.addAll(CollectionUtil.makeList(interval1, interval2, interval3, interval4)); + + final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervalList); + + boolean unmappedPassed = StreamSupport.stream(builder.spliterator(), false) + .filter(rec -> !filter.filterOut(rec)) + .anyMatch(rec -> rec.getReadName().equals("both_unmapped")); + + Assert.assertFalse(unmappedPassed); + } + + @Test + public void testNotPrimaryReads() { + final List intervalList = new ArrayList<>(); + final Interval interval1 = new Interval("chr1", 1, 999); + intervalList.add(interval1); + + final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervalList); + + boolean notPrimary = StreamSupport.stream(builder.spliterator(), false) + .filter(rec -> !filter.filterOut(rec)) + .anyMatch(rec -> rec.getNotPrimaryAlignmentFlag() || rec.getSupplementaryAlignmentFlag()); + + Assert.assertFalse(notPrimary); + } + + @DataProvider(name = "testData") + private Object[][] testData() { + Interval interval = new Interval("chr1", 1, 999); + final List intervalList_twoPair = new ArrayList<>(); + intervalList_twoPair.add(interval); + + interval = new Interval("chr3", 1, 2); + final List intervalList_noMatch = new ArrayList<>(); + intervalList_noMatch.add(interval); + + interval = new Interval("chr2", 1, 2); + final List intervalList_onePair = new ArrayList<>(); + intervalList_onePair.add(interval); + + interval = new Interval("chr4", 1, 2); + final List intervalList_unmapped = new ArrayList<>(); + intervalList_unmapped.add(interval); + + return new Object[][]{ + {intervalList_twoPair, 4}, + {intervalList_noMatch, 0}, + {intervalList_onePair, 2}, + {intervalList_unmapped, 4} + }; + } +}