Permalink
Browse files

Starting

New filter to keep pairs of interval together

Responding to some review comments

Java 8-ified test

Updating tests and decided on what to do about non primary alignments

Responding to review comments round 2

Removing an unused import

Forgot to delete out a commented block of code. Ah

Responding to Yossi's review comments
  • Loading branch information...
1 parent 0875b62 commit 702e77f980c8b80aa3491f67c0e35d1d45344a56 @kbergin kbergin committed Jul 7, 2016
@@ -94,6 +94,9 @@ private void advanceInterval() {
* @return true if the SAMRecords matches the filter, otherwise false
*/
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
- throw new UnsupportedOperationException("Paired IntervalFilter filter not implemented!");
+ // This can never be implemented because if the bam is coordinate sorted,
+ // which it has to be for this filter, it will never get both the first and second reads together
+ // and the filterOut method goes in order of the intervals in coordinate order so it will miss reads.
+ throw new UnsupportedOperationException("Paired IntervalFilter filter cannot be implemented, use IntervalKeepPairFilter.");
}
}
@@ -0,0 +1,108 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package htsjdk.samtools.filter;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMUtils;
+import htsjdk.samtools.util.Interval;
+import htsjdk.samtools.util.OverlapDetector;
+
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Filter out SAMRecords where neither record of a pair overlaps a given set of
+ * intervals. If one record of a pair overlaps the interval list, than both are
+ * kept. It is required that the SAMRecords are passed in coordinate order, have
+ * non-null SAMFileHeaders, and that Mate Cigar (MC) is present.
+ *
+ * @author kbergin@broadinstitute.org
+ */
+public class IntervalKeepPairFilter implements SamRecordFilter {
+ private final OverlapDetector<Interval> intervalOverlapDetector;
+
+ /**
+ * Prepare to filter out SAMRecords that do not overlap the given list of
+ * intervals
+ * @param intervals
+ */
+ public IntervalKeepPairFilter(final List<Interval> intervals) {
+ this.intervalOverlapDetector = new OverlapDetector<>(0, 0);
+ this.intervalOverlapDetector.addAll(intervals, intervals);
+ }
+
+ /**
+ * Determines whether a SAMRecord matches this filter. Takes record, finds
+ * the location of its mate using the MC tag. Checks if either record
+ * overlaps the current interval using overlap detector. If yes, return
+ * false -> don't filter it out.
+ *
+ * If a read is secondary or supplementary, filter read out. Use
+ * {@link IntervalFilter} if you want to keep these reads, but NOTE: the
+ * resulting bam may not be valid.
+ *
+ * @param record the SAMRecord to evaluate
+ * @return true if the SAMRecord matches the filter, otherwise false
+ */
+ public boolean filterOut(final SAMRecord record) {
+ if (record.isSecondaryOrSupplementary()) {
+ return true;
+ }
+
+ if (!record.getReadUnmappedFlag()
+ && hasOverlaps(record.getReferenceName(), record.getStart(), record.getEnd())) {
+ return false;
+ }
+
+ return record.getMateUnmappedFlag() || !hasOverlaps(record.getMateReferenceName(),
+ record.getMateAlignmentStart(), SAMUtils.getMateAlignmentEnd(record));
+ }
+
+ /**
+ * Returns true if the record overlaps any intervals in list, false otherwise.
+ *
+ * @param refSequence Reference contig name where record maps
+ * @param start Record alignment start
+ * @param end Record alignment end
+ * @return true if SAMRecord overlaps any intervals in list
+ */
+ private boolean hasOverlaps(final String refSequence, final int start, final int end) {
+ final Interval readInterval = new Interval(refSequence, start, end);
+ final Collection<Interval> overlapsRead = intervalOverlapDetector.getOverlaps(readInterval);
+
+ return !overlapsRead.isEmpty();
+ }
+
+ /**
+ * Determines whether a pair of SAMRecord matches this filter
+ *
+ * @param first the first SAMRecord to evaluate
+ * @param second the second SAMRecord to evaluate
+ *
+ * @return true if both SAMRecords do not overlap the interval list
+ */
+ public boolean filterOut(final SAMRecord first, final SAMRecord second) {
+ return filterOut(first) && filterOut(second);
+ }
+}
@@ -0,0 +1,123 @@
+package htsjdk.samtools.filter;
+
+import htsjdk.samtools.SAMRecordSetBuilder;
+import htsjdk.samtools.util.CollectionUtil;
+import org.testng.Assert;
+import org.testng.annotations.BeforeTest;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import htsjdk.samtools.util.Interval;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.stream.StreamSupport;
+
+public class IntervalKeepPairFilterTest {
+ private static final int READ_LENGTH = 151;
+ private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
+
+ @BeforeTest
+ public void setUp() {
+ builder.setReadLength(READ_LENGTH);
+ // Will be kept when an interval overlaps chromosome 1 in the first 151
+ // bases.
+ builder.addPair("mapped_pair_chr1", 0, 1, 151);
+ // Will be kept when an interval overlaps chromsome 2 in the first 151
+ // bases.
+ builder.addPair("mapped_pair_chr2", 1, 1, 151);
+ // The first read should pass and second should not, but both will
+ // be kept in first test.
+ builder.addPair("one_of_pair", 0, 1, 1000);
+ // The second read is unmapped, but both should be kept in an
+ // interval test where the interval includes chromosome four, where
+ // read one will overlap.
+ builder.addPair("second_mate_unmapped", 3, -1, 1, 1000, false, true,
+ "151M", null, false, false, false, false, -1);
+ // The first read is unmapped but both should be kept in an
+ // interval test where the interval includes chromosome four, where
+ // read two will overlap.
+ builder.addPair("first_mate_unmapped", -1, 3, 1000, 1, true, false,
+ null, "151M", false, false, false, false, -1);
+ // This pair will overlap any interval that includes chromosome 1:1000
+ builder.addPair("prove_one_of_pair", 0, 1000, 1000);
+ // These reads are unmapped and will not map to any intervals, so they
+ // are never kept. This is tested below.
+ builder.addPair("both_unmapped", -1, -1, 1, 1, true, true, null, null,
+ false, false, false, false, -1);
+ // Secondary alignments are never kept by the interval filter.
+ builder.addFrag("mapped_pair_chr1", 0, 1, false, false, "151M", null, -1, true, false);
+ // Supplementary alignment are never kept by the interval filter.
+ builder.addFrag("mapped_pair_chr1", 0, 1, false, false, "151M", null, -1, false, true);
+ }
+
+ @Test(dataProvider = "testData")
+ public void testIntervalPairFilter(final List<Interval> intervals, final long expectedPassingRecords) {
+ final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervals);
+
+ long actualPassingRecords = StreamSupport.stream(builder.spliterator(), false)
+ .filter(rec -> !filter.filterOut(rec))
+ .count();
+
+ Assert.assertEquals(actualPassingRecords, expectedPassingRecords);
+ }
+
+ @Test
+ public void testUnmappedPair() {
+ final List<Interval> intervalList = new ArrayList<>();
+
+ final Interval interval1 = new Interval("chr1", 1, 999);
+ final Interval interval2 = new Interval("chr3", 1, 2);
+ final Interval interval3 = new Interval("chr2", 1, 2);
+ final Interval interval4 = new Interval("chr4", 1, 2);
+
+ intervalList.addAll(CollectionUtil.makeList(interval1, interval2, interval3, interval4));
+
+ final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervalList);
+
+ boolean unmappedPassed = StreamSupport.stream(builder.spliterator(), false)
+ .filter(rec -> !filter.filterOut(rec))
+ .anyMatch(rec -> rec.getReadName().equals("both_unmapped"));
+
+ Assert.assertFalse(unmappedPassed);
+ }
+
+ @Test
+ public void testNotPrimaryReads() {
+ final List<Interval> intervalList = new ArrayList<>();
+ final Interval interval1 = new Interval("chr1", 1, 999);
+ intervalList.add(interval1);
+
+ final IntervalKeepPairFilter filter = new IntervalKeepPairFilter(intervalList);
+
+ boolean notPrimary = StreamSupport.stream(builder.spliterator(), false)
+ .filter(rec -> !filter.filterOut(rec))
+ .anyMatch(rec -> rec.getNotPrimaryAlignmentFlag() || rec.getSupplementaryAlignmentFlag());
+
+ Assert.assertFalse(notPrimary);
+ }
+
+ @DataProvider(name = "testData")
+ private Object[][] testData() {
+ Interval interval = new Interval("chr1", 1, 999);
+ final List<Interval> intervalList_twoPair = new ArrayList<>();
+ intervalList_twoPair.add(interval);
+
+ interval = new Interval("chr3", 1, 2);
+ final List<Interval> intervalList_noMatch = new ArrayList<>();
+ intervalList_noMatch.add(interval);
+
+ interval = new Interval("chr2", 1, 2);
+ final List<Interval> intervalList_onePair = new ArrayList<>();
+ intervalList_onePair.add(interval);
+
+ interval = new Interval("chr4", 1, 2);
+ final List<Interval> intervalList_unmapped = new ArrayList<>();
+ intervalList_unmapped.add(interval);
+
+ return new Object[][]{
+ {intervalList_twoPair, 4},
+ {intervalList_noMatch, 0},
+ {intervalList_onePair, 2},
+ {intervalList_unmapped, 4}
+ };
+ }
+}

0 comments on commit 702e77f

Please sign in to comment.