|
|
@@ -46,8 +46,8 @@ |
|
|
private static enum Attr { DuplicateScore }
|
|
|
|
|
|
/** Calculates a score for the read which is the sum of scores over Q15. */
|
|
|
- private static short getSumOfBaseQualities(final SAMRecord rec) {
|
|
|
- short score = 0;
|
|
|
+ private static int getSumOfBaseQualities(final SAMRecord rec) {
|
|
|
+ int score = 0;
|
|
|
for (final byte b : rec.getBaseQualities()) {
|
|
|
if (b >= 15) score += b;
|
|
|
}
|
|
|
@@ -64,6 +64,8 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS |
|
|
|
|
|
/**
|
|
|
* Returns the duplicate score computed from the given fragment.
|
|
|
+ * value should be capped by Short.MAX_VALUE/2 since the score from two reads will be
|
|
|
+ * added and an overflow will be
|
|
|
*
|
|
|
* If true is given to assumeMateCigar, then any score that can use the mate cigar to compute the mate's score will return the score
|
|
|
* computed on both ends.
|
|
|
@@ -72,24 +74,40 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS |
|
|
Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore);
|
|
|
|
|
|
if (storedScore == null) {
|
|
|
- short score = 0;
|
|
|
-
|
|
|
+ short score=0;
|
|
|
switch (scoringStrategy) {
|
|
|
case SUM_OF_BASE_QUALITIES:
|
|
|
- score += getSumOfBaseQualities(record);
|
|
|
+ // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
|
|
|
+ // and risk overflow.
|
|
|
+ score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2);
|
|
|
break;
|
|
|
case TOTAL_MAPPED_REFERENCE_LENGTH:
|
|
|
if (!record.getReadUnmappedFlag()) {
|
|
|
- score += record.getCigar().getReferenceLength();
|
|
|
+ // no need to remember the score since this scoring mechanism is symmetric
|
|
|
+ score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2);
|
|
|
}
|
|
|
if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
|
|
|
- score += SAMUtils.getMateCigar(record).getReferenceLength();
|
|
|
+ score += (short) Math.min(SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2);
|
|
|
}
|
|
|
break;
|
|
|
+ // The RANDOM score gives the same score to both reads so that they get filtered together.
|
|
|
+ // it's not critical do use the readName since the scores from both ends get added, but it seem
|
|
|
+ // to be clearer this way.
|
|
|
case RANDOM:
|
|
|
- score += (short) (hasher.hashUnencodedChars(record.getReadName()) >> 16);
|
|
|
+ // start with a random number between Short.MIN_VALUE/4 and Short.MAX_VALUE/4
|
|
|
+ score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111);
|
|
|
+ // subtract Short.MIN_VALUE/4 from it to end up with a number between
|
|
|
+ // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
|
|
|
+ // not passing filters. We need to stay far from overflow so that when we add the two
|
|
|
+ // scores from the two read mates we do not overflow since that could cause us to chose a
|
|
|
+ // failing read-pair instead of a passing one.
|
|
|
+ score -= Short.MIN_VALUE / 4;
|
|
|
}
|
|
|
|
|
|
+ // make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
|
|
|
+ // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
|
|
|
+ score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0;
|
|
|
+
|
|
|
storedScore = score;
|
|
|
record.setTransientAttribute(Attr.DuplicateScore, storedScore);
|
|
|
}
|
|
|
@@ -125,7 +143,7 @@ public static int compare(final SAMRecord rec1, final SAMRecord rec2, final Scor |
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Compare two records based on their duplicate scores. The duplicate scores for each record is assume to be
|
|
|
+ * Compare two records based on their duplicate scores. The duplicate scores for each record is assumed to be
|
|
|
* pre-computed by computeDuplicateScore and stored in the "DS" tag. If the scores are equal, we break
|
|
|
* ties based on mapping quality (added to the mate's mapping quality if paired and mapped), then library/read name.
|
|
|
*
|
|
|
|