diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java
index ecb1b3f49..90492533e 100644
--- a/src/main/java/htsjdk/samtools/util/StringUtil.java
+++ b/src/main/java/htsjdk/samtools/util/StringUtil.java
@@ -545,4 +545,63 @@ public static int levenshteinDistance(final String string1, final String string2
 
         return i;
     }
+
+    /**
+     * Calculates the Hamming distance (number of character mismatches) between two strings s1 and s2.
+     * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
+     * the two strings are of different lengths. Hamming distance is case sensitive and does not have
+     * any special treatment for DNA.
+     *
+     * @param s1 The first string to compare
+     * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
+     * @return Hamming distance between s1 and s2.
+     * @throws IllegalArgumentException If the two strings have differing lengths.
+     */
+    public static int hammingDistance(final String s1, final String s2) {
+        if (s1.length() != s2.length()) {
+            throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths. " +
+                    "The first string has length " + s1.length() + " and the second string has length " + s2.length() + ".");
+        }
+        int measuredDistance = 0;
+        for (int i = 0; i < s1.length(); i++) {
+            if (s1.charAt(i) != s2.charAt(i)) {
+                measuredDistance++;
+            }
+        }
+        return measuredDistance;
+    }
+
+    /**
+     * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric.
+     * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
+     * the two strings are of different lengths. Hamming distance is case sensitive and does not have any
+     * special treatment for DNA.
+     *
+     * @param s1 The first string to compare
+     * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
+     * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true.
+     *                           A negative value can never be satisfied, so the function returns false for it.
+     * @return true if the two strings are within maxHammingDistance of each other, false otherwise.
+     * @throws IllegalArgumentException If the two strings have differing lengths.
+     */
+    public static boolean isWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance) {
+        if (s1.length() != s2.length()) {
+            throw new IllegalArgumentException("Attempted to determine if two strings of differing lengths were within a specified Hamming distance. " +
+                    "The first string has length " + s1.length() + " and the second string has length " + s2.length() + ".");
+        }
+        int measuredDistance = 0;
+        for (int i = 0; i < s1.length(); i++) {
+            if (s1.charAt(i) != s2.charAt(i)) {
+                measuredDistance++;
+                // If the measuredDistance is larger than the maxHammingDistance we can short circuit and return
+                // false, there is no need to continue evaluating the distance.
+                if (measuredDistance > maxHammingDistance) {
+                    return false;
+                }
+            }
+        }
+        // Compare explicitly rather than returning true: when no mismatch is found the loop never tests the
+        // bound, so a negative maxHammingDistance must still be rejected here.
+        return measuredDistance <= maxHammingDistance;
+    }
 }
diff --git a/src/test/java/htsjdk/samtools/util/StringUtilTest.java b/src/test/java/htsjdk/samtools/util/StringUtilTest.java
index 91e8792f4..dbb2a0709 100644
--- a/src/test/java/htsjdk/samtools/util/StringUtilTest.java
+++ b/src/test/java/htsjdk/samtools/util/StringUtilTest.java
@@ -67,4 +67,59 @@ public void testSplit(final String input, final String[] expectedResult, final b
                 {"A:BB:C:", new String[]{"A", "BB", "C:"}, true},
         };
     }
+
+    @DataProvider(name="withinHammingDistanceProvider")
+    public Object[][] isWithinHammingDistanceProvider() {
+        return new Object[][] {
+                {"ATAC", "GCAT", 3, true},
+                {"ATAC", "GCAT", 2, false},
+                {"ATAC", "GCAT", 1, false},
+                {"ATAC", "GCAT", 0, false},
+                {"ATAC", "ATAC", 0, true},   // Identical strings are within a distance of 0.
+                {"ATAC", "ATAC", -1, false}, // A negative bound can never be satisfied.
+                {""    , ""    , 0, true}    // Two empty strings are within a distance of 0.
+        };
+    }
+
+    @Test(dataProvider = "withinHammingDistanceProvider")
+    public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) {
+        Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult);
+    }
+
+    @DataProvider(name="withinHammingDistanceExceptionProvider")
+    public Object[][] isWithinHammingDistanceException() {
+        return new Object[][] {
+                {"ATAC", "GCT" , 3},
+                {"ATAC", "AT"  , 2},
+                {"ATAC", "T"   , 1},
+                {""    , "GCAT", 0}
+        };
+    }
+
+    @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+    public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+        StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance);
+    }
+
+    @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+    public void testHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+        StringUtil.hammingDistance(s1, s2);
+    }
+
+    @DataProvider(name="hammingDistanceProvider")
+    public Object[][] hammingDistance() {
+        return new Object[][] {
+                {"ATAC" , "GCAT" , 3},
+                {"ATAGC", "ATAGC", 0},
+                {"ATAC" , "atac" , 4}, // Hamming distance is case sensitive.
+                {""     , ""     , 0}, // Two empty strings should have Hamming distance of 0.
+                {"nAGTN", "nAGTN", 0}  // Ensure that matching Ns are not counted as mismatches.
+        };
+    }
+
+    @Test(dataProvider = "hammingDistanceProvider")
+    public void testHammingDistance(final String s1, final String s2, final int expectedResult) {
+        Assert.assertEquals(StringUtil.hammingDistance(s1, s2), expectedResult);
+    }
+
 }