Adds a function isWithinHammingDistance that checks if two strings ar… #690

Merged
merged 4 commits into from Sep 20, 2016
@@ -545,4 +545,59 @@ public static int levenshteinDistance(final String string1, final String string2
return i;
}
+
+ /**
+ * Calculates the Hamming distance (number of character mismatches) between two strings s1 and s2.
+ * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
+ * the two strings are of different lengths. Hamming distance is case sensitive and does not have
+ * any special treatment for DNA.
+ *
+ * @param s1 The first string to compare
+ * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
+ * @return Hamming distance between s1 and s2.
+ * @throws IllegalArgumentException If the two strings have differing lengths.
+ */
+ public static int hammingDistance(final String s1, final String s2) {
@nh13

nh13 Aug 25, 2016

Contributor

Since this is likely going to be used to compare DNA sequences, you probably want the doc to state that the comparison is case sensitive. Also, do you have a need to count or not count Ns as mismatches?

@fleharty

fleharty Aug 25, 2016

Contributor

@nh13 Added comment about case sensitive.
I'm also adding a note that matching Ns will not be counted as mismatches, since that isn't desired here, but I can see that someone else might want that.

@lbergelson

lbergelson Sep 15, 2016

Contributor

@fleharty Sorry for the very slow response. I think this note about Ns is more confusing than it is clarifying. I would say something more like "This implementation is case sensitive and does not have any special treatment for DNA."

+ if (s1.length() != s2.length()) {
+ throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths. " +
+ "The first string has length " + s1.length() + " and the second string has length " + s2.length() + ".");
+ }
+ int measuredDistance = 0;
+ for (int i = 0;i < s1.length();i++) {
+ if (s1.charAt(i) != s2.charAt(i)) {
+ measuredDistance++;
+ }
+ }
+ return measuredDistance;
+ }
+
+ /**
+ * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric.
+ * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
+ * the two strings are of different lengths. Hamming distance is case sensitive and does not have any
+ * special treatment for DNA.
+ *
+ * @param s1 The first string to compare
+ * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
+ * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true.
+ * @return true if the two strings are within maxHammingDistance of each other, false otherwise.
+ * @throws IllegalArgumentException If the two strings have differing lengths.
+ */
@nh13

nh13 Aug 25, 2016

Contributor

Ditto about the doc usage for DNA bases.

@fleharty

fleharty Aug 25, 2016

Contributor

fixed

+ public static boolean isWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance) {
+ if (s1.length() != s2.length()) {
+ throw new IllegalArgumentException("Attempted to determine if two strings of different length were within a specified edit distance.");
+ }
+ int measuredDistance = 0;
+ for (int i = 0;i < s1.length();i++) {
+ if (s1.charAt(i) != s2.charAt(i)) {
+ measuredDistance++;
+ // If the measuredDistance is larger than the maxHammingDistance we can short circuit and return
+ // false, there is no need to continue evaluating the distance.
+ if (measuredDistance > maxHammingDistance) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
}
@@ -67,4 +67,56 @@ public void testSplit(final String input, final String[] expectedResult, final b
{"A:BB:C:", new String[]{"A", "BB", "C:"}, true},
};
}
+
+ @DataProvider(name="withinHammingDistanceProvider")
+ public Object[][] isWithinHammingDistanceProvider() {
+ return new Object[][] {
+ {"ATAC", "GCAT", 3, true},
+ {"ATAC", "GCAT", 2, false},
+ {"ATAC", "GCAT", 1, false},
+ {"ATAC", "GCAT", 0, false}
+ };
+ }
+
+ @Test(dataProvider = "withinHammingDistanceProvider")
+ public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) {
+ Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult);
+ }
+
+ @DataProvider(name="withinHammingDistanceExceptionProvider")
+ public Object[][] isWithinHammingDistanceException() {
+ return new Object[][] {
+ {"ATAC", "GCT" , 3},
+ {"ATAC", "AT" , 2},
+ {"ATAC", "T" , 1},
+ {"" , "GCAT", 0}
+ };
+ }
+
+ @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+ public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+ StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance);
+ }
+
+ @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+ public void testHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+ StringUtil.hammingDistance(s1, s2);
+ }
+
+ @DataProvider(name="hammingDistanceProvider")
+ public Object[][] hammingDistance() {
+ return new Object[][] {
+ {"ATAC" , "GCAT" , 3},
+ {"ATAGC", "ATAGC", 0},
+ {"ATAC" , "atac" , 4}, // Hamming distance is case sensitive.
+ {"" , "" , 0}, // Two empty strings should have Hamming distance of 0.
+ {"nAGTN", "nAGTN", 0} // Ensure that matching Ns are not counted as mismatches.
+ };
+ }
+
+ @Test(dataProvider = "hammingDistanceProvider")
+ public void testHammingDistance(final String s1, final String s2, final int expectedResult) {
+ Assert.assertEquals(StringUtil.hammingDistance(s1, s2), expectedResult);
+ }
+
}