Permalink
Browse files

adding Hamming Distance functions to StringUtil (#690)

Adding hammingDistance and isWithinHammingDistance functions to StringUtil.
The Hamming distance is the number of mismatched positions between two equal-length strings.
  • Loading branch information...
1 parent b5fd3c0 commit 224cfc1f68f49d63c8d9e72dfc677703379ecd15 @fleharty fleharty committed with lbergelson Sep 20, 2016
@@ -545,4 +545,59 @@ public static int levenshteinDistance(final String string1, final String string2
return i;
}
+
+ /**
+  * Counts the positions at which two equal-length strings hold different characters, i.e. their
+  * Hamming distance. Characters are compared with a plain {@code char} inequality test, so the result
+  * is case sensitive and DNA bases (including N) receive no special treatment. Because the Hamming
+  * distance is undefined for strings of unequal length, such input is rejected with an exception.
+  *
+  * @param s1 The first string to compare
+  * @param s2 The second string to compare; swapping s1 and s2 yields the same result.
+  * @return The number of indices at which s1 and s2 differ; 0 for two empty strings.
+  * @throws IllegalArgumentException If the two strings have differing lengths.
+  */
+ public static int hammingDistance(final String s1, final String s2) {
+     final int length = s1.length();
+     if (length != s2.length()) {
+         throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths. " +
+                 "The first string has length " + length + " and the second string has length " + s2.length() + ".");
+     }
+
+     // Lengths are equal from here on, so a single index walks both strings in lock step.
+     int mismatches = 0;
+     for (int pos = 0; pos < length; pos++) {
+         mismatches += (s1.charAt(pos) == s2.charAt(pos)) ? 0 : 1;
+     }
+     return mismatches;
+ }
+
+ /**
+  * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric,
+  * i.e. whether the number of positions at which they differ is less than or equal to maxHammingDistance.
+  * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if
+  * the two strings are of different lengths. Hamming distance is case sensitive and does not have any
+  * special treatment for DNA.
+  *
+  * A negative maxHammingDistance can never be satisfied (a distance is at least 0), so for equal-length
+  * strings the method returns false in that case, even when the strings are identical.
+  *
+  * @param s1 The first string to compare
+  * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.
+  * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true.
+  * @return true if the two strings are within maxHammingDistance of each other, false otherwise.
+  * @throws IllegalArgumentException If the two strings have differing lengths.
+  */
+ public static boolean isWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance) {
+     if (s1.length() != s2.length()) {
+         throw new IllegalArgumentException("Attempted to determine if two strings of different length were within a specified Hamming distance. " +
+                 "The first string has length " + s1.length() + " and the second string has length " + s2.length() + ".");
+     }
+     int measuredDistance = 0;
+     for (int i = 0; i < s1.length(); i++) {
+         if (s1.charAt(i) != s2.charAt(i)) {
+             measuredDistance++;
+             // If the measuredDistance is larger than the maxHammingDistance we can short circuit and return
+             // false, there is no need to continue evaluating the distance.
+             if (measuredDistance > maxHammingDistance) {
+                 return false;
+             }
+         }
+     }
+     // The in-loop check only runs when a mismatch is found, so compare once more here: this is what
+     // makes a negative maxHammingDistance yield false for strings with no mismatches at all.
+     return measuredDistance <= maxHammingDistance;
+ }
}
@@ -67,4 +67,56 @@ public void testSplit(final String input, final String[] expectedResult, final b
{"A:BB:C:", new String[]{"A", "BB", "C:"}, true},
};
}
+
+ /**
+  * Supplies rows of {s1, s2, maxHammingDistance, expectedResult} for testIsWithinHammingDistance.
+  * "ATAC" and "GCAT" differ at three positions, so the first group walks the threshold across that value;
+  * the remaining rows cover identical strings, empty strings and case sensitivity.
+  */
+ @DataProvider(name="withinHammingDistanceProvider")
+ public Object[][] isWithinHammingDistanceProvider() {
+     return new Object[][] {
+             {"ATAC", "GCAT", 4, true},   // threshold above the actual distance
+             {"ATAC", "GCAT", 3, true},   // threshold equal to the actual distance
+             {"ATAC", "GCAT", 2, false},
+             {"ATAC", "GCAT", 1, false},
+             {"ATAC", "GCAT", 0, false},
+             {"ATAC", "ATAC", 0, true},   // identical strings have distance 0
+             {"", "", 0, true},           // two empty strings have distance 0
+             {"ATAC", "atac", 4, true},   // comparison is case sensitive: all four positions differ
+             {"ATAC", "atac", 3, false}
+     };
+ }
+
+ /** Checks that isWithinHammingDistance returns the expected verdict for each row of the provider. */
+ @Test(dataProvider = "withinHammingDistanceProvider")
+ public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) {
+     final boolean actualResult = StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance);
+     Assert.assertEquals(actualResult, expectedResult);
+ }
+
+ /**
+  * Supplies rows of {s1, s2, maxHammingDistance} in which s1 and s2 always have different lengths,
+  * including one row where the first string is empty.
+  */
+ @DataProvider(name="withinHammingDistanceExceptionProvider")
+ public Object[][] isWithinHammingDistanceException() {
+     final Object[][] unequalLengthCases = {
+             {"ATAC", "GCT", 3},
+             {"ATAC", "AT", 2},
+             {"ATAC", "T", 1},
+             {"", "GCAT", 0}
+     };
+     return unequalLengthCases;
+ }
+
+ // Every row from the provider pairs strings of different lengths; the test passes only if the call
+ // below throws IllegalArgumentException (declared via expectedExceptions).
+ @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+ public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+ StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance);
+ }
+
+ // Reuses the unequal-length rows of withinHammingDistanceExceptionProvider; the test passes only if
+ // hammingDistance throws IllegalArgumentException. maxHammingDistance is accepted because the shared
+ // provider supplies three columns, but it is not used by this test.
+ @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class)
+ public void testHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) {
+ StringUtil.hammingDistance(s1, s2);
+ }
+
+ /** Supplies rows of {s1, s2, expectedResult} pairing equal-length strings with their expected Hamming distance. */
+ @DataProvider(name="hammingDistanceProvider")
+ public Object[][] hammingDistance() {
+     final Object[][] distanceCases = {
+             {"ATAC", "GCAT", 3},
+             {"ATAGC", "ATAGC", 0},
+             // Hamming distance is case sensitive.
+             {"ATAC", "atac", 4},
+             // Two empty strings should have Hamming distance of 0.
+             {"", "", 0},
+             // Ensure that matching Ns are not counted as mismatches.
+             {"nAGTN", "nAGTN", 0}
+     };
+     return distanceCases;
+ }
+
+ /** Checks that hammingDistance returns the expected mismatch count for each row of the provider. */
+ @Test(dataProvider = "hammingDistanceProvider")
+ public void testHammingDistance(final String s1, final String s2, final int expectedResult) {
+     final int actualDistance = StringUtil.hammingDistance(s1, s2);
+     Assert.assertEquals(actualDistance, expectedResult);
+ }
+
}

0 comments on commit 224cfc1

Please sign in to comment.