Skip to content

Commit

Permalink
Merge pull request #469 from scireum/feature/aha/xml-tools
Browse files Browse the repository at this point in the history
Moves some XML/HTML tools from sirius-web to kernel
  • Loading branch information
Christian Schierle committed Jul 4, 2023
2 parents 6764fd6 + 5c9beb7 commit 3197103
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 0 deletions.
80 changes: 80 additions & 0 deletions src/main/java/sirius/kernel/commons/StringCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
package sirius.kernel.commons;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.UnaryOperator;
Expand All @@ -28,6 +31,7 @@ public class StringCleanup {
private static final Pattern PATTERN_NON_ALPHA_NUMERIC = Pattern.compile("([^\\p{L}\\d])");
private static final Pattern PATTERN_NON_LETTER = Pattern.compile("\\P{L}");
private static final Pattern PATTERN_NON_DIGIT = Pattern.compile("\\D");
private static final Pattern STRIP_XML_REGEX = Pattern.compile("\\s*</?[a-zA-Z0-9]+[^>]*>\\s*");

private static final Map<Integer, String> unicodeMapping = new TreeMap<>();

Expand Down Expand Up @@ -338,4 +342,80 @@ public static String reduceCharacters(String term) {

return result == null ? term : result.toString();
}

/**
 * Substitutes every XML tag in the given string with a single space.
 * <p>
 * A tag is matched together with any whitespace directly surrounding it. Tags are replaced one at a time,
 * always restarting on the result of the previous replacement, so the space left behind by one tag is
 * swallowed again when the neighbouring tag is replaced. Directly adjacent tags therefore collapse into a
 * single space instead of one space per tag.
 * <p>
 * The result may still start or end with a space. Most probably this should be followed by
 * {@link #reduceWhitespace(String)} and also {@link StringCleanup#trim(String)}
 *
 * @param input the input to process
 * @return the input with all tags replaced, or the input itself if it is considered empty
 */
public static String replaceXml(String input) {
    if (Strings.isEmpty(input)) {
        return input;
    }

    String current = input;
    while (true) {
        // Only the first remaining tag is replaced per pass; see the Javadoc for why this is not a replaceAll.
        String stripped = STRIP_XML_REGEX.matcher(current).replaceFirst(" ");
        if (Strings.areEqual(current, stripped)) {
            // Nothing changed in this pass, so no tag is left.
            return stripped;
        }
        current = stripped;
    }
}

/**
 * Escapes XML characters so that the given string can be safely embedded in XML.
 * <p>
 * The following replacements are performed, all other characters are copied as they are:
 * <ul>
 *     <li>{@code <} becomes {@code &lt;}</li>
 *     <li>{@code >} becomes {@code &gt;}</li>
 *     <li>{@code "} becomes {@code &quot;}</li>
 *     <li>{@code '} becomes {@code &#039;}</li>
 *     <li>{@code &} becomes {@code &amp;}</li>
 * </ul>
 *
 * @param input the input to process
 * @return the escaped string, or an empty string if the input is considered empty
 */
public static String escapeXml(@Nullable String input) {
    if (Strings.isEmpty(input)) {
        return "";
    }

    // Escaping can only grow the string, so start with a little headroom over the input length.
    StringBuilder result = new StringBuilder(input.length() + 16);

    // Walk the string by index instead of using a StringCharacterIterator: the iterator signals its end with
    // CharacterIterator.DONE, which is the char '\uFFFF'. An input containing that character would end the
    // loop early and silently drop the rest of the string.
    for (int index = 0; index < input.length(); index++) {
        char character = input.charAt(index);
        switch (character) {
            case '<':
                result.append("&lt;");
                break;
            case '>':
                result.append("&gt;");
                break;
            case '"':
                result.append("&quot;");
                break;
            case '\'':
                result.append("&#039;");
                break;
            case '&':
                result.append("&amp;");
                break;
            default:
                // not a special character, keep it as is
                result.append(character);
                break;
        }
    }

    return result.toString();
}

/**
 * Turns every line feed of the given string into an HTML line break.
 * <p>
 * This is a deliberately simplistic conversion: each {@code \n} is replaced by a {@code <br>} tag which is
 * surrounded by a space on both sides. Any other character, including a carriage return, is left alone.
 * <p>
 * Note that most modern browsers will probably be better off by using a CSS "whitespace" setting, but some
 * old html renderers need raw br tags to properly render.
 *
 * @param input the input to process
 * @return the converted string or <tt>null</tt> if the input was <tt>null</tt>
 */
public static String nlToBr(String input) {
    return input == null ? null : input.replace("\n", " <br> ");
}
}
16 changes: 16 additions & 0 deletions src/main/java/sirius/kernel/commons/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,22 @@ public static String cleanup(@Nullable String inputString, @Nonnull Iterable<Una
return value;
}

/**
 * Matches an opening (or self-closing) tag: a "less than" sign which is directly followed by at least one
 * letter or digit and eventually closed by a "greater than" sign.
 */
private static final Pattern DETECT_XML_REGEX = Pattern.compile("<[a-zA-Z0-9]+[^>]*>");

/**
 * Checks whether the given content contains something which looks like an XML tag.
 * <p>
 * This is a heuristic which only searches for an opening tag anywhere in the content. It neither parses
 * nor validates the content.
 *
 * @param content the content to check
 * @return {@code true} if at least one opening tag was found, {@code false} otherwise (which includes
 * content that is considered empty)
 */
public static boolean probablyContainsXml(@Nullable String content) {
    return !Strings.isEmpty(content) && DETECT_XML_REGEX.matcher(content).find();
}

/**
* Removes all umlauts and other decorated latin characters.
*
Expand Down
16 changes: 16 additions & 0 deletions src/test/java/sirius/kernel/commons/StringsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,22 @@ void cleanup() {
assertEquals("HELLO", Strings.cleanup("Héllo", StringCleanup::reduceCharacters, StringCleanup::uppercase));
assertEquals("Hello", Strings.cleanup("Hel-lo", StringCleanup::removePunctuation));
assertEquals("Hello", Strings.cleanup("\10Hello", StringCleanup::removeControlCharacters));
assertEquals("Test", Strings.cleanup("<b>Test</b>", StringCleanup::replaceXml, StringCleanup::trim));
assertEquals("Test", Strings.cleanup("<b>Test</b>", StringCleanup::replaceXml, StringCleanup::trim));
assertEquals("Test", Strings.cleanup("<b>Test<br><img /></b>", StringCleanup::replaceXml, StringCleanup::trim));
assertEquals("Test Blubb", Strings.cleanup("<b>Test<br><img />Blubb</b>", StringCleanup::replaceXml, StringCleanup::trim));
assertEquals("foo having < 3 m, with >= 3 m", Strings.cleanup("foo having < 3 m, with >= 3 m", StringCleanup::replaceXml, StringCleanup::trim));
assertEquals("&lt;b&gt;Foo &lt;br /&gt; Bar&lt;/b&gt;", Strings.cleanup("<b>Foo <br /> Bar</b>", StringCleanup::escapeXml));
assertEquals("Hello <br> World", Strings.cleanup("Hello\nWorld", StringCleanup::nlToBr));
}

@Test
void probablyContainsXml() {
    // Every sample contains at least one opening tag and must therefore be detected.
    String[] samplesWithTags = {"<b>Test</b>", "<br>", "<br />", "<br test=\"foo\">"};
    for (String sample : samplesWithTags) {
        assertTrue(Strings.probablyContainsXml(sample));
    }

    // Comparison operators which are followed by a space must not be mistaken for tags.
    assertFalse(Strings.probablyContainsXml("foo having < 3 m, with >= 3 m"));
}

@Test
Expand Down

0 comments on commit 3197103

Please sign in to comment.