Skip to content

Commit

Permalink
Working on escape/unescape routine.
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Aug 2, 2010
1 parent 6bde6c8 commit 8bb490a
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 38 deletions.
34 changes: 34 additions & 0 deletions src/main/java/org/jsoup/nodes/Document.java
Expand Up @@ -5,6 +5,7 @@

import java.util.List;
import java.util.ArrayList;
import java.nio.charset.Charset;

/**
A HTML Document.
Expand Down Expand Up @@ -148,5 +149,38 @@ public Element text(String text) {
public String nodeName() {
// A document always reports this fixed, constant node name.
return "#document";
}

/**
 * Output settings for a Document: they control the form of the text() and html() methods.
 * <p>
 * A new instance starts with the {@code base} escape mode and the UTF-8 charset. Every setter
 * returns this same instance so calls can be chained.
 */
public static class OutputSettings {
    private Entities.EscapeMode escapeMode;
    private Charset charset;

    /**
     * Creates settings with the defaults: base escape mode, UTF-8 charset.
     */
    public OutputSettings() {
        escapeMode = Entities.EscapeMode.base;
        charset = Charset.forName("UTF-8");
    }

    /**
     * Gets the current escape mode.
     * @return the escape mode held by these settings
     */
    public Entities.EscapeMode escapeMode() {
        return this.escapeMode;
    }

    /**
     * Replaces the escape mode.
     * @param escapeMode the escape mode to store
     * @return these settings, for chaining
     */
    public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
        this.escapeMode = escapeMode;
        return this;
    }

    /**
     * Gets the current charset.
     * @return the charset held by these settings
     */
    public Charset charset() {
        return this.charset;
    }

    /**
     * Replaces the charset.
     * @param charset the charset to store
     * @return these settings, for chaining
     */
    public OutputSettings charset(Charset charset) {
        this.charset = charset;
        return this;
    }

    /**
     * Replaces the charset, looking it up by name via {@link Charset#forName(String)}; any
     * exception thrown by that lookup propagates to the caller and the stored charset is
     * left untouched.
     * @param charset the name of the charset to store
     * @return these settings, for chaining
     */
    public OutputSettings charset(String charset) {
        final Charset resolved = Charset.forName(charset);
        this.charset = resolved;
        return this;
    }
}
}

@@ -1,4 +1,6 @@
package org.jsoup;
package org.jsoup.nodes;

import org.jsoup.parser.TokenQueue;

import java.util.HashMap;
import java.util.Map;
Expand All @@ -13,31 +15,88 @@
* Draft implementation. Do not consume.
*/
class Entities {
static final Map<String, Integer> base;
static final Map<String, Integer> full;
static final Map<Integer, String> fullByVal;
/**
 * Selects which entity-name table {@code escape} consults when writing named entities:
 * {@code extended} picks the full table, any other value picks the base table.
 */
public enum EscapeMode {
base, extended
}

static String escape(String string, Charset charset) {
// todo: this needs option to: use base names only (with numerical as fallback)
StringBuilder accum = new StringBuilder((int) (string.length() * 1.5));
CharsetEncoder encoder = charset.newEncoder();
private static final Map<String, Character> base;
private static final Map<String, Character> full;
private static final Map<Character, String> baseByVal;
private static final Map<Character, String> fullByVal;

/**
 * Escapes a string for output, one UTF-16 char at a time: a char with a name in the table
 * chosen by {@code escapeMode} is written as {@code &name;}, a char the encoder can encode is
 * copied through unchanged, and anything else is written as a decimal reference
 * {@code &#NNN;}.
 * <p>
 * NOTE(review): this body appears to show superseded and replacement lines together (two
 * consecutive table lookups, and two numeric appends after the final {@code else}) --
 * confirm which of each pair is live before relying on the exact output.
 *
 * @param string the text to escape
 * @param encoder encoder for the output charset; queried per char via {@code canEncode}
 * @param escapeMode {@code extended} selects the full name table, otherwise the base table
 * @return the escaped text
 */
static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) {
// sized at twice the input length to leave room for entity expansion
StringBuilder accum = new StringBuilder(string.length() * 2);
Map<Character, String> map = escapeMode == EscapeMode.extended ? fullByVal : baseByVal;

// works on individual chars, so a character outside the BMP is seen as two surrogate chars
for (int pos = 0; pos < string.length(); pos++) {
Character c = string.charAt(pos);
if (fullByVal.containsKey((int) c))
accum.append("&").append(fullByVal.get((int) c)).append(";");
if (map.containsKey(c))
accum.append("&").append(map.get(c)).append(";");
else if (encoder.canEncode(c))
accum.append(c);
else
accum.append("&#").append((int)c).append(";");
accum.append("&#").append((int) c).append(";");
}

return accum.toString();
}

/**
 * Decodes HTML character references in a string.
 * <p>
 * Formats dealt with: {@code &amp} (no semicolon), {@code &amp;}, {@code &#123;} (decimal) and
 * {@code &#x7B;} (hex). Named references are looked up by their exact spelling first, so that
 * names differing only by case stay distinct, and then by their lower-cased spelling, so that
 * input such as {@code &LT} still decodes. Numeric references may name any code point up to
 * {@code Character.MAX_CODE_POINT}; values above 0xFFFF are written as a surrogate pair.
 * <p>
 * A reference that cannot be decoded (unknown name, malformed or out-of-range number) is written
 * back as {@code &name;} or {@code &#digits;} -- always with a trailing semicolon, whether or not
 * the input had one. A bare {@code &} or {@code &#} that is not followed by a word is copied
 * through unchanged.
 *
 * @param string the text to decode
 * @return the decoded text; the input instance itself when it contains no {@code &}
 */
static String unescape(String string) {
    if (!string.contains("&"))
        return string;

    StringBuilder accum = new StringBuilder(string.length());
    TokenQueue cq = new TokenQueue(string);

    while (!cq.isEmpty()) {
        accum.append(cq.consumeTo("&"));
        if (!cq.matches("&")) { // ran to end
            accum.append(cq.remainder());
            break;
        }
        cq.advance(); // past &

        boolean isNum = cq.matches("#");
        if (isNum)
            cq.advance(); // past #

        String val = cq.consumeWord(); // entity name, or the digits of a numeric reference
        if (val.length() == 0) {
            // not a reference at all: put back exactly what was consumed, '#' included
            accum.append(isNum ? "&#" : "&");
            continue;
        }
        if (cq.matches(";"))
            cq.advance();

        int charval = -1; // -1 marks "could not decode"
        if (isNum) {
            try {
                char radixMark = val.charAt(0);
                if (radixMark == 'x' || radixMark == 'X')
                    charval = Integer.parseInt(val.substring(1), 16);
                else
                    charval = Integer.parseInt(val, 10);
            } catch (NumberFormatException e) {
                // malformed or overflowing number: leave charval at -1 so the text is re-emitted
            }
        } else {
            // exact spelling first keeps e.g. Aacute and aacute apart; the lower-cased retry
            // accepts sloppy input and also works if the table is keyed by lower-cased names
            Character named = full.get(val);
            if (named == null)
                named = full.get(val.toLowerCase());
            if (named != null)
                charval = named.charValue();
        }

        if (charval < 0 || charval > Character.MAX_CODE_POINT) {
            // undecodable: re-emit, keeping the '#' so a numeric reference stays numeric
            accum.append('&');
            if (isNum)
                accum.append('#');
            accum.append(val).append(';');
        } else {
            accum.appendCodePoint(charval);
        }
    }

    return accum.toString();
}

// base entities can be unescaped without trailing ;
// most common, base entities can be unescaped without trailing ;
// e.g. &amp
static final Object[][] baseArray = {
private static final Object[][] baseArray = {
{"AElig", 0x000C6},
{"AMP", 0x00026},
{"Aacute", 0x000C1},
Expand Down Expand Up @@ -146,7 +205,9 @@ else if (encoder.canEncode(c))
{"yuml", 0x000FF}
};

static final Object[][] fullArray = {
// In most situations it will be better to use UTF-8 and emit the character directly, or to use the numerical escape.
// Are people really likely to remember "&CounterClockwiseContourIntegral;"? Good grief.
private static final Object[][] fullArray = {
{"AElig", 0x000C6},
{"AMP", 0x00026},
{"Aacute", 0x000C1},
Expand Down Expand Up @@ -2182,14 +2243,20 @@ else if (encoder.canEncode(c))
};

// Builds the entity lookup tables from baseArray and fullArray: name -> value maps (base, full)
// and value -> name maps (baseByVal, fullByVal).
// NOTE(review): this initializer appears to show superseded and replacement lines together (the
// Integer-typed map constructions and puts alongside the Character-typed ones) -- confirm which
// set is live.
static {
base = new HashMap<String, Integer>(baseArray.length);
full = new HashMap<String, Integer>(fullArray.length);
fullByVal = new HashMap<Integer, String>(fullArray.length);
base = new HashMap<String, Character>(baseArray.length);
full = new HashMap<String, Character>(fullArray.length);
baseByVal = new HashMap<Character, String>(baseArray.length);
fullByVal = new HashMap<Character, String>(fullArray.length);

for (Object[] entity : baseArray) base.put(((String) entity[0]).toLowerCase(), (Integer) entity[1]);
for (Object[] entity : baseArray) {
// each row is {name, code}; the code is narrowed to a single char
Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
// name -> char keeps the name as declared; char -> name stores the lower-cased name
base.put((String) entity[0], c);
baseByVal.put(c, ((String) entity[0]).toLowerCase());
}
for (Object[] entity : fullArray) {
full.put(((String) entity[0]).toLowerCase(), (Integer) entity[1]);
fullByVal.put((Integer) entity[1], ((String) entity[0]));
// same narrowing and casing rules as the base table above
Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
full.put((String) entity[0], c);
fullByVal.put(c, ((String) entity[0]).toLowerCase());
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/main/java/org/jsoup/parser/TokenQueue.java
Expand Up @@ -122,6 +122,13 @@ public boolean matchesWord() {
return !isEmpty() && Character.isLetterOrDigit(peek());
}

/**
 * Steps past the next character on the queue without returning it. Does nothing when the
 * queue is already empty.
 */
public void advance() {
    if (isEmpty())
        return;
    pos++;
}

/**
* Consume one character off queue.
* @return first character on queue.
Expand Down
18 changes: 0 additions & 18 deletions src/test/java/org/jsoup/EntitiesTest.java

This file was deleted.

30 changes: 30 additions & 0 deletions src/test/java/org/jsoup/nodes/EntitiesTest.java
@@ -0,0 +1,30 @@
package org.jsoup.nodes;

import org.junit.Test;

import static org.junit.Assert.*;
import org.jsoup.nodes.Entities;

import java.nio.charset.Charset;

public class EntitiesTest {
    /** Escapes one sample string against ASCII (base and extended names) and UTF-8 (base names). */
    @Test public void escape() {
        final Charset ascii = Charset.forName("ascii");
        final Charset utf8 = Charset.forName("UTF-8");
        final String text = "Hello &<> Å π 新 there";

        // a fresh encoder per call, so no encoder state is shared between escapes
        String escapedAscii = Entities.escape(text, ascii.newEncoder(), Entities.EscapeMode.base);
        String escapedAsciiFull = Entities.escape(text, ascii.newEncoder(), Entities.EscapeMode.extended);
        String escapedUtf = Entities.escape(text, utf8.newEncoder(), Entities.EscapeMode.base);

        assertEquals("Hello &amp;&lt;&gt; &aring; &#960; &#26032; there", escapedAscii);
        // odd that the same character is named aring in the base table but angst in the full one
        assertEquals("Hello &amp;&lt;&gt; &angst; &pi; &#26032; there", escapedAsciiFull);
        assertEquals("Hello &amp;&lt;&gt; &aring; π 新 there", escapedUtf);
    }

    /** Unescapes named, decimal and hex references, with and without a trailing semicolon. */
    @Test public void unescape() {
        final String text = "Hello &amp;&LT&gt; &ANGST &#960; &#960 &#x65B0; there &!";
        final String unescaped = Entities.unescape(text);
        assertEquals("Hello &<> Å π π 新 there &!", unescaped);

        // undecodable references are not reproduced 100% faithfully (a ';' is added);
        // not sure how correct the fallback needs to be
        final String fallback = Entities.unescape("&0987654321; &unknown");
        assertEquals("&0987654321; &unknown;", fallback);
    }
}

0 comments on commit 8bb490a

Please sign in to comment.