diff --git a/CHANGES b/CHANGES index 06e6ff5790..46f2f791aa 100644 --- a/CHANGES +++ b/CHANGES @@ -1,14 +1,21 @@ jsoup changelog -*** Release 0.2.0 (pending) +*** Release 0.2.2 (2010-Feb-07) + * jsoup packages are now available in the Maven central repository. + + * New feature: supports Element#addClass, removeClass, toggleClass; + also collection class methods on Elements. * New feature: supports Element#wrap(html) and Elements#wrap(html). * New selector syntax: supports E + F adjacent sibling selector * New selector systax: supports E ~ F preceding sibling selector * New: supports Element#elementSiblingIndex() + + * Improved document normalisation. + * Improved HTML string output format (pretty-print) * Fixed absolute URL resolution issue when a base tag has no href. -*** Release 0.1.2 (2010-Jan-02) +*** Release 0.1.2 (2010-Feb-02) * Fixed unrecognised tag handler to be more permissive diff --git a/pom.xml b/pom.xml index d555eefb8a..79bba7cad8 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.jsoup jsoup - 0.1.3-SNAPSHOT + 0.2.3-SNAPSHOT jsoup HTML parser http://jsoup.org/ 2009 @@ -76,6 +76,48 @@ + + + + sonatype-nexus-snapshots + Sonatype Nexus Snapshots + http://oss.sonatype.org/content/repositories/snapshots + + + sonatype-nexus-staging + Nexus Release Repository + http://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + + release-sign-artifacts + + + performRelease + true + + + + + + org.apache.maven.plugins + maven-gpg-plugin + + + sign-artifacts + verify + + sign + + + + + + + + @@ -103,4 +145,16 @@ UTF-8 + + + jhy + Jonathan Hedley + jonathan@hedley.net + + Lead Developer + + +11 + + + \ No newline at end of file diff --git a/src/main/java/org/jsoup/nodes/Comment.java b/src/main/java/org/jsoup/nodes/Comment.java index 193f3b51a8..8bf392791b 100644 --- a/src/main/java/org/jsoup/nodes/Comment.java +++ b/src/main/java/org/jsoup/nodes/Comment.java @@ -30,6 +30,7 @@ public String getData() { } void outerHtml(StringBuilder accum) { + indent(accum); accum.append(String.format("", getData())); } diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java index 5f96855ddc..1e17bcbe0d 100644 --- a/src/main/java/org/jsoup/nodes/Document.java +++ b/src/main/java/org/jsoup/nodes/Document.java @@ -3,6 +3,9 @@ import org.apache.commons.lang.Validate; import org.jsoup.parser.Tag; +import java.util.List; +import java.util.ArrayList; + /** A HTML Document. @@ -84,6 +87,44 @@ public Element createElement(String tagName) { return new Element(Tag.valueOf(tagName), this.baseUri()); } + /** + Normalise the document. This happens after the parse phase so generally does not need to be called. + Moves any text content that is not in the body element into the body. + @return this document after normalisation + */ + public Document normalise() { + if (select("html").isEmpty()) + appendElement("html"); + if (head() == null) + select("html").first().appendElement("head"); + if (body() == null) + select("html").first().appendElement("body"); + + normalise(this); + normalise(select("html").first()); + normalise(head()); + + return this; + } + + // does not recurse. the result order isn't great here (not intuitive); they are in the body though. + private void normalise(Element element) { + List toMove = new ArrayList(); + for (Node node: element.childNodes) { + if (node instanceof TextNode) { + TextNode tn = (TextNode) node; + if (!tn.isBlank()) + toMove.add(tn); + } + } + + for (Node node: toMove) { + element.removeChild(node); + body().appendChild(new TextNode(" ", "")); + body().appendChild(node); + } + } + @Override public String outerHtml() { return super.html(); // no outer wrapper tag diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index be021f8f19..e16a1ffbca 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1,6 +1,7 @@ package org.jsoup.nodes; import org.apache.commons.lang.Validate; +import org.apache.commons.lang.StringUtils; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.select.Collector; @@ -571,6 +572,25 @@ public Element text(String text) { return this; } + /** + Test if this element has any text content (that is not just whitespace). + @return true if element has non-blank text content. + */ + public boolean hasText() { + for (Node child: childNodes) { + if (child instanceof TextNode) { + TextNode textNode = (TextNode) child; + if (!textNode.isBlank()) + return true; + } else if (child instanceof Element) { + Element el = (Element) child; + if (el.hasText()) + return true; + } + } + return false; + } + /** * Get the combined data of this element. Data is e.g. the inside of a {@code script} tag. * @return the data, or empty string if none @@ -602,17 +622,29 @@ public String className() { /** * Get all of the element's class names. E.g. on element {@code
}, - * returns a set of two elements {@code "header", "gray"}. + * returns a set of two elements {@code "header", "gray"}. Note that modifications to this set are not pushed to + * the backing {@code class} attribute; use the {@link #classNames(java.util.Set)} method to persist them. * @return set of classnames, empty if no class attribute */ public Set classNames() { if (classNames == null) { String[] names = className().split("\\s+"); - classNames = new HashSet(Arrays.asList(names)); + classNames = new LinkedHashSet(Arrays.asList(names)); } return classNames; } + /** + Set the element's {@code class} attribute to the supplied class names. + @param classNames set of classes + @return this element, for chaining + */ + public Element classNames(Set classNames) { + Validate.notNull(classNames); + attributes.put("class", StringUtils.join(classNames, " ")); + return this; + } + /** * Tests if this element has a class. * @param className name of class to check for @@ -622,8 +654,57 @@ public boolean hasClass(String className) { return classNames().contains(className); } + /** + Add a class name to this element's {@code class} attribute. + @param className class name to add + @return this element + */ + public Element addClass(String className) { + Validate.notNull(className); + + Set classes = classNames(); + classes.add(className); + classNames(classes); + + return this; + } + + /** + Remove a class name from this element's {@code class} attribute. + @param className class name to remove + @return this element + */ + public Element removeClass(String className) { + Validate.notNull(className); + + Set classes = classNames(); + classes.remove(className); + classNames(classes); + + return this; + } + + /** + Toggle a class name on this element's {@code class} attribute: if present, remove it; otherwise add it. + @param className class name to toggle + @return this element + */ + public Element toggleClass(String className) { + Validate.notNull(className); + + Set classes = classNames(); + if (classes.contains(className)) + classes.remove(className); + else + classes.add(className); + classNames(classes); + + return this; + } + void outerHtml(StringBuilder accum) { - + if (isBlock() || (parent() != null && parent().tag().canContainBlock() && siblingIndex() == 0)) + indent(accum); accum .append("<") .append(tagName()) @@ -633,12 +714,9 @@ void outerHtml(StringBuilder accum) { accum.append(" />"); } else { accum.append(">"); - if (tag.canContainBlock()) - accum.append("\n"); html(accum); + if (tag.canContainBlock()) indent(accum); accum.append(""); - if (tag.isBlock()) - accum.append("\n"); } } diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java index 36cdddcd24..0e09509a80 100644 --- a/src/main/java/org/jsoup/nodes/Node.java +++ b/src/main/java/org/jsoup/nodes/Node.java @@ -2,6 +2,7 @@ import org.apache.commons.lang.NotImplementedException; import org.apache.commons.lang.Validate; +import org.apache.commons.lang.StringUtils; import java.net.MalformedURLException; import java.net.URL; @@ -222,6 +223,13 @@ protected void addChild(Node in) { childNodes.add(in); in.parentNode = this; } + + protected int nodeDepth() { + if (parentNode == null) + return 0; + else + return parentNode.nodeDepth() + 1; + } /** Retrieves this node's sibling nodes. Effectively, {@link #childNodes() node.parent.childNodes()}. @@ -259,6 +267,16 @@ public Node previousSibling() { return null; } + /** + * Get the list index of this node in its node sibling list. I.e. if this is the first node + * sibling, returns 0. + * @return position in node sibling list + * @see org.jsoup.nodes.Element#elementSiblingIndex() + */ + public Integer siblingIndex() { + return indexInList(this, parent().childNodes); + } + protected static Integer indexInList(N search, List nodes) { Validate.notNull(search); Validate.notNull(nodes); @@ -291,6 +309,10 @@ public String toString() { return outerHtml(); } + protected void indent(StringBuilder accum) { + accum.append("\n").append(StringUtils.leftPad("", nodeDepth() -1 * 2)); + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/src/main/java/org/jsoup/nodes/TextNode.java b/src/main/java/org/jsoup/nodes/TextNode.java index 6861cf0442..f88c5d2059 100644 --- a/src/main/java/org/jsoup/nodes/TextNode.java +++ b/src/main/java/org/jsoup/nodes/TextNode.java @@ -2,6 +2,7 @@ import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.Validate; +import org.apache.commons.lang.StringUtils; /** A text node. @@ -34,12 +35,22 @@ public String getWholeText() { return attributes.get(TEXT_KEY); } + /** + Test if this text node is blank -- that is, empty or only whitespace (including newlines). + @return true if this document is empty or only whitespace, false if it contains any text content. + */ + public boolean isBlank() { + return StringUtils.isBlank(normaliseWhitespace(getWholeText())); + } + void outerHtml(StringBuilder accum) { String html = StringEscapeUtils.escapeHtml(getWholeText()); if (parent() instanceof Element && !((Element) parent()).preserveWhitespace()) { html = normaliseWhitespace(html); } + if (!isBlank() && parentNode instanceof Element && ((Element) parentNode).tag().canContainBlock() && siblingIndex() == 0) + indent(accum); accum.append(html); } diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index f5694a7702..5957958ff4 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -78,7 +78,7 @@ private Document parse() { parseTextNode(); } } - return doc; + return doc.normalise(); } private void parseComment() { diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index d5c210d114..67a4e50228 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -208,13 +208,13 @@ public String toString() { // head // all ancestors set to (head, body): so implicitly create head, but allow in body - createInline("SCRIPT").setAncestor("HEAD", "BODY").setContainDataOnly(); - createInline("NOSCRIPT").setAncestor("HEAD", "BODY"); - createInline("STYLE").setAncestor("HEAD", "BODY").setContainDataOnly(); - createInline("META").setAncestor("HEAD", "BODY").setEmpty(); + createBlock("SCRIPT").setAncestor("HEAD", "BODY").setContainDataOnly(); + createBlock("NOSCRIPT").setAncestor("HEAD", "BODY"); + createBlock("STYLE").setAncestor("HEAD", "BODY").setContainDataOnly(); + createBlock("META").setAncestor("HEAD", "BODY").setEmpty(); createBlock("LINK").setAncestor("HEAD", "BODY").setEmpty(); // only within head createInline("OBJECT").setAncestor("HEAD", "BODY"); // flow (block/inline) or param - createInline("TITLE").setAncestor("HEAD", "BODY").setContainDataOnly(); + createBlock("TITLE").setAncestor("HEAD", "BODY").setContainDataOnly(); createInline("BASE").setAncestor("HEAD", "BODY").setEmpty(); createBlock("FRAME").setAncestor("FRAMESET").setEmpty(); diff --git a/src/main/java/org/jsoup/select/Elements.java b/src/main/java/org/jsoup/select/Elements.java index 2efe5fc92a..2c0f66cd34 100644 --- a/src/main/java/org/jsoup/select/Elements.java +++ b/src/main/java/org/jsoup/select/Elements.java @@ -41,12 +41,16 @@ public String attr(String attributeKey) { } /** - Checks if the first matched value has this attribute set. + Checks if any of the matched elements have this attribute set. @param attributeKey attribute key - @return true if the first element has the attribute; false if it doesn't, or if no elements were matched. + @return true if any of the elements have the attribute; false if none do. */ public boolean hasAttr(String attributeKey) { - return !contents.isEmpty() && first().hasAttr(attributeKey); + for (Element element : contents) { + if (element.hasAttr(attributeKey)) + return true; + } + return false; } /** @@ -73,6 +77,55 @@ public Elements removeAttr(String attributeKey) { } return this; } + + /** + Add the class name to every matched element's {@code class} attribute. + @param className class name to add + @return this + */ + public Elements addClass(String className) { + for (Element element : contents) { + element.addClass(className); + } + return this; + } + + /** + Remove the class name from every matched element's {@code class} attribute, if present. + @param className class name to remove + @return this + */ + public Elements removeClass(String className) { + for (Element element : contents) { + element.removeClass(className); + } + return this; + } + + /** + Toggle the class name on every matched element's {@code class} attribute. + @param className class name to add if missing, or remove if present, from every element. + @return this + */ + public Elements toggleClass(String className) { + for (Element element : contents) { + element.toggleClass(className); + } + return this; + } + + /** + Determine if any of the matched elements have this class name set in their {@code class} attribute. + @param className class name to check for + @return true if any do, false if none do + */ + public boolean hasClass(String className) { + for (Element element : contents) { + if (element.hasClass(className)) + return true; + } + return false; + } /** * Get the combined text of all the matched elements. @@ -92,6 +145,14 @@ public String text() { return sb.toString(); } + public boolean hasText() { + for (Element element: contents) { + if (element.hasText()) + return true; + } + return false; + } + /** Wrap the supplied HTML around each matched elements. For example, with HTML {@code

This is Jsoup

}, diff --git a/src/test/java/org/jsoup/TextUtil.java b/src/test/java/org/jsoup/TextUtil.java index 101bba4542..735714b8b3 100644 --- a/src/test/java/org/jsoup/TextUtil.java +++ b/src/test/java/org/jsoup/TextUtil.java @@ -6,7 +6,7 @@ @author Jonathan Hedley, jonathan@hedley.net */ public class TextUtil { public static String stripNewlines(String text) { - text = text.replaceAll("[\\n\\r]", ""); + text = text.replaceAll("\\n\\s*", ""); return text; } } diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index 9df87fe76a..08ebf2f26a 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -150,6 +150,19 @@ public class ElementTest { assertFalse(doc.hasClass("mellow")); } + @Test public void testClassUpdates() { + Document doc = Jsoup.parse("
"); + Element div = doc.select("div").first(); + + div.addClass("green"); + assertEquals("mellow yellow green", div.className()); + div.removeClass("red"); // noop + div.removeClass("yellow"); + assertEquals("mellow green", div.className()); + div.toggleClass("green").toggleClass("red"); + assertEquals("mellow red", div.className()); + } + @Test public void testOuterHtml() { Document doc = Jsoup.parse("

Hello

there"); assertEquals("

Hello

there

", @@ -163,7 +176,7 @@ public class ElementTest { @Test public void testFormatHtml() { Document doc = Jsoup.parse("

Hello

"); - assertEquals("\n\n\n\n
\n

Hello

\n
\n\n", doc.html()); + assertEquals("\n\n\n\n
\n

Hello

\n
\n\n", doc.html()); } @Test public void testSetText() { @@ -198,7 +211,7 @@ public class ElementTest { Document doc = Jsoup.parse("

Hello

"); Element div = doc.getElementById("1"); div.appendText(" there & now >"); - assertEquals("

Hello

\n there & now >", div.html()); + assertEquals("

Hello

there & now >", TextUtil.stripNewlines(div.html())); } @Test public void testPrependText() { @@ -206,7 +219,7 @@ public class ElementTest { Element div = doc.getElementById("1"); div.prependText("there & now > "); assertEquals("there & now > Hello", div.text()); - assertEquals("there & now >

Hello

", div.html()); + assertEquals("there & now >

Hello

", TextUtil.stripNewlines(div.html())); } @Test public void testAddNewHtml() { @@ -250,5 +263,15 @@ public class ElementTest { assertEquals("

Hello

There!

", TextUtil.stripNewlines(doc.body().html())); } + @Test public void testHasText() { + Document doc = Jsoup.parse("

Hello

"); + Element div = doc.select("div").first(); + Elements ps = doc.select("p"); + + assertTrue(div.hasText()); + assertTrue(ps.first().hasText()); + assertFalse(ps.last().hasText()); + } + } diff --git a/src/test/java/org/jsoup/nodes/TextNodeTest.java b/src/test/java/org/jsoup/nodes/TextNodeTest.java new file mode 100644 index 0000000000..a5e21456d5 --- /dev/null +++ b/src/test/java/org/jsoup/nodes/TextNodeTest.java @@ -0,0 +1,25 @@ +package org.jsoup.nodes; + +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + Test TextNodes + + @author Jonathan Hedley, jonathan@hedley.net */ +public class TextNodeTest { + @Test public void testBlank() { + TextNode one = new TextNode("", ""); + TextNode two = new TextNode(" ", ""); + TextNode three = new TextNode(" \n\n ", ""); + TextNode four = new TextNode("Hello", ""); + TextNode five = new TextNode(" \nHello ", ""); + + assertTrue(one.isBlank()); + assertTrue(two.isBlank()); + assertTrue(three.isBlank()); + assertFalse(four.isBlank()); + assertFalse(five.isBlank()); + } +} diff --git a/src/test/java/org/jsoup/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/org/jsoup/safety/CleanerTest.java index ec0ffe05b5..db6a05f4b8 100644 --- a/src/test/java/org/jsoup/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/org/jsoup/safety/CleanerTest.java @@ -15,14 +15,14 @@ public class CleanerTest { String h = ""; String cleanHtml = Jsoup.clean(h, Whitelist.simpleText()); - assertEquals("Hello there!", cleanHtml); + assertEquals("Hello there!", TextUtil.stripNewlines(cleanHtml)); } @Test public void simpleBehaviourTest2() { String h = "Hello there!"; String cleanHtml = Jsoup.clean(h, Whitelist.simpleText()); - assertEquals("Hello there!", cleanHtml); + assertEquals("Hello there!", TextUtil.stripNewlines(cleanHtml)); } @Test public void basicBehaviourTest() { diff --git a/src/test/java/org/jsoup/parser/ParserTest.java b/src/test/java/org/jsoup/parser/ParserTest.java index b9ad058965..7fb9b5ca8e 100644 --- a/src/test/java/org/jsoup/parser/ParserTest.java +++ b/src/test/java/org/jsoup/parser/ParserTest.java @@ -221,7 +221,7 @@ public class ParserTest { @Test public void handlesFrames() { String h = ""; Document doc = Jsoup.parse(h); - assertEquals("", + assertEquals("", TextUtil.stripNewlines(doc.html())); } @@ -242,5 +242,17 @@ public class ParserTest { assertEquals("http://example.com/foo", a.attr("abs:href")); } + @Test public void normalisesDocument() { + String h = "OneTwoThreeFourFive Six Seven "; + Document doc = Jsoup.parse(h); + assertEquals("Five Six Seven One Two Four Three", + TextUtil.stripNewlines(doc.html())); // is spaced OK if not newline & space stripped + } + + @Test public void normlisesEmptyDocument() { + Document doc = Jsoup.parse(""); + assertEquals("",TextUtil.stripNewlines(doc.html())); + } + } diff --git a/src/test/java/org/jsoup/select/ElementsTest.java b/src/test/java/org/jsoup/select/ElementsTest.java index 9cbd88a626..c21c13d78f 100644 --- a/src/test/java/org/jsoup/select/ElementsTest.java +++ b/src/test/java/org/jsoup/select/ElementsTest.java @@ -37,6 +37,20 @@ public class ElementsTest { assertEquals("classy", ps.last().attr("style")); assertEquals("bar", ps.last().attr("class")); } + + @Test public void classes() { + Document doc = Jsoup.parse("

"); + + Elements els = doc.select("p"); + assertTrue(els.hasClass("red")); + assertFalse(els.hasClass("blue")); + els.addClass("blue"); + els.removeClass("yellow"); + els.toggleClass("mellow"); + + assertEquals("blue", els.get(0).className()); + assertEquals("red green blue mellow", els.get(1).className()); + } @Test public void text() { String h = "

Hello

there

world

"; @@ -44,6 +58,13 @@ public class ElementsTest { assertEquals("Hello there world", doc.select("div > *").text()); } + @Test public void hasText() { + Document doc = Jsoup.parse("

Hello

"); + Elements divs = doc.select("div"); + assertTrue(divs.hasText()); + assertFalse(doc.select("div + div").hasText()); + } + @Test public void wrap() { String h = "

This is jsoup

"; Document doc = Jsoup.parse(h);