forked from jhy/jsoup
/
Document.java
149 lines (127 loc) · 4.16 KB
/
Document.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package org.jsoup.nodes;
import org.apache.commons.lang.Validate;
import org.jsoup.parser.Tag;
import java.util.List;
import java.util.ArrayList;
/**
 An HTML Document. The document itself is the root element (tag {@code #root}); the {@code html}, {@code head} and
 {@code body} elements live beneath it.

 @author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {

    /**
     Create a new, empty Document. No {@code html}, {@code head} or {@code body} elements are created; use
     {@link #createShell} for that.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String baseUri) {
        super(Tag.valueOf("#root"), baseUri);
    }

    /**
     Create a valid, empty shell of a document, suitable for adding more elements to.
     @param baseUri baseUri of document; must not be null
     @return document with html, head, and body elements.
     */
    public static Document createShell(String baseUri) {
        Validate.notNull(baseUri);

        Document doc = new Document(baseUri);
        Element html = doc.appendElement("html");
        html.appendElement("head");
        html.appendElement("body");

        return doc;
    }

    /**
     Accessor to the document's {@code head} element.
     @return the first {@code head} element in the document, or {@code null} if there is none
     */
    public Element head() {
        return getElementsByTag("head").first();
    }

    /**
     Accessor to the document's {@code body} element.
     @return the first {@code body} element in the document, or {@code null} if there is none
     */
    public Element body() {
        return getElementsByTag("body").first();
    }

    /**
     Get the string contents of the document's {@code title} element.
     @return Trimmed title, or empty string if none set.
     */
    public String title() {
        Element titleEl = getElementsByTag("title").first();
        return titleEl != null ? titleEl.text().trim() : "";
    }

    /**
     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
     not present. If the document has no {@code head} (or no {@code html}) element yet, those are created first, so
     this is safe to call on a freshly constructed, empty document.
     @param title string to set as title; must not be null
     */
    public void title(String title) {
        Validate.notNull(title);
        Element titleEl = getElementsByTag("title").first();
        if (titleEl == null) {
            // no title yet: add one to head, creating head on demand rather than dereferencing a null head()
            titleEl = findOrCreate("head").appendElement("title");
        }
        titleEl.text(title);
    }

    /**
     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
     @param tagName element tag name (e.g. {@code a})
     @return new element
     */
    public Element createElement(String tagName) {
        return new Element(Tag.valueOf(tagName), this.baseUri());
    }

    /**
     Normalise the document. This happens after the parse phase so generally does not need to be called.
     Ensures the {@code html}, {@code head} and {@code body} elements exist, then moves any non-blank text that is a
     direct child of the document, of {@code html}, or of {@code head} into the body.
     @return this document after normalisation
     */
    public Document normalise() {
        if (select("html").isEmpty())
            appendElement("html");
        // resolve html once; the structure edits below never add or remove html elements
        Element html = select("html").first();

        // NOTE(review): a missing head is appended to html, so it lands after an existing body - confirm acceptable
        if (head() == null)
            html.appendElement("head");
        if (body() == null)
            html.appendElement("body");

        normalise(this);
        normalise(html);
        normalise(head());

        return this;
    }

    // Moves the non-blank text nodes that are direct children of the given element into the body, each preceded by
    // a single-space text node. Does not recurse. the result order isn't great here (not intuitive); they are in
    // the body though.
    private void normalise(Element element) {
        // collect first: the child list cannot be modified while it is being iterated
        List<Node> toMove = new ArrayList<Node>();
        for (Node node : element.childNodes) {
            if (node instanceof TextNode) {
                TextNode tn = (TextNode) node;
                if (!tn.isBlank())
                    toMove.add(tn);
            }
        }
        if (toMove.isEmpty())
            return;

        // look the body up once; moving text nodes does not change which element is the body
        Element body = body();
        for (Node node : toMove) {
            element.removeChild(node);
            body.appendChild(new TextNode(" ", ""));
            body.appendChild(node);
        }
    }

    // Returns the first element with the given tag name, creating it under html (and creating html under this
    // document) when it does not exist yet. Used so the setters work on documents lacking a head or body.
    private Element findOrCreate(String tagName) {
        Element found = getElementsByTag(tagName).first();
        if (found != null)
            return found;

        Element html = getElementsByTag("html").first();
        if (html == null)
            html = appendElement("html");
        return html.appendElement(tagName);
    }

    /**
     Get the document's HTML. A document has no wrapper tag of its own, so this is its inner HTML.
     @return HTML of the document's contents
     */
    @Override
    public String outerHtml() {
        return super.html(); // no outer wrapper tag
    }

    /**
     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
     If the document has no {@code body} element yet, one is created (along with {@code html} if needed).
     @param text unencoded text
     @return this document
     */
    @Override
    public Element text(String text) {
        // overridden to not nuke doc structure: only the body's content is replaced
        findOrCreate("body").text(text);
        return this;
    }

    /**
     Get the node name of this document.
     @return the constant {@code #document}
     */
    @Override
    public String nodeName() {
        return "#document";
    }
}