/
document.rb
214 lines (194 loc) · 7.22 KB
/
document.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# coding: utf-8
# frozen_string_literal: true
require "pathname"
module Nokogiri
module HTML4
class Document < Nokogiri::XML::Document
###
# Look up the character encoding declared by this document's meta tags.
#
# A <meta charset="..."> element is consulted first. Failing that, the
# charset parameter inside the content attribute of a
# <meta http-equiv="Content-Type"> element is used.
#
# Returns the declared encoding name, or nil if none is declared.
def meta_encoding
  charset_node = at_xpath("//meta[@charset]")
  return charset_node[:charset] if charset_node

  content_type_node = meta_content_type
  return unless content_type_node

  # Pull just the charset value out of e.g. "text/html; charset=utf-8".
  content_type_node["content"][/charset\s*=\s*([\w-]+)/i, 1]
end
###
# Set the meta tag encoding for this document.
#
# If a meta encoding tag is already present, its content is replaced
# with the given text. A <meta http-equiv="Content-Type"> element is
# checked before a <meta charset> element.
#
# Otherwise a new meta element is created: <meta charset> when the
# internal subset reports an HTML5 DTD, and
# <meta http-equiv="Content-Type"> in every other case. It is placed at
# the start of the head element if there is one; if not, head and/or
# html elements are supplied as necessary, keeping the new element
# before any text node or content element (typically <body>).
#
# The result when trying to set an encoding that is different
# from the document encoding is undefined.
#
# Beware in CRuby, that libxml2 automatically inserts a meta tag
# into a head element.
#
# Returns +encoding+.
def meta_encoding=(encoding)
  content_type_meta = meta_content_type
  if content_type_meta
    content_type_meta["content"] = format("text/html; charset=%s", encoding)
    return encoding
  end

  charset_meta = at_xpath("//meta[@charset]")
  if charset_meta
    charset_meta["charset"] = encoding
    return encoding
  end

  # No declaration exists yet: build one in the form matching the doctype.
  new_meta = XML::Node.new("meta", self)
  doctype = internal_subset
  if doctype && doctype.html5_dtd?
    new_meta["charset"] = encoding
  else
    new_meta["http-equiv"] = "Content-Type"
    new_meta["content"] = format("text/html; charset=%s", encoding)
  end

  head = at_xpath("//head")
  head ? head.prepend_child(new_meta) : set_metadata_element(new_meta)
  encoding
end
# Find the first <meta> element whose http-equiv attribute is exactly
# "Content-Type" (compared case-insensitively) and which also carries a
# content attribute.
#
# Returns the matching node, or nil if there is none.
def meta_content_type
  candidates = xpath("//meta[@http-equiv and boolean(@content)]")
  candidates.find { |meta| /\AContent-Type\z/i.match?(meta["http-equiv"]) }
end
private :meta_content_type
###
# Get the title string of this document.
#
# Returns the inner text of the first title element, or nil if the
# document has no title tag.
def title
  title_node = at_xpath("//title")
  title_node&.inner_text
end
###
# Set the title string of this document.
#
# If a title element is already present, its content is replaced
# with the given text.
#
# Otherwise a new title element is created and placed, in order of
# preference: at the end of the head element if there is one; right
# after a meta encoding/charset tag if there is one; or via
# set_metadata_element, which supplies head and/or html elements as
# necessary and keeps the title before any text node or content
# element (typically <body>).
def title=(text)
  text_node = XML::Text.new(text, self)

  existing_title = at_xpath("//title")
  if existing_title
    existing_title.children = text_node
    return text
  end

  new_title = XML::Node.new("title", self) << text_node

  head = at_xpath("//head")
  return head << new_title if head

  charset_declaration = at_xpath("//meta[@charset]") || meta_content_type
  if charset_declaration
    # better put after charset declaration
    charset_declaration.add_next_sibling(new_title)
  else
    set_metadata_element(new_title)
  end
end
# Insert +element+ (a metadata node such as meta or title) at a suitable
# place in the document, trying these in order:
#
# 1. append it to an existing head element;
# 2. create a head as the first child of an existing html element and
#    put +element+ inside it;
# 3. put it immediately before the first top-level element or text node;
# 4. create html and head elements and put +element+ inside the head.
def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
  head = at_xpath("//head")
  return head << element if head

  html = at_xpath("//html")
  if html
    new_head = html.prepend_child(XML::Node.new("head", self))
    return new_head.prepend_child(element)
  end

  first_content = children.find do |node|
    node.is_a?(XML::Element) || node.is_a?(XML::Text)
  end

  if first_content
    # We reach here only if the underlying document model
    # allows <html>/<head> elements to be omitted and does not
    # automatically supply them.
    first_content.add_previous_sibling(element)
  else
    new_html = add_child(XML::Node.new("html", self))
    new_head = new_html.add_child(XML::Node.new("head", self))
    new_head.prepend_child(element)
  end
end
private :set_metadata_element
####
# Serialize Node using +options+. Save options can also be set using a block.
#
# When +options+ has no truthy :save_with entry,
# XML::Node::SaveOptions::DEFAULT_HTML is used. The hash passed in by the
# caller is never modified, so a frozen or shared options hash is safe.
#
# See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
#
# These two statements are equivalent:
#
#   node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
#
# or
#
#   node.serialize(:encoding => 'UTF-8') do |config|
#     config.format.as_xml
#   end
#
def serialize(options = {})
  # Work on a copy: writing the default into the caller's hash would raise
  # on a frozen hash and leak :save_with into a hash the caller reuses.
  options = options.dup
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  # Bare super forwards the current value of +options+ (the copy) and the block.
  super
end
####
# Create a Nokogiri::XML::DocumentFragment from +tags+, passing this
# document and its root node along to the new fragment.
def fragment(tags = nil)
  context_node = root
  DocumentFragment.new(self, tags, context_node)
end
# :call-seq:
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
#
# [Returns] The document type which determines CSS-to-XPath translation.
# For this class the value is always the DoctypeConfig::HTML4 constant.
#
# See XPathVisitor for more information.
def xpath_doctype
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
end
class << self
  ###
  # Parse HTML.
  #
  # +string_or_io+ may be a String, a Pathname, or any object that
  # responds to _read_ and _close_ such as an IO, or StringIO.
  # +url+ is the resource where this document is located; when omitted,
  # the input's +path+ is used if the input responds to it.
  # +encoding+ is the encoding that should be used when processing the
  # document; when omitted it is taken from the input's own encoding
  # (unless that is ASCII-8BIT) or detected through EncodingReader.
  # +options+ is a number that sets options in the parser, such as
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
  # Nokogiri::XML::ParseOptions. An Integer is wrapped in a ParseOptions
  # object, and the options are yielded to the block if one is given.
  #
  # A Pathname is opened here for reading and that file handle is closed
  # again before this method returns. IO objects supplied by the caller
  # are left open.
  def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
    # Handle opened by this method (Pathname input only); closed in +ensure+.
    opened_file = nil

    options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
    yield options if block_given?

    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

    if string_or_io.respond_to?(:encoding)
      unless string_or_io.encoding == Encoding::ASCII_8BIT
        encoding ||= string_or_io.encoding.name
      end
    end

    if string_or_io.respond_to?(:read)
      if string_or_io.is_a?(Pathname)
        # resolve the Pathname to the file and open it as an IO object, see #2110
        opened_file = string_or_io = string_or_io.expand_path.open
        url ||= string_or_io.path
      end

      unless encoding
        # No encoding known yet: read through an EncodingReader. The first
        # pass may raise EncodingFound carrying the encoding, in which
        # case the read is repeated below with that encoding.
        string_or_io = EncodingReader.new(string_or_io)
        begin
          return read_io(string_or_io, url, encoding, options.to_i)
        rescue EncodingReader::EncodingFound => e
          encoding = e.found_encoding
        end
      end
      return read_io(string_or_io, url, encoding, options.to_i)
    end

    # read_memory cannot cope with empty documents, so build one directly
    if string_or_io.nil? || string_or_io.empty?
      return encoding ? new.tap { |i| i.encoding = encoding } : new
    end

    encoding ||= EncodingReader.detect_encoding(string_or_io)
    read_memory(string_or_io, url, encoding, options.to_i)
  ensure
    # Close only what was opened above; never a caller-supplied IO.
    opened_file&.close
  end
end
end
end
end