/
data.rb
254 lines (223 loc) · 7.02 KB
/
data.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
require "cgi/util"
require "uri"
require "open-uri"
require "chupa-text/utf8-converter"
module ChupaText
class Data
# @return [URI, nil] The URI of the data if the data is for remote
# or local file, `nil` if the data isn't associated with any
# URIs.
attr_reader :uri
# @return [String, nil] The content of the data, `nil` if the data
# doesn't have any content.
attr_accessor :body
# @return [Integer, nil] The byte size of the data, `nil` if the data
# doesn't have any content.
attr_accessor :size
# @return [String, nil] The path associated with the content of
# the data, `nil` if the data isn't associated with any file.
#
# The path may not be related to the original content. For
# example, `"/tmp/XXX.txt"` may be returned for the data of
# `"http://example.com/XXX.txt"`.
#
# This value is useful to use an external command to extract
# text and meta-data.
attr_accessor :path
# @return [Attributes] The attributes of the data.
attr_reader :attributes
# @return [Data, nil] The source of the data. For example, text
# data (`hello.txt`) in archive data (`hello.tar`) has the
# archive data in {#source}.
attr_accessor :source
# @return [Screenshot, nil] The screenshot of the data. For example,
# the first page image of a PDF file.
attr_accessor :screenshot
# @param [Bool] value `true` when screenshot is needed.
# @return [Bool] the specified value
attr_writer :need_screenshot
# @return [Array<Integer, Integer>] the expected screenshot size.
attr_accessor :expected_screenshot_size
# Creates data that has no URI, body, size, path, MIME type, source
# and screenshot. Screenshot is needed by default and the expected
# screenshot size is `[200, 200]` by default.
#
# @param [Hash, nil] options The options of the data. `nil` is
#   treated as an empty Hash.
# @option options [Data] :source_data The data that the new data is
#   derived from. Its metadata is merged by {#merge!} and it's set
#   to {#source}.
def initialize(options={})
  @uri = nil
  @body = nil
  @size = nil
  @path = nil
  @mime_type = nil
  @attributes = Attributes.new
  @source = nil
  @screenshot = nil
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]
  @options = options || {}

  parent_data = @options[:source_data]
  return unless parent_data

  # Inherit the metadata first and then remember where it came from.
  merge!(parent_data)
  @source = parent_data
end
# Finishes a copy made by `dup` or `clone`: the copy gets its own
# duplicate of the attributes instead of keeping a reference to the
# attributes object of the original.
#
# @param [Data] object The original data.
#
# @return [Data] The copy itself.
def initialize_copy(object)
  super(object)
  own_attributes = @attributes.dup
  @attributes = own_attributes
  self
end
# Merges metadata from data.
#
# The URI, the path, all attributes and the screenshot related
# settings of `data` are copied to this data. If the MIME type of
# `data` is available, it's prepended to the `"source-mime-types"`
# attribute of this data.
#
# @param [Data] data The data to be merged.
#
# @return [void]
def merge!(data)
  self.uri = data.uri
  self.path = data.path
  data.attributes.each do |name, value|
    self[name] = value
  end
  # Resolve the MIME type only once: it may be guessed from the path
  # and the body each time it's requested.
  source_mime_type = data.mime_type
  if source_mime_type
    # Build a new array instead of calling unshift on the current
    # value. The current value may be the same Array object that
    # `data` holds (it was copied by reference above), and changing it
    # in place would also change the attributes of `data`.
    inherited_mime_types = self["source-mime-types"] || []
    self["source-mime-types"] = [source_mime_type, *inherited_mime_types]
  end
  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
end
# Sets the URI of the data and updates {#path} to match.
#
# * `Pathname`: The expanded path is converted to a `file://` URI and
#   the given `Pathname` itself is set to {#path}.
# * `nil`: Both of the URI and {#path} are reset to `nil`.
# * Others: A non `URI` value is parsed by `URI.parse` and the path
#   component of the URI is set to {#path}.
#
# @param [Pathname, String, URI, nil] uri The URI for the data. If
#   `uri` is `nil`, it means that the data isn't associated with any
#   URIs.
def uri=(uri)
  case uri
  when Pathname
    # each_filename skips the root, so the root directory itself is
    # converted to "file:///".
    escaped_components = uri.expand_path.each_filename.map do |component|
      # CGI.escape is for form encoding: it converts a space to "+",
      # but "+" is a literal plus sign in a URI path. A literal "+" in
      # the component is already escaped as "%2B", so every "+" left
      # here stands for a space.
      CGI.escape(component).gsub("+", "%20")
    end
    @uri = URI.parse("file:///#{escaped_components.join("/")}")
    self.path = uri
  when NilClass
    @uri = nil
    self.path = nil
  else
    uri = URI.parse(uri) unless uri.is_a?(URI)
    @uri = uri
    self.path = @uri.path
  end
end
# Opens the body as a readable stream.
#
# @yield [input] Processes the body through `input`.
# @yieldparam [StringIO] input A `StringIO` that wraps {#body}.
#
# @return [Object] The value returned by the block.
def open
  input = StringIO.new(body)
  yield(input)
end
# Returns the head of the body without consuming anything.
#
# @param [Integer] size The length of the head to be returned. It's
#   passed to `body[0, size]` as is.
#
# @return [String, nil] The first `size` part of {#body}, `nil` if
#   {#body} is `nil`.
def peek_body(size)
  content = body
  if content.nil?
    nil
  else
    content.slice(0, size)
  end
end
# Reads an attribute of the data. This is a shortcut of
# `attributes[name]`.
#
# @param [Object] name The name of the attribute.
#
# @return [Object] The value that {#attributes} returns for `name`.
def [](name)
  attributes = @attributes
  attributes[name]
end
# Writes an attribute of the data. This is a shortcut of
# `attributes[name] = value`.
#
# @param [Object] name The name of the attribute.
# @param [Object] value The new value of the attribute.
#
# @return [Object] The given `value`.
def []=(name, value)
  attributes = @attributes
  attributes[name] = value
end
# @return [String] The MIME type of the data. The explicitly set
#   MIME type is preferred. If no MIME type is set, the MIME type
#   is guessed from path and body.
# @return [nil] If MIME type isn't set and it can't be guessed from
#   path and body.
def mime_type
  explicit_mime_type = @mime_type
  return explicit_mime_type if explicit_mime_type

  guess_mime_type
end
# Sets the MIME type of the data explicitly.
#
# @param [String, nil] type The MIME type of the data. `nil` unsets
#   the MIME type. While the MIME type is unset, {#mime_type}
#   guesses it from path and body of the data.
def mime_type=(type)
  @mime_type = type
end
# @return [String] Normalized extension without the leading dot.
#   The normalized extension uses lower case like `pdf` not `PDF`.
#   `"html"` is returned for HTTP(S) URI whose path ends with `/`.
#   An empty string is returned if the path doesn't have extension.
# @return [nil] If {#uri} is `nil` or {#uri} doesn't have path
#   component.
def extension
  return nil if @uri.nil?

  uri_path = @uri.path
  # An opaque URI such as "mailto:..." has no path component.
  # File.extname raises TypeError for nil.
  return nil if uri_path.nil?

  if @uri.is_a?(URI::HTTP) && uri_path.end_with?("/")
    "html"
  else
    File.extname(uri_path).downcase.sub(/\A\./, "")
  end
end
# @return [Bool] true if MIME type is "text/XXX", false
#   otherwise. It's also false when MIME type isn't available.
def text?
  type = mime_type
  return false unless type

  type.start_with?("text/")
end
# @return [Bool] true only if MIME type is exactly "text/plain",
#   false otherwise.
def text_plain?
  current_mime_type = mime_type
  current_mime_type == "text/plain"
end
# @return [Bool] The value set by `need_screenshot=`. `true` means
#   that screenshot is needed if available.
def need_screenshot?
  @need_screenshot
end
# Converts the body to UTF-8 by `UTF8Converter`.
#
# @param [Integer, nil] max_body_size The max size of the body to be
#   converted. The body is read through {#open} up to this size.
#   The whole {#body} is used if it's `nil`.
#
# @return [Data] `self` if there is no body, or if `max_body_size`
#   is `nil` and the converter returns the same String object as
#   the body.
# @return [TextData] New text data that has the converted body and
#   uses `self` as the source data, otherwise.
def to_utf8_body_data(max_body_size: nil)
  raw_body = nil
  if max_body_size
    open { |input| raw_body = input.read(max_body_size) }
  else
    raw_body = body
  end
  return self if raw_body.nil?

  converted_body = UTF8Converter.new(raw_body).convert
  # Nothing was cut off and nothing was converted: new data isn't needed.
  unchanged = (max_body_size.nil? and raw_body.equal?(converted_body))
  return self if unchanged

  TextData.new(converted_body, source_data: self)
end
private
# Guesses the MIME type. The URI is tried first and the body is
# used only when the URI doesn't give any MIME type.
#
# @return [String, nil] The guessed MIME type, the result of the
#   body based guess if the URI based guess fails.
def guess_mime_type
  mime_type_from_uri = guess_mime_type_from_uri
  return mime_type_from_uri if mime_type_from_uri

  guess_mime_type_from_body
end
# Looks up the MIME type registry by the extension of the data.
#
# @return [Object] The value that `MIMEType.registry.find` returns
#   for {#extension}.
def guess_mime_type_from_uri
  registry = MIMEType.registry
  registry.find(extension)
end
# Guesses the MIME type from the head of the body.
#
# The head of the body is interpreted as UTF-8. The body is treated
# as plain text when the head is valid UTF-8 and NULL characters are
# at most 1% of its bytes.
#
# @return [String] `"text/plain"` if the head of the body looks
#   like text.
# @return [nil] If there is no body or the head of the body doesn't
#   look like text.
def guess_mime_type_from_body
  guessed_mime_type = nil
  peek_size = 1024
  chunk = peek_body(peek_size)
  change_encoding(chunk, "UTF-8") do |utf8_chunk|
    unless utf8_chunk.valid_encoding?
      # A chunk that fills the peek size may end in the middle of a
      # multi-byte character (for example, when the body is binary
      # encoded and is cut by bytes). Such a cut doesn't mean that
      # the body is invalid, so ignore the incomplete last character.
      return nil if utf8_chunk.bytesize < peek_size
      utf8_chunk = drop_incomplete_utf8_tail(utf8_chunk)
      return nil unless utf8_chunk.valid_encoding?
    end
    n_null_characters = utf8_chunk.count("\u0000")
    return nil if n_null_characters > (utf8_chunk.bytesize * 0.01)
    guessed_mime_type = "text/plain"
  end
  guessed_mime_type
end

# Removes a truncated multi-byte UTF-8 sequence from the end of string.
#
# @param [String] string The target string.
#
# @return [String] New string without the truncated last sequence,
#   `string` itself if it doesn't end with a truncated sequence.
def drop_incomplete_utf8_tail(string)
  size = string.bytesize
  1.upto([3, size].min) do |n_tail_bytes|
    byte = string.getbyte(size - n_tail_bytes)
    # 0b10xxxxxx is a continuation byte: the lead byte is on the left.
    next if (byte & 0xC0) == 0x80

    expected_size =
      if byte >= 0xF0
        4
      elsif byte >= 0xE0
        3
      elsif byte >= 0xC0
        2
      else
        1
      end
    # The sequence is truncated only when it needs more bytes than
    # the bytes that are available at the end.
    return string.byteslice(0, size - n_tail_bytes) if expected_size > n_tail_bytes
    break
  end
  string
end
# Runs the block while `string` is temporarily labeled as `encoding`.
#
# The encoding label of `string` is changed in place by
# `force_encoding` and is restored after the block, even if the
# block raises an exception or exits early.
#
# @param [String, nil] string The string to be relabeled. The block
#   isn't called if it's `nil`.
# @param [Encoding, String] encoding The temporary encoding.
#
# @yieldparam [String] string The same String object as `string`.
#
# @return [Object, nil] The value of the block, `nil` if `string` is
#   `nil`.
def change_encoding(string, encoding)
  unless string.nil?
    begin
      source_encoding = string.encoding
      string.force_encoding(encoding)
      yield(string)
    ensure
      string.force_encoding(source_encoding)
    end
  end
end
end
end