/
PDPage.jl
338 lines (292 loc) · 8.67 KB
/
PDPage.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
export PDPage,
pdPageGetContents,
pdPageIsEmpty,
pdPageGetCosObject,
pdPageGetContentObjects,
pdPageGetFonts,
pdPageGetMediaBox,
pdPageGetCropBox,
pdPageExtractText,
pdPageGetPageNumber,
pdPageEvalContent
using ..Cos
"""
```
pdPageGetCosObject(page::PDPage) -> CosObject
```
The PDF document format is organized in two layers: the logical document
information is represented on top of a physical file structure called COS. This method
returns the internal COS object associated with the page object.
"""
function pdPageGetCosObject(page::PDPage)
    # The COS-level object backing the page is kept in the `cospage` field.
    return page.cospage
end
"""
```
pdPageGetContents(page::PDPage) -> CosObject
```
Page rendering objects are normally stored in a `CosStream` object in a PDF file.
This method provides access to the stream object.
Please refer to the PDF specification for further details.
# Example
```
julia> pdPageGetContents(page)
448 0 obj
<<
/Length 437
/FFilter /FlateDecode
/F (/tmp/tmpZnGGFn/tmp5J60vr)
>>
stream
...
endstream
endobj
```
"""
function pdPageGetContents(page::PDPage)
    # The contents are resolved lazily and cached in `page.contents`; `CosNull`
    # in that field means "not resolved yet".
    if page.contents === CosNull
        ref = get_page_content_ref(page)
        # A page without a /Contents entry yields `CosNull` here. Return it
        # directly rather than dispatching `get_page_contents` on `CosNull`, so
        # that empty pages produce `CosNull` instead of a `MethodError`.
        ref === CosNull && return CosNull
        page.contents = get_page_contents(page, ref)
    end
    return page.contents
end
"""
```
pdPageGetMediaBox(page::PDPage) -> CDRect{Float32}
pdPageGetCropBox(page::PDPage) -> CDRect{Float32}
```
Returns the media box or the crop box associated with the page. See 14.11.2 PDF 1.7 Spec.
The media box is typically the designated size of the paper for the page. When a crop box
is not defined, it defaults to the media box.
# Example
```
julia> pdPageGetMediaBox(page)
Rect:[0.0 0.0 595.0 792.0]
julia> pdPageGetCropBox(page)
Rect:[0.0 0.0 595.0 792.0]
```
"""
function pdPageGetMediaBox(page::PDPage)
    # The lookup walks from the page up through its /Parent chain; the type
    # assertion fails if no `CosArray` was found.
    mediabox = page_find_attribute(page, cn"MediaBox")::CosArray
    rect = CDRect{Float32}(CDRect(mediabox))
    return rect::CDRect{Float32}
end
function pdPageGetCropBox(page::PDPage)
    cropbox = page_find_attribute(page, cn"CropBox")
    # No /CropBox on the page or any ancestor: fall back to the media box.
    if cropbox === CosNull
        return pdPageGetMediaBox(page)
    end
    return CDRect{Float32}(CDRect(cropbox))::CDRect{Float32}
end
"""
```
pdPageIsEmpty(page::PDPage) -> Bool
```
Returns `true` when the page has no associated content object.
# Example
```
julia> pdPageIsEmpty(page)
false
```
"""
function pdPageIsEmpty(page::PDPage)
    # Already-resolved contents mean the page cannot be empty.
    page.contents === CosNull || return false
    # Otherwise the page is empty exactly when it has no /Contents entry.
    return get_page_content_ref(page) === CosNull
end
"""
```
pdPageGetContentObjects(page::PDPage) -> PDPageObjectGroup
```
Returns the content objects of the page as a `PDPageObjectGroup`. The objects are
loaded from the page contents on first access and cached on the page afterwards.
"""
function pdPageGetContentObjects(page::PDPage)
    # `nothing` marks content objects that have not been loaded yet; loading
    # fills `page.content_objects` as a side effect.
    if page.content_objects === nothing
        load_page_objects(page)
    end
    return page.content_objects
end
"""
```
pdPageGetFonts(page::PDPage) -> Dict{CosName, PDFont}
```
Returns a dictionary of fonts in the page.
# Example
```
julia> pdPageGetFonts(page)
Dict{CosName,PDFIO.PD.PDFont} with 4 entries:
/F0 => PDFont(…
/F4 => PDFont(…
/F8 => PDFont(…
/F9 => PDFont(…
```
"""
function pdPageGetFonts(page::PDPage)
    dres = Dict{CosName, PDFont}()
    # Passing `CosNull` as the name asks `find_resource` for all /Font entries.
    cosfonts = find_resource(page, cn"Font", CosNull)
    # `find_resource` returns `CosNull` when neither the page nor any ancestor
    # has a /Font resource dictionary. Such a page has no fonts, so return the
    # empty dictionary instead of failing on `CosNull.val`.
    cosfonts === CosNull && return dres
    for (name, val) in cosfonts.val
        dres[name] = PDFont(page.doc, val)
    end
    return dres
end
"""
```
    pdPageEvalContent(page::PDPage, state::GState=GState{:PDFIO}()) -> GState
```
Evaluates the content objects of `page` against the graphics state `state` and
returns that state. The page is recorded in the state under the `:source` key before
evaluation begins.
"""
function pdPageEvalContent(page::PDPage, state::GState=GState{:PDFIO}())
    state[:source] = page
    objects = pdPageGetContentObjects(page)
    evalContent!(objects, state)
    return state
end
"""
```
pdPageExtractText(io::IO, page::PDPage) -> IO
```
Extracts the text from the `page`. This extraction works best for tagged PDF
files.
For PDFs not tagged, some line and word breaks will not be extracted properly.
# Example
Following code will extract the text from a full PDF file.
```
function getPDFText(src, out)
doc = pdDocOpen(src)
docinfo = pdDocGetInfo(doc)
open(out, "w") do io
npage = pdDocGetPageCount(doc)
for i=1:npage
page = pdDocGetPage(doc, i)
pdPageExtractText(io, page)
end
end
pdDocClose(doc)
return docinfo
end
```
"""
function pdPageExtractText(io::IO, page::PDPage)
    # Evaluate the page content with a default graphics state, then write the
    # resulting text layout to `io`.
    show_text_layout!(io, pdPageEvalContent(page))
    return io
end
"""
```
pdPageGetPageNumber(page::PDPage)
```
Returns the page number of the document page.
# Example
```
julia> pdPageGetPageNumber(page)
1
```
"""
function pdPageGetPageNumber(page::PDPage)
    # The document looks the page up by the indirect reference of its COS object.
    pageref = CosIndirectObjectRef(page.cospage)
    return pd_doc_get_pagenum(page.doc, pageref)
end
"""
Concrete `PDPage` implementation. It is mutable because the contents, the content
objects and the font / XObject caches are filled in lazily after construction.
"""
mutable struct PDPageImpl <: PDPage
    # Document the page belongs to.
    doc::PDDocImpl
    # COS dictionary of the page.
    cospage::ID{CosDict}
    # Resolved page contents; `CosNull` until they have been resolved.
    contents::CosObject
    # Loaded content objects; `nothing` until they have been loaded.
    content_objects::Union{Nothing, PDPageObjectGroup}
    # Per-page caches keyed by resource name.
    fonts::Dict{CosName, PDFont}
    xobjs::Dict{CosName, PDXObject}

    # The content objects and both caches always start out empty.
    function PDPageImpl(doc, cospage, contents)
        return new(doc, cospage, contents, nothing,
                   Dict{CosName,PDFont}(), Dict{CosName,PDXObject}())
    end
end
# Convenience constructor: a page whose contents have not been resolved yet.
function PDPageImpl(doc::PDDocImpl, cospage::ID{CosDict})
    return PDPageImpl(doc, cospage, CosNull)
end
#=
Non-exported factory for page objects. `PDPage` may need further attributes that
would make the constructor complex; this entry point builds a page with every
attribute at its default value.
=#
function create_pdpage(doc::PDDocImpl, cospage::ID{CosDict})
    return PDPageImpl(doc, cospage)
end
# A page cannot be created from a null COS object: always throws an
# `ErrorException` carrying `E_INVALID_OBJECT`. (The exception type was
# previously misspelled, which raised an `UndefVarError` instead.)
create_pdpage(doc::PDDocImpl, cospage::CosNullType) =
    throw(ErrorException(E_INVALID_OBJECT))
#=
Returns the raw /Contents entry of the page dictionary. This is either a CosArray
of references or a reference to a stream, and still has to be converted to an
actual stream object.
=#
function get_page_content_ref(page::PDPageImpl)
    return get(page.cospage, cn"Contents")
end
function get_page_contents(page::PDPageImpl, contents::CosArray)
    count = length(contents)
    entries = get(contents)
    # Rotate through the original entries exactly once, in place: each entry is
    # taken from the front, resolved, and appended to the back unless it
    # resolved to `CosNull`. Afterwards `entries` holds only resolved objects,
    # in their original order.
    for _ in 1:count
        entry = splice!(entries, 1)
        resolved = get_page_contents(page, entry)
        if resolved !== CosNull
            push!(entries, resolved)
        end
    end
    # Combine the resolved entries of `contents` into a single stream.
    return merge_streams(page.doc.cosDoc, contents)
end
# An indirect reference is resolved through the COS document of the page.
function get_page_contents(page::PDPageImpl, contents::CosIndirectObjectRef)
    return cosDocGetObject(page.doc.cosDoc, contents)
end
# A stream object is already resolved and is returned unchanged.
function get_page_contents(page::PDPage, obj::IDD{CosStream})
    return obj
end
# Entry point for loading the content objects of a page: resolves the contents,
# makes sure an object group exists, then dispatches on the kind of contents.
@inline function load_page_objects(page::PDPageImpl)
    contents = pdPageGetContents(page)
    if page.content_objects === nothing
        page.content_objects = PDPageObjectGroup()
    end
    return load_page_objects(page, contents)
end
# No contents: there is nothing to load.
function load_page_objects(page::PDPageImpl, stm::CosNullType)
    return nothing
end
# Loads the content objects from a single stream into `page.content_objects`.
@inline function load_page_objects(page::PDPageImpl, stm::IDD{CosStream})
    decoded = decode(stm)
    try
        load_objects(page.content_objects, decoded)
    finally
        # Close the decoded stream even when loading throws.
        util_close(decoded)
    end
    return nothing
end
# An array of content streams is merged into one stream first; the merged
# stream replaces `page.contents` and is then loaded like a single stream.
@inline function load_page_objects(page::PDPageImpl, stms::IDD{CosArray})
    merged = merge_streams(page.doc.cosDoc, stms)
    page.contents = merged
    return load_page_objects(page, merged)
end
#=
Looks up resources of category `restype` (e.g. /Font, /XObject), starting at the
page and walking up the /Parent chain.

- With a `CosName` as `fontname`, the walk stops at the first node whose resource
  category yields a non-null entry for that name, and that entry is returned.
- With `CosNull` as `fontname`, every node on the chain is visited and the entries
  found are collected into one `CosDict`.

Returns `CosNull` when nothing was found.
=#
function find_resource(page::PDPageImpl,
                       restype::CosName,
                       fontname::Union{CosName, CosNullType})
    cosdoc = page.doc.cosDoc
    node = page.cospage
    found = CosNull
    collectall = fontname === CosNull
    # A named lookup stops as soon as something was found; collecting everything
    # always continues up to the root.
    while node !== CosNull && (collectall || found === CosNull)
        resref = get(node, cn"Resources")
        resources = resref === CosNull ? CosNull : cosDocGetObject(cosdoc, resref)
        category = resources === CosNull ? CosNull :
                   cosDocGetObject(cosdoc, resources, restype)
        if category !== CosNull
            if collectall
                # NOTE(review): the lookup key here is `fontname`, i.e. `CosNull`;
                # confirm `cosDocGetObject` returns the whole category dictionary
                # for that key.
                entries = cosDocGetObject(cosdoc, category, fontname)
                found === CosNull && (found = CosDict())
                # NOTE(review): ancestors are written after the page's own
                # entries; which one wins on a name clash depends on `set!` -
                # confirm this is the intended precedence.
                for (k, v) in entries.val
                    set!(found, k, v)
                end
            else
                found = cosDocGetObject(cosdoc, category, fontname)
            end
        end
        node = cosDocGetObject(cosdoc, node, cn"Parent")
    end
    return found
end
# Returns the font registered under `fontname` for this page, using
# `page.fonts` as a cache.
#
# The do-block form of `get!` only runs the lookup on a cache miss. The
# three-argument form evaluated its default eagerly, so every call walked the
# page tree and called `get_pd_font!` even when the font was already cached.
function get_font(page::PDPageImpl, fontname::CosName)
    return get!(page.fonts, fontname) do
        get_pd_font!(page.doc, find_resource(page, cn"Font", fontname))
    end
end
# Returns the XObject registered under `xobjname` for this page, using
# `page.xobjs` as a cache.
#
# The do-block form of `get!` only runs the lookup on a cache miss. The
# three-argument form evaluated its default eagerly, so every call walked the
# page tree and called `get_pd_xobject!` even when the object was already cached.
function get_xobject(page::PDPageImpl, xobjname::CosName)
    return get!(page.xobjs, xobjname) do
        get_pd_xobject!(page.doc, find_resource(page, cn"XObject", xobjname))
    end
end
# Finds the attribute `resname` on the page or, failing that, on the nearest
# ancestor reached through /Parent. Returns `CosNull` when no node on the chain
# defines it.
function page_find_attribute(page::PDPageImpl, resname::CosName)
    cosdoc = page.doc.cosDoc
    node = page.cospage
    while node !== CosNull
        value = cosDocGetObject(cosdoc, node, resname)
        value !== CosNull && return value
        node = cosDocGetObject(cosdoc, node, cn"Parent")
    end
    return CosNull
end
# Without a font name the string is converted directly with `CDTextString`.
function get_encoded_string(s::CosString, fontname::CosNullType, page::PDPage)
    return CDTextString(s)
end
# With a font name, the font is taken from the page's font cache (`nothing`
# when it is not cached) and the conversion is delegated to the two-argument
# method.
function get_encoded_string(s::CosString, fontname::CosName, page::PDPage)
    font = get(page.fonts, fontname, nothing)
    return get_encoded_string(s, font)
end