-
Notifications
You must be signed in to change notification settings - Fork 0
/
codes.go
188 lines (166 loc) · 4.8 KB
/
codes.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
// seehuhn.de/go/pdf - a library for reading and writing PDF files
// Copyright (C) 2023 Jochen Voss <voss@seehuhn.de>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package charcode
import (
"seehuhn.de/go/pdf"
)
// CharCode represents a character code within a [CodeSpaceRange] as a
// non-negative integer.
//
// [CodeSpaceRange.Append] and [CodeSpaceRange.Decode] number the codes
// consecutively across the ranges of the code space, in the order in which
// the ranges are listed: the codes of the first range come first, followed by
// the codes of the second range, and so on.  Within a range, the last byte of
// the code varies fastest.
//
// TODO(voss): remove
type CharCode int
// CodeSpaceRange describes the ranges of byte sequences which are valid
// character codes for a given encoding.
//
// Each element is a [Range]; the code length of a range is the length of its
// Low slice, and different ranges may use different code lengths.  The order
// of the ranges matters for the numbering of codes used by
// [CodeSpaceRange.Append] and [CodeSpaceRange.Decode].
type CodeSpaceRange []Range
// AllCodes returns an iterator over all character codes in the given PDF string.
//
// The iterator splits s into consecutive codes using the code space ranges
// in c.  For every code it calls yield with the bytes of the code (a
// sub-slice of s, not a copy) and a flag indicating whether the code is valid
// for the code space.  Iteration stops early when yield returns false; in
// this case the iterator returns false.  Otherwise the iterator returns true
// once all of s has been consumed.
//
// The iterator does not modify any captured state, so it can be invoked
// repeatedly; every invocation starts again at the beginning of s.
func (c CodeSpaceRange) AllCodes(s pdf.String) func(yield func(code pdf.String, valid bool) bool) bool {
	return func(yield func(pdf.String, bool) bool) bool {
		// Work on a local view of the string, so that the captured
		// argument stays intact for later invocations of the iterator.
		rest := s
		for len(rest) > 0 {
			k, valid := c.firstCode(rest)
			if !yield(rest[:k], valid) {
				return false
			}
			rest = rest[k:]
		}
		return true
	}
}
// firstCode returns the length of the first character code from the given PDF string,
// together with a boolean indicating whether the code was valid.
//
// If s is empty, the result is (0, false).  Otherwise the returned length is
// at least 1 and at most len(s), so that callers which repeatedly strip the
// first code from a string always make progress.
//
// A code is valid if its bytes lie within one of the ranges in c.  For
// invalid codes, the returned length is the code length of the shortest range
// among those which matched the longest prefix of s, capped at len(s).
// Ranges with zero-length codes can never match and are ignored.  If c
// contains no ranges, all of s is reported as one invalid code.
//
// See the algorithm from section 9.7.6.3 of the PDF-2.0 spec.
func (c CodeSpaceRange) firstCode(s pdf.String) (int, bool) {
	if len(s) == 0 {
		return 0, false
	}

	// candidates holds the indices of all ranges which are compatible with
	// the bytes of s inspected so far.
	candidates := make([]int, len(c))
	for j := range candidates {
		candidates[j] = j
	}

	var skipLen int
	for i := 0; i < len(s); i++ {
		// The skip length is recomputed for every byte position, so that
		// only ranges which matched the first i bytes contribute.
		skipLen = len(s)
		b := s[i]
		for j := 0; j < len(candidates); {
			r := c[candidates[j]]
			n := len(r.Low)
			// Zero-length ranges must not contribute: a skip length of 0
			// would prevent callers from ever advancing through s.
			if n > 0 && n < skipLen {
				skipLen = n
			}
			switch {
			case n <= i || b < r.Low[i] || b > r.High[i]:
				// The range does not match byte i: remove it from the
				// candidates by swapping in the last element.
				last := len(candidates) - 1
				candidates[j] = candidates[last]
				candidates = candidates[:last]
			case n == i+1:
				// All bytes of the range have been matched.
				return i + 1, true
			default:
				j++
			}
		}
		if len(candidates) == 0 {
			break
		}
	}
	return skipLen, false
}
// Append appends the given character code to the given PDF string.
//
// Codes are numbered consecutively across the ranges of c, in the order in
// which the ranges are listed; within a range, the last byte varies fastest.
// The method returns the extended string.  If code does not correspond to any
// code in the code space (because it is negative or too large), s is
// returned unchanged.
func (c CodeSpaceRange) Append(s pdf.String, code CharCode) pdf.String {
	// A negative code would yield negative remainders below, and thus bytes
	// outside the [Low, High] interval of the range.
	if code < 0 {
		return s
	}

	for _, r := range c {
		numCodes := r.numCodes()
		if code >= numCodes {
			// The code belongs to one of the later ranges.
			code -= numCodes
			continue
		}

		// Reserve space for the new code, then fill in the bytes from the
		// last (fastest varying) to the first.
		n := len(s)
		for range r.Low {
			s = append(s, 0)
		}
		for i := len(r.Low) - 1; i >= 0; i-- {
			k := CharCode(r.High[i]) - CharCode(r.Low[i]) + 1
			s[n+i] = r.Low[i] + byte(code%k)
			code /= k
		}
		return s
	}
	return s
}
// Decode decodes the first character code from the given PDF string.
// It returns the character code and the number of bytes consumed.
//
// The ranges of c are tried in order, and the first range whose codes match
// the start of s determines the result.  The returned code is the position
// of the matched code within its range (first byte most significant), plus
// the total number of codes in all preceding ranges.
//
// If the character code cannot be decoded, a code of -1 is returned,
// and the length is either 0 (if the string is empty) or 1.
//
// TODO(voss): Remove? Otherwise, fix the number of bytes consumed
// for invalid codes as described in the PDF spec.
func (c CodeSpaceRange) Decode(s pdf.String) (CharCode, int) {
	// offset counts the codes of all ranges which have been rejected so far.
	var offset CharCode
	for _, r := range c {
		width := len(r.Low)

		var code CharCode
		matched := len(s) >= width
		for i := 0; matched && i < width; i++ {
			b := s[i]
			if b < r.Low[i] || b > r.High[i] {
				matched = false
				break
			}
			span := CharCode(r.High[i]) - CharCode(r.Low[i]) + 1
			code = code*span + CharCode(b-r.Low[i])
		}
		if matched {
			return offset + code, width
		}

		offset += r.numCodes()
	}

	if len(s) > 0 {
		return -1, 1
	}
	return -1, 0
}
// Range represents a range of character codes.
//
// A byte sequence belongs to the range, if every byte lies between the
// corresponding bytes of Low and High.  Both bounds are inclusive, i.e. the
// character codes Low and High are themselves part of the range.
// Low and High must have the same length.
type Range struct {
	// Low holds the smallest admissible value for each byte of a code.
	Low []byte

	// High holds the largest admissible value for each byte of a code.
	High []byte
}
// numCodes returns the number of character codes in the range.
// This is the product, over all byte positions, of the number of byte
// values between Low and High (inclusive) at that position.
// A range with zero-length codes yields 1.
func (r Range) numCodes() CharCode {
	total := CharCode(1)
	for i := range r.Low {
		span := r.High[i] - r.Low[i]
		total *= CharCode(span) + 1
	}
	return total
}
// Matches returns true, if the PDF string starts with a character code
// in the given range.  Strings which are shorter than the codes of the
// range never match.
func (r Range) Matches(s pdf.String) bool {
	width := len(r.Low)
	if width > len(s) {
		return false
	}
	for i := 0; i < width; i++ {
		if b := s[i]; b < r.Low[i] || b > r.High[i] {
			return false
		}
	}
	return true
}
// Simple represents the code space range for a simple font.
// Character codes are one byte long, and correspond directly to
// the bytes in the PDF string.
var Simple = CodeSpaceRange{
	{
		Low:  []byte{0x00},
		High: []byte{0xFF},
	},
}
// UCS2 represents a two-byte encoding.
// Character codes are two bytes long, and are stored in big-endian order.
var UCS2 = CodeSpaceRange{
	{
		Low:  []byte{0x00, 0x00},
		High: []byte{0xFF, 0xFF},
	},
}