This repository has been archived by the owner on Dec 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
decompressor.go
287 lines (225 loc) · 9.12 KB
/
decompressor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
package doboz
import "encoding/binary"
type CompressionInfo struct {
UncompressedSize uint64
CompressedSize uint64
Version int
}
type LookupTable struct {
mask uint // the mask for the entire encoded match
offsetShift byte
lengthMask byte
lengthShift byte
size int8 // the size of the encoded match in bytes
}
type Decompressor struct {
literalRunLengthTable []int8
lut []LookupTable
}
func (d *Decompressor) initialize() {
d.literalRunLengthTable = []int8{4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}
d.lut = []LookupTable{
{mask: 0xff, offsetShift: 2, lengthMask: 0, lengthShift: 0, size: 1}, // (0)00
{mask: 0xffff, offsetShift: 2, lengthMask: 0, lengthShift: 0, size: 2}, // (0)01
{mask: 0xffff, offsetShift: 6, lengthMask: 15, lengthShift: 2, size: 2}, // (0)10
{mask: 0xffffff, offsetShift: 8, lengthMask: 31, lengthShift: 3, size: 3}, // (0)11
{mask: 0xff, offsetShift: 2, lengthMask: 0, lengthShift: 0, size: 1}, // (1)00 = (0)00
{mask: 0xffff, offsetShift: 2, lengthMask: 0, lengthShift: 0, size: 2}, // (1)01 = (0)01
{mask: 0xffff, offsetShift: 6, lengthMask: 15, lengthShift: 2, size: 2}, // (1)10 = (0)10
{mask: 0xffffffff, offsetShift: 11, lengthMask: 255, lengthShift: 3, size: 4}, // 111
}
}
// Decompresses a block of data
// The source and destination buffers must not overlap
// This operation is memory safe
// On success, returns RESULT_OK
func (d *Decompressor) Decompress(source []byte, destination []byte) Result {
d.initialize()
inputBuffer := source
inputIterator := 0
outputBuffer := destination
outputIterator := 0
// Decode the header
decodeHeaderResult, header, headerSize := d.decodeHeader(source)
if decodeHeaderResult != RESULT_OK {
return decodeHeaderResult
}
inputIterator += headerSize
if header.Version != VERSION {
return RESULT_ERROR_UNSUPPORTED_VERSION
}
// Check whether the supplied buffers are large enough
if uint64(len(source)) < header.CompressedSize || uint64(len(destination)) < header.UncompressedSize {
return RESULT_ERROR_BUFFER_TOO_SMALL
}
uncompressedSize := int(header.UncompressedSize)
// If the data is simply stored, copy it to the destination buffer and we're done
if header.IsStored {
copy(outputBuffer[:uncompressedSize], inputBuffer[inputIterator:])
return RESULT_OK
}
inputEnd := int(header.CompressedSize)
outputEnd := uncompressedSize
// Compute pointer to the first byte of the output 'tail'
// Fast write operations can be used only before the tail, because those may write beyond the end of the output buffer
outputTail := 0
if uncompressedSize > TAIL_LENGTH {
outputTail = outputEnd - TAIL_LENGTH
}
// Initialize the control word to 'empty'
controlWord := uint(1)
// Decoding loop
for {
// Check whether there is enough data left in the input buffer
// In order to decode the next literal/match, we have to read up to 8 bytes (2 words)
// Thanks to the trailing dummy, there must be at least 8 remaining input bytes
if inputIterator+2*WORD_SIZE > inputEnd {
return RESULT_ERROR_CORRUPTED_DATA
}
// Check whether we must read a control word
if controlWord == 1 {
controlWord = FastRead(inputBuffer[inputIterator:], WORD_SIZE)
inputIterator += WORD_SIZE
}
// Detect whether it's a literal or a match
if (controlWord & 1) == 0 {
// It's a literal
// If we are before the tail, we can safely use fast writing operations
if outputIterator < outputTail {
// We copy literals in runs of up to 4 because it's faster than copying one by one
// Copy implicitly 4 literals regardless of the run length
FastWrite(outputBuffer[outputIterator:], FastRead(inputBuffer[inputIterator:], WORD_SIZE), WORD_SIZE)
// Get the run length using a lookup table
runLength := int(d.literalRunLengthTable[controlWord&0xf])
// Advance the inputBuffer and outputBuffer pointers with the run length
inputIterator += runLength
outputIterator += runLength
// Consume as much control word bits as the run length
controlWord >>= runLength
} else {
// We have reached the tail, we cannot output literals in runs anymore
// Output all remaining literals
for outputIterator < outputEnd {
// Check whether there is enough data left in the input buffer
// In order to decode the next literal, we have to read up to 5 bytes
if inputIterator+WORD_SIZE+1 > inputEnd {
return RESULT_ERROR_CORRUPTED_DATA
}
// Check whether we must read a control word
if controlWord == 1 {
controlWord = FastRead(inputBuffer[inputIterator:], WORD_SIZE)
inputIterator += WORD_SIZE
}
// Output one literal
// We cannot use fast read/write functions
outputBuffer[outputIterator] = inputBuffer[inputIterator] // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ++i vagy i++ ?
outputIterator++
inputIterator++
// Next control word bit
controlWord >>= 1
}
// Done
return RESULT_OK
}
} else {
// It's a match
// Decode the match
match, matchSize := d.decodeMatch(inputBuffer[inputIterator:])
inputIterator += matchSize
// Copy the matched string
// In order to achieve high performance, we copy characters in groups of machine words
// Overlapping matches require special care
matchString := outputIterator - match.Offset
// Check whether the match is out of range
if matchString < 0 || outputIterator+match.Length > outputTail {
return RESULT_ERROR_CORRUPTED_DATA
}
i := 0
if match.Offset < WORD_SIZE {
// The match offset is less than the word size
// In order to correctly handle the overlap, we have to copy the first three bytes one by one
for i < 3 {
FastWrite(outputBuffer[outputIterator+i:], FastRead(outputBuffer[matchString+i:], 1), 1) // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2. input v output?
i++
}
// With this trick, we increase the distance between the source and destination pointers
// This enables us to use fast copying for the rest of the match
matchString -= 2 + (match.Offset & 1)
}
// Fast copying
// There must be no overlap between the source and destination words
for ok := true; ok; ok = i < match.Length {
FastWrite(outputBuffer[outputIterator+i:], FastRead(outputBuffer[matchString+i:], WORD_SIZE), WORD_SIZE)
i += WORD_SIZE
}
outputIterator += match.Length
// Next control word bit
controlWord >>= 1
}
}
}
// Retrieves information about a compressed block of data
// This operation is memory safe
// On success, returns RESULT_OK and outputs the compression information
func (d *Decompressor) GetCompressionInfo(source []byte) (Result, CompressionInfo) {
var compressionInfo CompressionInfo
// Decode the header
decodeHeaderResult, header, _ := d.decodeHeader(source)
if decodeHeaderResult != RESULT_OK {
return decodeHeaderResult, compressionInfo
}
// Return the requested info
compressionInfo.UncompressedSize = header.UncompressedSize
compressionInfo.CompressedSize = header.CompressedSize
compressionInfo.Version = header.Version
return RESULT_OK, compressionInfo
}
// Decodes a match and returns its size in bytes
func (d *Decompressor) decodeMatch(source []byte) (Match, int) {
// Read the maximum number of bytes a match is coded in (4)
word := FastRead(source, WORD_SIZE)
// Compute the decoding lookup table entry index: the lowest 3 bits of the encoded match
i := word & 7
// Compute the match offset and length using the lookup table entry
var match Match
match.Offset = (int)((word & d.lut[i].mask) >> d.lut[i].offsetShift)
match.Length = (int)(((word >> uint(d.lut[i].lengthShift)) & uint(d.lut[i].lengthMask)) + MIN_MATCH_LENGTH)
return match, int(d.lut[i].size)
}
// Decodes a header and returns its size in bytes
// If the header is not valid, the function returns 0
func (d *Decompressor) decodeHeader(source []byte) (Result, Header, int) {
var header Header
// Decode the attribute bytes
if len(source) < 1 {
return RESULT_ERROR_BUFFER_TOO_SMALL, header, 0
}
attributes := uint(source[0])
source = source[1:]
header.Version = int(attributes & 7)
sizeCodedSize := int((attributes>>3)&7) + 1
// Compute the size of the header
headerSize := 1 + 2*sizeCodedSize
if len(source) < headerSize {
return RESULT_ERROR_BUFFER_TOO_SMALL, header, headerSize
}
header.IsStored = (attributes & 128) != 0
// Decode the uncompressed and compressed sizes
switch sizeCodedSize {
case 1:
header.UncompressedSize = uint64(source[0])
header.CompressedSize = uint64(source[sizeCodedSize])
case 2:
header.UncompressedSize = uint64(binary.LittleEndian.Uint16(source))
header.CompressedSize = uint64(binary.LittleEndian.Uint16(source[sizeCodedSize:]))
case 4:
header.UncompressedSize = uint64(binary.LittleEndian.Uint32(source))
header.CompressedSize = uint64(binary.LittleEndian.Uint32(source[sizeCodedSize:]))
case 8:
header.UncompressedSize = binary.LittleEndian.Uint64(source)
header.CompressedSize = binary.LittleEndian.Uint64(source[sizeCodedSize:])
default:
return RESULT_ERROR_CORRUPTED_DATA, header, headerSize
}
return RESULT_OK, header, headerSize
}