-
Notifications
You must be signed in to change notification settings - Fork 79
/
parse.go
205 lines (192 loc) · 5.95 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
package tarfs
import (
"archive/tar"
"bytes"
"errors"
"fmt"
"io"
"strconv"
)
// The values we should find in the "magic" position of the tar header.
//
// MagicPAX and magicGNU are compared against the 6-byte magic field only.
// MagicOldGNU is compared against the 8 bytes covering both the magic and the
// version fields, so it must be exactly 8 bytes long: "ustar", two spaces, and
// a NUL.
var (
	magicPAX    = []byte("ustar\x00")
	magicGNU    = []byte("ustar ")
	magicOldGNU = []byte("ustar  \x00") // NB: two spaces before the NUL.
)
// ErrFormat can be compared via [errors.Is] against errors reported by [New]
// to determine if the tar file is considered well-formed.
var ErrFormat = errors.New("tarfs: format error reading file")

// ParseError is a plain message that identifies itself as a format error.
type parseError string

// Error implements error by returning the message verbatim.
func (e parseError) Error() string {
	return string(e)
}

// Is reports true only when the target is ErrFormat, which is what makes
// errors.Is(err, ErrFormat) succeed for errors built by parseErr.
func (e parseError) Is(tgt error) bool {
	return tgt == ErrFormat
}

// ParseErr returns an error for which .Is reports true for ErrFormat.
//
// The message is rendered with fmt.Sprintf, so the `%w` verb does not work:
// nothing is wrapped, and only the formatted text is kept.
func parseErr(f string, v ...interface{}) error {
	msg := fmt.Sprintf(f, v...)
	return parseError(msg)
}
// FindSegments looks at a tar blockwise to establish where individual files and
// their headers are stored. Each returned segment describes a region that is
// not a complete tar file, but can have exactly one file read from it.
//
// The reader is walked one 512-byte block at a time, starting at offset 0.
// Every header block has its magic and version validated and its size field
// decoded so the file contents can be skipped without being read. Extension
// headers (PAX, GNU long name/link, GNU sparse) are folded into the segment of
// the entry that follows them. Entries with a typeflag that is not recognized
// are skipped and do not produce a segment.
//
// Scanning stops at the two-zero-block trailer, or at an EOF that falls exactly
// on a block boundary. Errors describing a malformed archive report true for
// errors.Is(err, ErrFormat); any other error from the reader is returned as-is.
func findSegments(r io.ReaderAt) ([]segment, error) {
	// Constants and offsets from POSIX.
	const (
		blockSz    = 512
		magicOff   = 257
		versionOff = 263
		typeflag   = 156
		sizeOff    = 124

		// Largest block number whose byte offset still fits in an int64.
		maxBlk = (1<<63 - 1) / blockSz
	)
	b := make([]byte, blockSz)
	var ret []segment
	// Start block of the current segment.
	var cur int64
	// Block number being read.
	var blk int64
	// Has the parser seen a zeroes block.
	var zeroes bool

Scan:
	for {
		off := blk * blockSz
		n, err := r.ReadAt(b, off)
		switch {
		case err == nil && n != blockSz:
			// Should be impossible with a well-formed archive, so raise an
			// error. Should also be impossible with a conforming [io.ReaderAt].
			return nil, parseErr("short read at offset: %d (got: %d, want: %d)", off, n, blockSz)
		case err == nil: // OK
		case errors.Is(err, io.EOF):
			switch {
			case n == 0:
				// Early EOF on a block boundary. Let it slide.
				//
				// Handle this case because some layers in the wild are a single
				// directory block, with no trailer.
				break Scan
			case n == blockSz:
				// Make sure to process the read, even if EOF was returned.
			default:
				return nil, parseErr("unexpected EOF at %d: %v", off, err)
			}
		default:
			return nil, err
		}

		magic := b[magicOff:][:6]
		zeroBlock := true
		for _, c := range b {
			if c != 0x00 {
				zeroBlock = false
				break
			}
		}
		switch {
		// Tar files end with two blocks of zeroes. These two arms track that.
		case !zeroes && zeroBlock:
			zeroes = true
			// Advance to the following block. Without this the same block
			// would be read again and always be mistaken for the second
			// trailer block.
			blk++
			continue
		case zeroes && zeroBlock:
			// Found a valid second zeroes block.
			break Scan
		case zeroes && !zeroBlock:
			// Found the first trailer block, but not the second.
			return nil, parseErr("bad block at %d: expected second trailer block", off)
		// These arms are belt-and-suspenders to make sure we're reading a
		// header block and not a contents block, somehow.
		case bytes.Equal(b[magicOff:][:8], magicOldGNU):
			// OldGNU madness. This arm matching means the headers aren't
			// actually POSIX conforming, but hopefully it's not an issue. Just
			// roll with it. USTAR was standardized in 1988; frankly, it's the
			// creator's fault if something doesn't work right because there's
			// some incompatibility.
		case !bytes.Equal(magic, magicPAX) && !bytes.Equal(magic, magicGNU):
			return nil, parseErr("bad block at %d: got magic %+q", off, magic)
		case !bytes.Equal(b[versionOff:][:2], []byte("00")):
			return nil, parseErr("bad block at %d: got version %+q", off, b[versionOff:][:2])
		}

		encSz := b[sizeOff:][:12]
		sz, err := parseNumber(encSz)
		if err != nil {
			return nil, parseErr("invalid number: %024x: %v", encSz, err)
		}
		// The binary encoding can express negative values. A negative size
		// would move the cursor backwards (or not at all) and loop forever.
		if sz < 0 {
			return nil, parseErr("bad block at %d: negative size %d", off, sz)
		}
		// Number of whole blocks occupied by the contents, rounding up.
		nBlk := sz / blockSz
		if sz%blockSz != 0 {
			nBlk++
		}
		// Refuse sizes that would push the next block's byte offset past what
		// an int64 can hold.
		if nBlk > maxBlk-blk-1 {
			return nil, parseErr("bad block at %d: size %d overflows archive offset", off, sz)
		}
		blk++       // Current header block
		blk += nBlk // File contents
		switch b[typeflag] {
		case tar.TypeXHeader, tar.TypeGNULongLink, tar.TypeGNULongName, tar.TypeGNUSparse:
			// All these are prepended to a "real" entry, so leave cur alone
			// and let the segment keep growing.
		case tar.TypeBlock, tar.TypeChar, tar.TypeCont, tar.TypeDir, tar.TypeFifo, tar.TypeLink, tar.TypeReg, tar.TypeRegA, tar.TypeSymlink:
			// Found a data block, emit it:
			ret = append(ret, segment{start: cur * blockSz, size: (blk - cur) * blockSz})
			fallthrough
		default:
			// any blocks not enumerated are not handled.
			cur = blk
		}
	}
	return ret, nil
}
// Segment describes one file in a tar, including relevant headers.
//
// Both fields are byte counts: start is the offset of the region from the
// beginning of the archive, and size is the length of the region.
type segment struct {
	start, size int64
}
// ParseNumber extracts a number from the encoded form in the tar header.
//
// Two encodings are understood. If the high bit of the first byte is clear (or
// the field is empty), the field is octal text, optionally padded with spaces
// and NULs; an all-padding field decodes as zero. If the high bit is set, the
// field is a big-endian two's-complement binary number.
//
// An error is returned if the text is not valid octal or if the value does not
// fit in an int64.
//
// This is based on the internal version in archive/tar.
func parseNumber(b []byte) (int64, error) {
	// Stringified form: anything that doesn't have the binary marker bit set.
	if len(b) == 0 || b[0]&0x80 == 0 {
		digits := bytes.Trim(b, " \x00")
		if len(digits) == 0 {
			return 0, nil
		}
		// Only positive int64 values allowed, hence 63 bits.
		v, err := strconv.ParseUint(cstring(digits), 8, 63)
		if err != nil {
			return 0, err
		}
		return int64(v), nil
	}

	// Binary form. See also: src/archive/tar/strconv.go
	//
	// Handling negative numbers relies on the following identity:
	//	-a-1 == ^a
	//
	// If the number is negative, every data byte is inverted and the result
	// is accumulated as an unsigned number, then inverted back at the end.
	negative := b[0]&0x40 != 0
	var mask byte // 0x00 if positive or zero, 0xff if negative
	if negative {
		mask = 0xff
	}

	var acc uint64
	for idx := range b {
		octet := b[idx] ^ mask // No-op unless the value is negative.
		if idx == 0 {
			octet &= 0x7f // Drop the binary-format marker bit.
		}
		// The top byte is already occupied; shifting would lose bits.
		if acc>>56 != 0 {
			return 0, errors.New("integer overflow")
		}
		acc = acc<<8 | uint64(octet)
	}
	// The magnitude must leave room for the int64 sign bit.
	if acc>>63 != 0 {
		return 0, errors.New("integer overflow")
	}
	if negative {
		return ^int64(acc), nil
	}
	return int64(acc), nil
}
// Cstring interprets the byte slice as a C string: the result is everything
// before the first NUL byte. If there is no NUL, it returns the entire slice
// as a string.
//
// The entire-slice behavior handles the case where a fixed size header field is
// fully populated.
func cstring(b []byte) string {
	end := bytes.IndexByte(b, 0)
	if end < 0 {
		end = len(b)
	}
	return string(b[:end])
}