/
par2file.py
257 lines (225 loc) · 10.7 KB
/
par2file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/python3 -OO
# Copyright 2007-2024 by The SABnzbd-Team (sabnzbd.org)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
sabnzbd.par2file - All par2-related functionality
"""
import hashlib
import logging
import os
import re
import struct
import sabctools
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
from sabnzbd.constants import MEBI
from sabnzbd.encoding import correct_unknown_encoding
from sabnzbd.filesystem import get_basename
# Matches "<setname>.volXX+YY.par2" (also tolerates "-" as separator);
# groups: (setname, volume-number, block-count)
PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I)

# Above this file size, stop scanning once all file-listings are found (see parse_par2_file)
SCAN_LIMIT = 10 * MEBI

# Magic byte signatures of the PAR2 packet header and packet types,
# as defined by the par2 specification
PAR_PKT_ID = b"PAR2\x00PKT"
PAR_MAIN_ID = b"PAR 2.0\x00Main\x00\x00\x00\x00"
PAR_FILE_ID = b"PAR 2.0\x00FileDesc"
PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00"
PAR_SLICE_ID = b"PAR 2.0\x00IFSC\x00\x00\x00\x00"
PAR_RECOVERY_ID = b"RecvSlic"
@dataclass
class FilePar2Info:
    """Class for keeping track of par2 information of a file"""

    # Filename as stored in the FileDesc packet (decoded to str)
    filename: str
    # MD5 of the first 16k of the file, from the FileDesc packet
    hash16k: bytes
    # Full file size in bytes, from the FileDesc packet
    filesize: int
    # CRC32 of the whole file, combined from the IFSC slice checksums;
    # None until parse_par2_file has processed the slice data
    filehash: Optional[int] = None
    # True when another file in the set shares the same 16k-hash,
    # so the 16k signature can't be trusted for renaming
    has_duplicate: bool = False
def is_parfile(filename: str) -> bool:
    """Check quickly whether file has par2 signature
    or if the filename has '.par2' in it

    An existing file is checked for the PAR2 magic bytes; a
    non-existing path falls back to the name-based check.
    Unreadable files are treated as non-par2.
    """
    if os.path.exists(filename):
        try:
            with open(filename, "rb") as f:
                buf = f.read(8)
            return buf.startswith(PAR_PKT_ID)
        # Only catch I/O problems; the old bare "except:" also
        # swallowed SystemExit/KeyboardInterrupt
        except OSError:
            pass
    elif ".par2" in filename.lower():
        return True
    return False
def analyse_par2(name: str, filepath: Optional[str] = None) -> Tuple[str, int, int]:
    """Check if file is a par2-file and determine vol/block
    return setname, vol, block
    setname is empty when not a par2 file
    """
    name = name.strip()
    vol = block = 0
    if m := PROBABLY_PAR2_RE.search(name):
        setname = m.group(1)
        # The regex groups are strings and may even be empty ("\d*");
        # convert so the result matches the annotated Tuple[str, int, int]
        vol = int(m.group(2) or 0)
        block = int(m.group(3) or 0)
    else:
        # Base-par2 file
        setname = get_basename(name).strip()
        # Could not parse the filename, need deep inspection
        # We already know it's a par2 from the is_parfile
        if filepath:
            try:
                # Quick loop to find number blocks
                # Assumes blocks are larger than 128 bytes
                # Worst case, we only count 1, still good
                with open(filepath, "rb") as f:
                    buf = f.read(128)
                    while buf:
                        if PAR_RECOVERY_ID in buf:
                            block += 1
                        buf = f.read(128)
            # Best-effort: an unreadable file just leaves block at 0
            except OSError:
                pass
    return setname, vol, block
def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Tuple[str, Dict[str, FilePar2Info]]:
"""Get the hash table and the first-16k hash table from a PAR2 file
Return as dictionary, indexed on names or hashes for the first-16 table
The input md5of16k is modified in place and thus not returned!
Note that par2 can and will appear in random order, so the code has to collect data first
before we process them!
For a full description of the par2 specification, visit:
http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html
"""
set_id = slice_size = coeff = nr_files = None
filepar2info = {}
filecrc32 = {}
table = {}
duplicates16k = []
try:
total_size = os.path.getsize(fname)
with open(fname, "rb") as f:
while header := f.read(8):
if header == PAR_PKT_ID:
# All packages start with a header before the body
# 8 : PAR2\x00PKT
# 8 : Length of the entire packet. Must be multiple of 4. (NB: Includes length of header.)
# 16 : MD5 Hash of packet.
# 16 : Recovery Set ID.
# 16 : Type of packet.
# ?*4 : Body of Packet. Must be a multiple of 4 bytes.
# Length must be multiple of 4 and at least 20
pack_len = struct.unpack("<Q", f.read(8))[0]
if int(pack_len / 4) * 4 != pack_len or pack_len < 20:
continue
# Next 16 bytes is md5sum of this packet
md5sum = f.read(16)
# Read and check the data
# Subtract 32 because we already read these bytes of the header
data = f.read(pack_len - 32)
if md5sum != hashlib.md5(data).digest():
continue
# See if it's any of the packages we care about
par2_packet_type = data[16:32]
# Get the Recovery Set ID
set_id = data[:16].hex()
if par2_packet_type == PAR_FILE_ID:
# The FileDesc packet looks like:
# 16 : "PAR 2.0\0FileDesc"
# 16 : FileId
# 16 : Hash for full file
# 16 : Hash for first 16K
# 8 : File length
# xx : Name (multiple of 4, padded with \0 if needed)
fileid = data[32:48].hex()
if filepar2info.get(fileid):
# Already have data
continue
hash16k = data[64:80]
filesize = struct.unpack("<Q", data[80:88])[0]
filename = correct_unknown_encoding(data[88:].strip(b"\0"))
filepar2info[fileid] = FilePar2Info(filename, hash16k, filesize)
elif par2_packet_type == PAR_CREATOR_ID:
# From here until the end is the creator-text
# Useful in case of bugs in the par2-creating software
# "PAR 2.0\x00Creator\x00"
par2creator = data[32:].strip(b"\0") # Remove any trailing \0
logging.debug(
"Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator)
)
elif par2_packet_type == PAR_MAIN_ID:
# The Main packet looks like:
# 16 : "PAR 2.0\0Main"
# 8 : Slice size
# 4 : Number of files in the recovery set
slice_size = struct.unpack("<Q", data[32:40])[0]
coeff = sabctools.crc32_xpow8n(slice_size)
nr_files = struct.unpack("<I", data[40:44])[0]
elif par2_packet_type == PAR_SLICE_ID:
# "PAR 2.0\0IFSC\0\0\0\0"
fileid = data[32:48].hex()
if not filecrc32.get(fileid):
filecrc32[fileid] = []
for i in range(48, pack_len - 32, 20):
filecrc32[fileid].append(struct.unpack("<I", data[i + 16 : i + 20])[0])
# On large files, we stop after seeing all the listings
# On smaller files, we scan them fully to get the par2-creator
if total_size > SCAN_LIMIT and len(filepar2info) == nr_files:
break
# Process all the data
for fileid in filepar2info.keys():
# Sanity check
par2info = filepar2info[fileid]
if not filecrc32.get(fileid) or not nr_files or not slice_size:
logging.debug("Missing essential information for %s", par2info)
continue
# Handle also cases where slice_size is exact match for filesize
# We currently don't have an unittest for that!
slices = par2info.filesize // slice_size
slice_nr = 0
crc32 = 0
while slice_nr < slices:
crc32 = sabctools.crc32_multiply(crc32, coeff) ^ filecrc32[fileid][slice_nr]
slice_nr += 1
if tail_size := par2info.filesize % slice_size:
crc32 = sabctools.crc32_combine(
crc32, sabctools.crc32_zero_unpad(filecrc32[fileid][-1], slice_size - tail_size), tail_size
)
par2info.filehash = crc32
# We found hash data, add it to final tabel
table[par2info.filename] = par2info
# Check for md5of16k duplicates
if par2info.hash16k not in md5of16k:
md5of16k[par2info.hash16k] = par2info.filename
elif md5of16k[par2info.hash16k] != par2info.filename:
# Not unique and not already linked to this file
# Mark and remove to avoid false-renames
duplicates16k.append(par2info.hash16k)
table[par2info.filename].has_duplicate = True
except:
logging.info("Par2 parser crashed in file %s", fname)
logging.debug("Traceback: ", exc_info=True)
table = {}
set_id = None
# Have to remove duplicates at the end to make sure
# no trace is left in case of multi-duplicates
for hash16k in duplicates16k:
if hash16k in md5of16k:
old_name = md5of16k.pop(hash16k)
logging.debug("Par2-16k signature of %s not unique, discarding", old_name)
# Sort table by filename
# This is necessary because of the rare case that a set contains duplicate files.
# The crc32 quick check loops over files in the set and considered if they match an NzbFile.
# For example in a set with the packets in the order 003, 004, 001, 002 with 002 and 003 being identical files:
# We would start with 003 and the first match would be 002, therefore rename 002 to 003 overwriting the also
# downloaded 003 file.
# Finally, we would process 002 and the first unverified path will be 003 so rename 003 back to 002.
# The end result is we would have moved a single file from 002 to 003 to 002 and end up missing 003.
table = {filename: table[filename] for filename in sorted(table.keys())}
return set_id, table