-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for reading DPI information from JPEG2000 images #5568
Changes from 3 commits
7f275c1
5f4653d
ae54838
3ee5a9b
0c600f1
8828080
8045ecc
dab5721
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
# | ||
# History: | ||
# 2014-03-12 ajh Created | ||
# 2021-06-30 rogermb Extract dpi information from the 'resc' header box | ||
# | ||
# Copyright (c) 2014 Coriolis Systems Limited | ||
# Copyright (c) 2014 Alastair Houghton | ||
|
@@ -19,6 +20,79 @@ | |
from . import Image, ImageFile | ||
|
||
|
||
class BoxReader:
    """
    A small helper class to read fields stored in JPEG2000 header boxes
    and to easily step into and read sub-boxes.

    The reader walks a sequence of boxes in ``fp``. Each box starts with a
    header (length and 4-byte type) followed by its contents. Reads of box
    contents are bounds-checked against the current box, and reads of box
    headers are bounds-checked against ``length`` when it is known.
    """

    def __init__(self, fp, length=-1):
        """
        :param fp: Binary file-like object supporting ``read``, ``seek``
            and ``tell``.
        :param length: Total number of bytes available in ``fp``, or a
            negative value if the length is unknown.
        """
        self.fp = fp
        self.has_length = length >= 0
        self.length = length
        # Number of unread content bytes in the current box.
        # -1 is a sentinel meaning "not currently inside any box".
        self.remaining_in_box = -1

    def _can_read(self, num_bytes):
        """Return whether ``num_bytes`` can be read without crossing the
        current box boundary or, outside a box, the known total length."""
        if self.remaining_in_box >= 0:
            # Inside box contents: ensure read does not go past box boundaries
            return num_bytes <= self.remaining_in_box
        elif self.has_length:
            # Outside box: ensure we don't read past the known file length
            return self.fp.tell() + num_bytes <= self.length
        else:
            return True  # No length known, just read

    def _read_bytes(self, num_bytes):
        """
        Read exactly ``num_bytes`` bytes from the underlying file.

        :raises SyntaxError: If the read would cross the box/file boundary.
        :raises OSError: If the file yields fewer bytes than requested.
        """
        if not self._can_read(num_bytes):
            raise SyntaxError("Not enough data in header")

        data = self.fp.read(num_bytes)
        if len(data) < num_bytes:
            raise OSError(
                f"Expected to read {num_bytes} bytes but only got {len(data)}."
            )

        # Only account for the bytes when we are inside a box with data left
        if self.remaining_in_box > 0:
            self.remaining_in_box -= num_bytes
        return data

    def read_fields(self, field_format):
        """Read and unpack the fields described by the ``struct`` format
        string ``field_format``, returning them as a tuple."""
        size = struct.calcsize(field_format)
        data = self._read_bytes(size)
        return struct.unpack(field_format, data)

    def read_boxes(self):
        """Consume the rest of the current box and return a new
        :class:`BoxReader` over those bytes, for reading its sub-boxes."""
        size = self.remaining_in_box
        data = self._read_bytes(size)
        return BoxReader(io.BytesIO(data), size)

    def has_next_box(self):
        """Return whether another box may follow the current one.

        Always ``True`` when the total length is unknown.
        """
        if not self.has_length:
            return True
        # Before the first box is entered remaining_in_box holds the -1
        # sentinel; clamp it so it is not mistaken for a byte count, which
        # would make an empty (length 0) reader claim to have a next box.
        unread = max(self.remaining_in_box, 0)
        return self.fp.tell() + unread < self.length

    def next_box_type(self):
        """
        Advance to the next box and return its 4-byte type.

        Any unread contents of the current box are skipped first. After the
        call, the reader is positioned at the start of the new box's contents.

        :raises SyntaxError: If the box header is truncated or declares a
            length that is smaller than the header itself or that extends
            past the available data.
        """
        # Skip the rest of the box if it has not been read
        if self.remaining_in_box > 0:
            self.fp.seek(self.remaining_in_box, os.SEEK_CUR)
        self.remaining_in_box = -1

        # Read the length and type of the next box
        lbox, tbox = self.read_fields(">I4s")
        if lbox == 1:
            # A length of 1 signals that a 64-bit extended length follows
            lbox = self.read_fields(">Q")[0]
            hlen = 16
        else:
            hlen = 8

        if lbox < hlen or not self._can_read(lbox - hlen):
            raise SyntaxError("Invalid header length")

        self.remaining_in_box = lbox - hlen
        return tbox
|
||
|
||
def _parse_codestream(fp): | ||
"""Parse the JPEG 2000 codestream to extract the size and component | ||
count from the SIZ marker segment, returning a PIL (size, mode) tuple.""" | ||
|
@@ -53,55 +127,48 @@ def _parse_codestream(fp): | |
return (size, mode) | ||
|
||
|
||
def _res_to_dpi(num, denom, exp):
    """Convert JPEG2000's (numerator, denominator, exponent-base-10) resolution,
    calculated as (num / denom) * 10^exp and stored in dots per meter,
    to floating-point dots per inch.

    Returns ``None`` when ``denom`` is zero, since the resolution is then
    undefined."""
    if denom != 0:
        # One inch is 254 / 10000 meters. Keeping the arithmetic in integers
        # and performing a single division at the end avoids accumulating
        # rounding error from several chained floating-point operations.
        return (254 * num * (10 ** exp)) / (10000 * denom)
    return None
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I should've probably clarified this in a code comment: the reason for my roundabout way of calculating the resolution was to remain precise by working with ints for as long as possible, and to only have a single (division) operation which introduces floating point errors. While your code is more elegant and more easily readable, there are now 3 floating point operations that can introduce slight numeric errors. I'm not sure whether my approach is overkill, though. It's not like those last few mantissa bits are ever realistically going to matter. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, ok. I was thinking of being consistent with how this value is used elsewhere in Pillow. I'll switch back to your version. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Awesome, thanks! 😄 |
||
|
||
|
||
def _parse_jp2_header(fp): | ||
"""Parse the JP2 header box to extract size, component count and | ||
color space information, returning a (size, mode, mimetype) tuple.""" | ||
"""Parse the JP2 header box to extract size, component count, | ||
color space information, and optionally DPI information, | ||
returning a (size, mode, mimetype, dpi) tuple.""" | ||
|
||
# Find the JP2 header box | ||
reader = BoxReader(fp) | ||
header = None | ||
mimetype = None | ||
while True: | ||
lbox, tbox = struct.unpack(">I4s", fp.read(8)) | ||
if lbox == 1: | ||
lbox = struct.unpack(">Q", fp.read(8))[0] | ||
hlen = 16 | ||
else: | ||
hlen = 8 | ||
|
||
if lbox < hlen: | ||
raise SyntaxError("Invalid JP2 header length") | ||
while reader.has_next_box(): | ||
tbox = reader.next_box_type() | ||
|
||
if tbox == b"jp2h": | ||
header = fp.read(lbox - hlen) | ||
header = reader.read_boxes() | ||
break | ||
elif tbox == b"ftyp": | ||
if fp.read(4) == b"jpx ": | ||
if reader.read_fields(">4s")[0] == b"jpx ": | ||
mimetype = "image/jpx" | ||
fp.seek(lbox - hlen - 4, os.SEEK_CUR) | ||
else: | ||
fp.seek(lbox - hlen, os.SEEK_CUR) | ||
|
||
if header is None: | ||
raise SyntaxError("could not find JP2 header") | ||
raise SyntaxError("Could not find JP2 header") | ||
|
||
size = None | ||
mode = None | ||
bpc = None | ||
nc = None | ||
dpi = None # 2-tuple of DPI info, or None | ||
unkc = 0 # Colorspace information unknown | ||
|
||
hio = io.BytesIO(header) | ||
while True: | ||
lbox, tbox = struct.unpack(">I4s", hio.read(8)) | ||
if lbox == 1: | ||
lbox = struct.unpack(">Q", hio.read(8))[0] | ||
hlen = 16 | ||
else: | ||
hlen = 8 | ||
|
||
content = hio.read(lbox - hlen) | ||
while header.has_next_box(): | ||
tbox = header.next_box_type() | ||
|
||
if tbox == b"ihdr": | ||
height, width, nc, bpc, c, unkc, ipr = struct.unpack(">IIHBBBB", content) | ||
height, width, nc, bpc, c, unkc, ipr = header.read_fields(">IIHBBBB") | ||
size = (width, height) | ||
if unkc: | ||
if nc == 1 and (bpc & 0x7F) > 8: | ||
|
@@ -114,11 +181,10 @@ def _parse_jp2_header(fp): | |
mode = "RGB" | ||
elif nc == 4: | ||
mode = "RGBA" | ||
break | ||
elif tbox == b"colr": | ||
meth, prec, approx = struct.unpack_from(">BBB", content) | ||
if meth == 1: | ||
cs = struct.unpack_from(">I", content, 3)[0] | ||
meth, prec, approx = header.read_fields(">BBB") | ||
if meth == 1 and unkc == 0: | ||
radarhere marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cs = header.read_fields(">I")[0] | ||
if cs == 16: # sRGB | ||
if nc == 1 and (bpc & 0x7F) > 8: | ||
mode = "I;16" | ||
|
@@ -128,26 +194,33 @@ def _parse_jp2_header(fp): | |
mode = "RGB" | ||
elif nc == 4: | ||
mode = "RGBA" | ||
break | ||
elif cs == 17: # grayscale | ||
if nc == 1 and (bpc & 0x7F) > 8: | ||
mode = "I;16" | ||
elif nc == 1: | ||
mode = "L" | ||
elif nc == 2: | ||
mode = "LA" | ||
break | ||
elif cs == 18: # sYCC | ||
if nc == 3: | ||
mode = "RGB" | ||
elif nc == 4: | ||
mode = "RGBA" | ||
break | ||
elif tbox == b"res ": | ||
res = header.read_boxes() | ||
while res.has_next_box(): | ||
tres = res.next_box_type() | ||
if tres == b"resc": | ||
vrcn, vrcd, hrcn, hrcd, vrce, hrce = res.read_fields(">HHHHBB") | ||
hres = _res_to_dpi(hrcn, hrcd, hrce) | ||
vres = _res_to_dpi(vrcn, vrcd, vrce) | ||
if hres is not None and vres is not None: | ||
dpi = (hres, vres) | ||
|
||
if size is None or mode is None: | ||
raise SyntaxError("Malformed jp2 header") | ||
raise SyntaxError("Malformed JP2 header") | ||
|
||
return (size, mode, mimetype) | ||
return (size, mode, mimetype, dpi) | ||
|
||
|
||
## | ||
|
@@ -169,7 +242,9 @@ def _open(self): | |
if sig == b"\x00\x00\x00\x0cjP \x0d\x0a\x87\x0a": | ||
self.codec = "jp2" | ||
header = _parse_jp2_header(self.fp) | ||
self._size, self.mode, self.custom_mimetype = header | ||
self._size, self.mode, self.custom_mimetype, dpi = header | ||
if dpi is not None: | ||
self.info["dpi"] = dpi | ||
else: | ||
raise SyntaxError("not a JPEG 2000 file") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could technically also be an elif for symmetry and whatnot -- not that it matters, since we return from the previous if branch anyway. 👍 on spotting and fixing the bug where we could read past the parent box length!