Skip to content

Commit

Permalink
Fix decoding of multibyte text with backslash (#1724)
Browse files Browse the repository at this point in the history
- backslashes in multibyte encodings were mishandled as separators
- decode string before validating its length
 (DICOM standard specifies some maximum value lengths in characters rather than bytes) 
- remove incorrect validation of byte values
- fix decoding PersonName in convert_PN
- fix handle multivalued PN
- pin mypy to a fix version

Co-authored-by: mrbean-bremen <hansemrbean@googlemail.com>
Co-authored-by: Laurent Turek <laurent.turek@gmail.com>
  • Loading branch information
3 people committed Jan 20, 2023
1 parent fd5b9ff commit 9c7f4dc
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 79 deletions.
12 changes: 2 additions & 10 deletions pydicom/filewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,13 +355,6 @@ def write_string(fp: DicomIO, elem: DataElement, padding: str = ' ') -> None:
fp.write(val) # type: ignore[arg-type]


def _encode_and_validate_string(vr: str, value: str,
encodings: Sequence[str]) -> bytes:
encoded = encode_string(value, encodings)
validate_value(vr, encoded, config.settings.writing_validation_mode)
return encoded


def write_text(
fp: DicomIO, elem: DataElement, encodings: Optional[List[str]] = None
) -> None:
Expand All @@ -374,16 +367,15 @@ def write_text(
if isinstance(val[0], str):
val = cast(Sequence[str], val)
val = b'\\'.join(
[_encode_and_validate_string(elem.VR, val, encodings)
for val in val]
[encode_string(val, encodings) for val in val]
)
else:
val = cast(Sequence[bytes], val)
val = b'\\'.join([val for val in val])
else:
val = cast(Union[bytes, str], val)
if isinstance(val, str):
val = _encode_and_validate_string(elem.VR, val, encodings)
val = encode_string(val, encodings)

if len(val) % 2 != 0:
val = val + b' ' # pad to even length
Expand Down
28 changes: 0 additions & 28 deletions pydicom/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,6 @@ def test_invalid_keyword_raise(self, future_setter):
with pytest.raises(ValueError):
ds.bitsStored = 42

def test_write_invalid_values(self, future_setter):
ds = Dataset()
ds.is_implicit_VR = True
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192"
ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
with pytest.raises(ValueError):
ds.save_as(DicomBytesIO())


class TestSettings:
@pytest.fixture
Expand All @@ -177,22 +168,3 @@ def test_reading_validation_mode_with_enforce_valid_values(
0, True, True)
with pytest.raises(KeyError):
DataElement_from_raw(raw)

def test_default_for_writing_validation_mode(self):
ds = Dataset()
ds.is_implicit_VR = True
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192"
ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
with pytest.warns(UserWarning):
ds.save_as(DicomBytesIO())

def test_writing_validation_mode_with_enforce_valid_values(
self, enforce_valid_values):
ds = Dataset()
ds.is_implicit_VR = True
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192"
ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
with pytest.raises(ValueError):
ds.save_as(DicomBytesIO())
35 changes: 13 additions & 22 deletions pydicom/tests/test_dataelem.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,50 +936,41 @@ def test_invalid_pn_value_type(self, value, value_type):
def test_valid_pn(self, value):
self.check_valid_vr("PN", value)

def test_write_invalid_length_non_ascii_text(
def test_write_valid_length_non_ascii_text(
self, enforce_writing_invalid_values):
fp = DicomBytesIO()
ds = Dataset()
ds.is_implicit_VR = True
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192" # UTF-8
# the string length is 9, so constructing the data element
# is possible
ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
# shall not raise, as the number of characters is considered,
# not the number of bytes (which is > 16)
dcmread(fp, force=True)

# encoding the element during writing shall fail,
# as the encoded length is 21, while only 16 bytes are allowed for SH
msg = r"The value length \(21\) exceeds the maximum length of 16 *"
with pytest.raises(ValueError, match=msg):
ds.save_as(fp)

def test_write_invalid_non_ascii_pn(self, enforce_writing_invalid_values):
def test_write_valid_non_ascii_pn(self, enforce_writing_invalid_values):
fp = DicomBytesIO()
ds = Dataset()
ds.is_implicit_VR = False
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192" # UTF-8
# string length is 40
ds.add(DataElement(0x00100010, "PN", "洪^吉洞" * 10))
# shall not raise, as the number of characters is considered,
# not the number of bytes (which is > 64)
ds.save_as(fp)

msg = r"The PN component length \(100\) exceeds the maximum allowed *"
with pytest.raises(ValueError, match=msg):
ds.save_as(fp)

def test_read_invalid_length_non_ascii_text(self):
def test_read_valid_length_non_ascii_text(self):
fp = DicomBytesIO()
ds = Dataset()
ds.is_implicit_VR = True
ds.is_little_endian = True
ds.SpecificCharacterSet = "ISO_IR 192" # UTF-8
ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
# disable value validation to write an invalid value
with config.disable_value_validation():
ds.save_as(fp)

# no warning will be issued during reading, as only RawDataElement
# objects are read
ds = dcmread(fp, force=True)
# shall not raise, as the number of characters is considered,
# not the number of bytes (which is > 16)
ds.save_as(fp)
dcmread(fp, force=True)

@pytest.mark.parametrize("value, value_type", [
("1", "str"), (1.5, "float"), (complex(1, 2), "complex")])
Expand Down
39 changes: 38 additions & 1 deletion pydicom/tests/test_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pydicom.tag import Tag
from pydicom.values import (
convert_value, converters, convert_tag, convert_ATvalue, convert_DA_string,
convert_text, convert_single_string, convert_AE_string
convert_text, convert_single_string, convert_AE_string, convert_PN
)
from pydicom.valuerep import VR

Expand Down Expand Up @@ -80,6 +80,14 @@ def test_single_value_with_backslash(self):
assert 'Buc^Jérôme\\Διονυσιος\\Люкceмбypг' == convert_single_string(
bytestring, encodings)

def test_multi_value_with_backslash_in_multibyte_encoding(self):
"""Test that backslash in multibyte encoding is not mishandled as
value separator
"""
bytestring = b'\x1b$BG\\<\\\x1b-A\\\x1b$BK\\L\\\x1b-A'
encodings = ['latin_1', 'iso2022_jp']
assert ['倍尺', '本目'] == convert_text(bytestring, encodings)

def test_single_value_with_unknown_encoding(self):
bytestring = b'Buc^J\xe9r\xf4me'
encodings = ['unknown']
Expand Down Expand Up @@ -218,6 +226,35 @@ def test_convert_of(self):
assert b'\x00\x01\x02\x03' == converters['OF'](fp, True)


class TestConvertPN:
def test_valid_PN(self):
bytestring = b'John^Doe'
assert 'John^Doe' == convert_PN(bytestring)

bytestring = b'Yamada^Tarou=\x1b$B;3ED\x1b(B^\x1b$BB@O:'
converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
assert 'Yamada^Tarou=山田^太郎' == converted_string

# 0x5c (\) in multibyte codes
bytestring = b'Baishaku^Honme=\x1b$BG\\<\\\x1b(B^\x1b$BK\\L\\'
converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
assert 'Baishaku^Honme=倍尺^本目' == converted_string

# 0x3d (=) in multibyte codes
bytestring = b'Sunatouge^Ayano=\x1b$B:=F=\x1b(B^\x1b$B0=G='
converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
assert 'Sunatouge^Ayano=砂峠^綾能' == converted_string

def test_multi_value(self):
"""Test that backslash is handled as value separator"""
bytestring = (b'Buc^J\xe9r\xf4me\\\x1b\x2d\x46'
b'\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2\\'
b'\x1b$BG\\<\\\x1b(B^\x1b$BK\\L\\')
encodings = ['latin_1', 'iso2022_jp', 'iso_ir_126']
assert ['Buc^Jérôme', 'Διονυσιος', '倍尺^本目'] == convert_PN(
bytestring, encodings)


def test_all_converters():
"""Test that the VR decoder functions are complete"""
assert set(VR) == set(converters)
6 changes: 1 addition & 5 deletions pydicom/valuerep.py
Original file line number Diff line number Diff line change
Expand Up @@ -1483,7 +1483,6 @@ def _encode_personname(
encode_string(group, encodings) for group in comp.split('^')
]
encoded_comp = b'^'.join(groups)
validate_pn_component(encoded_comp)
encoded_comps.append(encoded_comp)

# Remove empty elements from the end
Expand Down Expand Up @@ -1834,10 +1833,7 @@ def _encode_component_groups(
from pydicom.charset import encode_string, decode_bytes

def enc(s: str) -> bytes:
b = encode_string(s, encodings or [default_encoding])
validate_value("PN", b, config.settings.writing_validation_mode,
validate_pn_component_length)
return b
return encode_string(s, encodings or [default_encoding])

def dec(s: bytes) -> str:
return decode_bytes(s, encodings or [default_encoding], set())
Expand Down
40 changes: 27 additions & 13 deletions pydicom/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,14 +466,22 @@ def convert_PN(
valuerep.PersonName or MultiValue of PersonName
The decoded 'PN' value(s).
"""
def get_valtype(x: bytes) -> PersonName:
return PersonName(x, encodings).decode()

b_split = byte_string.rstrip(b'\x00 ').split(b'\\')
if len(b_split) == 1:
return get_valtype(b_split[0])
encodings = encodings or [default_encoding]

def get_valtype(x: str) -> PersonName:
person_name = PersonName(x, encodings)
# Using an already decoded string in PersonName constructor leaves
# the original string as undefined, let's set it through encode
person_name.encode()
return person_name.decode()

return MultiValue(get_valtype, b_split)
stripped_string = byte_string.rstrip(b'\x00 ')
decoded_value = decode_bytes(stripped_string, encodings, TEXT_VR_DELIMS)
value_splitted = decoded_value.split('\\')
if len(value_splitted) == 1:
return get_valtype(value_splitted[0])
return MultiValue(get_valtype, value_splitted)


def convert_string(
Expand Down Expand Up @@ -525,12 +533,18 @@ def convert_text(
str or list of str
The decoded value(s).
"""
values = byte_string.split(b'\\')
as_strings = [convert_single_string(value, encodings, vr)
for value in values]
def handle_value(v: str) -> str:
if vr is not None:
validate_value(
vr, v, config.settings.reading_validation_mode)
return v.rstrip('\0 ')

encodings = encodings or [default_encoding]
decoded_string = decode_bytes(byte_string, encodings, TEXT_VR_DELIMS)
values = decoded_string.split("\\")
as_strings = [handle_value(value) for value in values]
if len(as_strings) == 1:
return as_strings[0]

return MultiValue(str, as_strings,
validation_mode=config.settings.reading_validation_mode)

Expand All @@ -555,11 +569,11 @@ def convert_single_string(
str
The decoded text.
"""
if vr is not None:
validate_value(
vr, byte_string, config.settings.reading_validation_mode)
encodings = encodings or [default_encoding]
value = decode_bytes(byte_string, encodings, TEXT_VR_DELIMS)
if vr is not None:
validate_value(
vr, value, config.settings.reading_validation_mode)
return value.rstrip('\0 ')


Expand Down

0 comments on commit 9c7f4dc

Please sign in to comment.