Fix decoding of multibyte text with backslash (#1724)

- backslashes in multibyte encodings were mishandled as separators - decode string before validating its length (DICOM standard specifies some maximum value lengths in characters rather than bytes) - remove incorrect validation of byte values - fix decoding PersonName in convert_PN - fix handle multivalued PN - pin mypy to a fix version Co-authored-by: mrbean-bremen <hansemrbean@googlemail.com> Co-authored-by: Laurent Turek <laurent.turek@gmail.com>
pydicom · Jan 20, 2023 · 9c7f4dc · 9c7f4dc
1 parent fd5b9ff
commit 9c7f4dc
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 79 deletions.
diff --git a/pydicom/filewriter.py b/pydicom/filewriter.py
@@ -355,13 +355,6 @@ def write_string(fp: DicomIO, elem: DataElement, padding: str = ' ') -> None:
         fp.write(val)  # type: ignore[arg-type]
 
 
-def _encode_and_validate_string(vr: str, value: str,
-                                encodings: Sequence[str]) -> bytes:
-    encoded = encode_string(value, encodings)
-    validate_value(vr, encoded, config.settings.writing_validation_mode)
-    return encoded
-
-
 def write_text(
     fp: DicomIO, elem: DataElement, encodings: Optional[List[str]] = None
 ) -> None:
@@ -374,16 +367,15 @@ def write_text(
             if isinstance(val[0], str):
                 val = cast(Sequence[str], val)
                 val = b'\\'.join(
-                    [_encode_and_validate_string(elem.VR, val, encodings)
-                     for val in val]
+                    [encode_string(val, encodings) for val in val]
                 )
             else:
                 val = cast(Sequence[bytes], val)
                 val = b'\\'.join([val for val in val])
         else:
             val = cast(Union[bytes, str], val)
             if isinstance(val, str):
-                val = _encode_and_validate_string(elem.VR, val, encodings)
+                val = encode_string(val, encodings)
 
         if len(val) % 2 != 0:
             val = val + b' '  # pad to even length

diff --git a/pydicom/tests/test_config.py b/pydicom/tests/test_config.py
@@ -148,15 +148,6 @@ def test_invalid_keyword_raise(self, future_setter):
         with pytest.raises(ValueError):
             ds.bitsStored = 42
 
-    def test_write_invalid_values(self, future_setter):
-        ds = Dataset()
-        ds.is_implicit_VR = True
-        ds.is_little_endian = True
-        ds.SpecificCharacterSet = "ISO_IR 192"
-        ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
-        with pytest.raises(ValueError):
-            ds.save_as(DicomBytesIO())
-
 
 class TestSettings:
     @pytest.fixture
@@ -177,22 +168,3 @@ def test_reading_validation_mode_with_enforce_valid_values(
                              0, True, True)
         with pytest.raises(KeyError):
             DataElement_from_raw(raw)
-
-    def test_default_for_writing_validation_mode(self):
-        ds = Dataset()
-        ds.is_implicit_VR = True
-        ds.is_little_endian = True
-        ds.SpecificCharacterSet = "ISO_IR 192"
-        ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
-        with pytest.warns(UserWarning):
-            ds.save_as(DicomBytesIO())
-
-    def test_writing_validation_mode_with_enforce_valid_values(
-            self, enforce_valid_values):
-        ds = Dataset()
-        ds.is_implicit_VR = True
-        ds.is_little_endian = True
-        ds.SpecificCharacterSet = "ISO_IR 192"
-        ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
-        with pytest.raises(ValueError):
-            ds.save_as(DicomBytesIO())
diff --git a/pydicom/tests/test_dataelem.py b/pydicom/tests/test_dataelem.py
@@ -936,50 +936,41 @@ def test_invalid_pn_value_type(self, value, value_type):
     def test_valid_pn(self, value):
         self.check_valid_vr("PN", value)
 
-    def test_write_invalid_length_non_ascii_text(
+    def test_write_valid_length_non_ascii_text(
             self, enforce_writing_invalid_values):
         fp = DicomBytesIO()
         ds = Dataset()
         ds.is_implicit_VR = True
         ds.is_little_endian = True
         ds.SpecificCharacterSet = "ISO_IR 192"  # UTF-8
-        # the string length is 9, so constructing the data element
-        # is possible
         ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
+        # shall not raise, as the number of characters is considered,
+        # not the number of bytes (which is > 16)
+        dcmread(fp, force=True)
 
-        # encoding the element during writing shall fail,
-        # as the encoded length is 21, while only 16 bytes are allowed for SH
-        msg = r"The value length \(21\) exceeds the maximum length of 16 *"
-        with pytest.raises(ValueError, match=msg):
-            ds.save_as(fp)
-
-    def test_write_invalid_non_ascii_pn(self, enforce_writing_invalid_values):
+    def test_write_valid_non_ascii_pn(self, enforce_writing_invalid_values):
         fp = DicomBytesIO()
         ds = Dataset()
         ds.is_implicit_VR = False
         ds.is_little_endian = True
         ds.SpecificCharacterSet = "ISO_IR 192"  # UTF-8
         # string length is 40
         ds.add(DataElement(0x00100010, "PN", "洪^吉洞" * 10))
+        # shall not raise, as the number of characters is considered,
+        # not the number of bytes (which is > 64)
+        ds.save_as(fp)
 
-        msg = r"The PN component length \(100\) exceeds the maximum allowed *"
-        with pytest.raises(ValueError, match=msg):
-            ds.save_as(fp)
-
-    def test_read_invalid_length_non_ascii_text(self):
+    def test_read_valid_length_non_ascii_text(self):
         fp = DicomBytesIO()
         ds = Dataset()
         ds.is_implicit_VR = True
         ds.is_little_endian = True
         ds.SpecificCharacterSet = "ISO_IR 192"  # UTF-8
         ds.add(DataElement(0x00080050, "SH", "洪^吉洞=홍^길동"))
-        # disable value validation to write an invalid value
-        with config.disable_value_validation():
-            ds.save_as(fp)
-
-        # no warning will be issued during reading, as only RawDataElement
-        # objects are read
-        ds = dcmread(fp, force=True)
+        # shall not raise, as the number of characters is considered,
+        # not the number of bytes (which is > 16)
+        ds.save_as(fp)
+        dcmread(fp, force=True)
 
     @pytest.mark.parametrize("value, value_type", [
         ("1", "str"), (1.5, "float"), (complex(1, 2), "complex")])

diff --git a/pydicom/tests/test_values.py b/pydicom/tests/test_values.py
@@ -7,7 +7,7 @@
 from pydicom.tag import Tag
 from pydicom.values import (
     convert_value, converters, convert_tag, convert_ATvalue, convert_DA_string,
-    convert_text, convert_single_string, convert_AE_string
+    convert_text, convert_single_string, convert_AE_string, convert_PN
 )
 from pydicom.valuerep import VR
 
@@ -80,6 +80,14 @@ def test_single_value_with_backslash(self):
         assert 'Buc^Jérôme\\Διονυσιος\\Люкceмбypг' == convert_single_string(
             bytestring, encodings)
 
+    def test_multi_value_with_backslash_in_multibyte_encoding(self):
+        """Test that backslash in multibyte encoding is not mishandled as
+           value separator
+        """
+        bytestring = b'\x1b$BG\\<\\\x1b-A\\\x1b$BK\\L\\\x1b-A'
+        encodings = ['latin_1', 'iso2022_jp']
+        assert ['倍尺', '本目'] == convert_text(bytestring, encodings)
+
     def test_single_value_with_unknown_encoding(self):
         bytestring = b'Buc^J\xe9r\xf4me'
         encodings = ['unknown']
@@ -218,6 +226,35 @@ def test_convert_of(self):
         assert b'\x00\x01\x02\x03' == converters['OF'](fp, True)
 
 
+class TestConvertPN:
+    def test_valid_PN(self):
+        bytestring = b'John^Doe'
+        assert 'John^Doe' == convert_PN(bytestring)
+
+        bytestring = b'Yamada^Tarou=\x1b$B;3ED\x1b(B^\x1b$BB@O:'
+        converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
+        assert 'Yamada^Tarou=山田^太郎' == converted_string
+
+        # 0x5c (\) in multibyte codes
+        bytestring = b'Baishaku^Honme=\x1b$BG\\<\\\x1b(B^\x1b$BK\\L\\'
+        converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
+        assert 'Baishaku^Honme=倍尺^本目' == converted_string
+
+        # 0x3d (=) in multibyte codes
+        bytestring = b'Sunatouge^Ayano=\x1b$B:=F=\x1b(B^\x1b$B0=G='
+        converted_string = convert_PN(bytestring, ['latin_1', 'iso2022_jp'])
+        assert 'Sunatouge^Ayano=砂峠^綾能' == converted_string
+
+    def test_multi_value(self):
+        """Test that backslash is handled as value separator"""
+        bytestring = (b'Buc^J\xe9r\xf4me\\\x1b\x2d\x46'
+                      b'\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2\\'
+                      b'\x1b$BG\\<\\\x1b(B^\x1b$BK\\L\\')
+        encodings = ['latin_1', 'iso2022_jp', 'iso_ir_126']
+        assert ['Buc^Jérôme', 'Διονυσιος', '倍尺^本目'] == convert_PN(
+            bytestring, encodings)
+
+
 def test_all_converters():
     """Test that the VR decoder functions are complete"""
     assert set(VR) == set(converters)
diff --git a/pydicom/valuerep.py b/pydicom/valuerep.py
@@ -1483,7 +1483,6 @@ def _encode_personname(
             encode_string(group, encodings) for group in comp.split('^')
         ]
         encoded_comp = b'^'.join(groups)
-        validate_pn_component(encoded_comp)
         encoded_comps.append(encoded_comp)
 
     # Remove empty elements from the end
@@ -1834,10 +1833,7 @@ def _encode_component_groups(
         from pydicom.charset import encode_string, decode_bytes
 
         def enc(s: str) -> bytes:
-            b = encode_string(s, encodings or [default_encoding])
-            validate_value("PN", b, config.settings.writing_validation_mode,
-                           validate_pn_component_length)
-            return b
+            return encode_string(s, encodings or [default_encoding])
 
         def dec(s: bytes) -> str:
             return decode_bytes(s, encodings or [default_encoding], set())

diff --git a/pydicom/values.py b/pydicom/values.py
@@ -466,14 +466,22 @@ def convert_PN(
     valuerep.PersonName or MultiValue of PersonName
         The decoded 'PN' value(s).
     """
-    def get_valtype(x: bytes) -> PersonName:
-        return PersonName(x, encodings).decode()
 
-    b_split = byte_string.rstrip(b'\x00 ').split(b'\\')
-    if len(b_split) == 1:
-        return get_valtype(b_split[0])
+    encodings = encodings or [default_encoding]
+
+    def get_valtype(x: str) -> PersonName:
+        person_name = PersonName(x, encodings)
+        # Using an already decoded string in PersonName constructor leaves
+        # the original string as undefined, let's set it through encode
+        person_name.encode()
+        return person_name.decode()
 
-    return MultiValue(get_valtype, b_split)
+    stripped_string = byte_string.rstrip(b'\x00 ')
+    decoded_value = decode_bytes(stripped_string, encodings, TEXT_VR_DELIMS)
+    value_splitted = decoded_value.split('\\')
+    if len(value_splitted) == 1:
+        return get_valtype(value_splitted[0])
+    return MultiValue(get_valtype, value_splitted)
 
 
 def convert_string(
@@ -525,12 +533,18 @@ def convert_text(
     str or list of str
         The decoded value(s).
     """
-    values = byte_string.split(b'\\')
-    as_strings = [convert_single_string(value, encodings, vr)
-                  for value in values]
+    def handle_value(v: str) -> str:
+        if vr is not None:
+            validate_value(
+                vr, v, config.settings.reading_validation_mode)
+        return v.rstrip('\0 ')
+
+    encodings = encodings or [default_encoding]
+    decoded_string = decode_bytes(byte_string, encodings, TEXT_VR_DELIMS)
+    values = decoded_string.split("\\")
+    as_strings = [handle_value(value) for value in values]
     if len(as_strings) == 1:
         return as_strings[0]
-
     return MultiValue(str, as_strings,
                       validation_mode=config.settings.reading_validation_mode)
 
@@ -555,11 +569,11 @@ def convert_single_string(
     str
         The decoded text.
     """
-    if vr is not None:
-        validate_value(
-            vr, byte_string, config.settings.reading_validation_mode)
     encodings = encodings or [default_encoding]
     value = decode_bytes(byte_string, encodings, TEXT_VR_DELIMS)
+    if vr is not None:
+        validate_value(
+            vr, value, config.settings.reading_validation_mode)
     return value.rstrip('\0 ')