From c70d3d3e64c36a0c1e042d5e7362bb0e26a4ce40 Mon Sep 17 00:00:00 2001 From: Alexander ADAM Date: Sat, 11 Oct 2025 01:54:43 +0200 Subject: [PATCH 1/2] add tika regex support This will work for simple things only because we're using a different regex engine. But out of all the current regular expressions, only the one for `application/x-dbf` fails. So I guess we're good. And we can get rid of that html definition now. --- data/custom.xml | 8 + lib/marcel.rb | 1 + lib/marcel/magic.rb | 36 ++++- lib/marcel/mime_type/definitions.rb | 1 - lib/marcel/tables.rb | 50 ++++--- lib/marcel/tika_regex.rb | 61 ++++++++ script/generate_tables.rb | 16 ++ .../magic/application/x-bzip2/bzip2.bz2 | Bin 0 -> 14 bytes test/magic_test.rb | 130 +++++++++++++++++ test/tika_regex_test.rb | 137 ++++++++++++++++++ 10 files changed, 413 insertions(+), 27 deletions(-) create mode 100644 lib/marcel/tika_regex.rb create mode 100644 test/fixtures/magic/application/x-bzip2/bzip2.bz2 create mode 100644 test/tika_regex_test.rb diff --git a/data/custom.xml b/data/custom.xml index 1c6b3eb..3786640 100644 --- a/data/custom.xml +++ b/data/custom.xml @@ -1,4 +1,12 @@ + + + + + + + + diff --git a/lib/marcel.rb b/lib/marcel.rb index 151ebab..dbecee5 100644 --- a/lib/marcel.rb +++ b/lib/marcel.rb @@ -2,6 +2,7 @@ module Marcel require "marcel/version" + require "marcel/tika_regex" require "marcel/magic" require "marcel/mime_type" end diff --git a/lib/marcel/magic.rb b/lib/marcel/magic.rb index 60e3387..453545d 100644 --- a/lib/marcel/magic.rb +++ b/lib/marcel/magic.rb @@ -126,9 +126,14 @@ def self.magic_match(io, method) def self.magic_match_io(io, matches, buffer) matches.any? do |offset, value, children| + # Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all) + next false if value.nil? + match = if value - if Range === offset + if value.is_a?(Regexp) + match_regex(io, offset, value, buffer) + elsif Range === offset io.read(offset.begin, buffer) x = io.read(offset.end - offset.begin + value.bytesize, buffer) x && x.include?(value) @@ -143,6 +148,33 @@ def self.magic_match_io(io, matches, buffer) end end - private_class_method :magic_match, :magic_match_io + def self.match_regex(io, offset, regexp, buffer) + start = offset.is_a?(Range) ? offset.begin : offset + io.read(start, buffer) if start > 0 + data = io.read(256, buffer) + return false unless data + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this match call. + # I'm open to better ideas. + begin + old_verbose = $VERBOSE + $VERBOSE = nil + + # For regex patterns, simply match within the data buffer + # The patterns themselves should be designed to match appropriately + data.match?(regexp) + ensure + $VERBOSE = old_verbose + end + + # we need to catch all exceptions here because TruffleRuby raises Polyglot::ForeignException + rescue Exception => e + warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}" + false + end + + private_class_method :magic_match, :magic_match_io, :match_regex end end diff --git a/lib/marcel/mime_type/definitions.rb b/lib/marcel/mime_type/definitions.rb index 09ce72b..65db8bb 100644 --- a/lib/marcel/mime_type/definitions.rb +++ b/lib/marcel/mime_type/definitions.rb @@ -1,7 +1,6 @@ # frozen_string_literal: true Marcel::MimeType.extend "text/plain", extensions: %w( txt asc ) -Marcel::MimeType.extend "text/html", magic: [[0..64, " %w(video/ogg), } b = Hash.new { |h, k| h[k] = k.b.freeze } + r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) } # @private # :nodoc: MAGIC = [ @@ -2518,7 +2519,7 @@ module Marcel ['image/bmp', [[0, b['BM'], [[26, b["\001\000"], [[28, b["\000\000"]], [28, b["\001\000"]], [28, b["\004\000"]], [28, b["\b\000"]], [28, b["\020\000"]], [28, b["\030\000"]], [28, b[" \000"]]]]]]]], ['image/vnd.adobe.photoshop', [[0, b["8BPS\000\001"]], [0, b["8BPS\000\002"]]]], ['image/webp', [[0, b['RIFF'], [[8, b['WEBP']]]]]], - ['text/html', [[0, b['(?i)<(html|head|body|title|div)[ >]']], [0, b['(?i)]']]]], + ['text/html', [[0, r['(?i)<(html|head|body|title|div)[ >]']], [0, r['(?i)]']]]], ['image/svg+xml', [[0..4096, b[' \xHH (hex byte) + # - \\uHHHH -> \uHHHH (unicode) + # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby) + # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes) + # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters) + # + # We process these specifically to avoid breaking the regex structure + processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH + .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH + .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH (octal to hex so that TruffleRuby doesn't think it's a backreference) + .gsub(/\\\\([WDS])/i, '\\\\\1') # \\d etc. -> \d + .gsub(/\\\\([farbentv])/, '\\\\\1') # \\n etc. -> \n + .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[ + + # Force binary encoding to handle binary escape sequences like \xff + processed = processed.force_encoding(Encoding::BINARY) + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this Regexp compilation. + # I'm open to better ideas. + old_verbose = $VERBOSE + $VERBOSE = nil + + Regexp.new(processed, flags).freeze + rescue RegexpError + nil + ensure + $VERBOSE = old_verbose + end + end +end diff --git a/script/generate_tables.rb b/script/generate_tables.rb index 5ff10e3..ef43a3e 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -4,6 +4,7 @@ # Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic. require 'nokogiri' +require_relative '../lib/marcel/tika_regex' class String alias inspect_old inspect @@ -27,6 +28,16 @@ def inspect end end +class RegexString + def initialize(pattern) + @pattern = pattern + end + + def inspect + "r[#{@pattern.inspect}]" + end +end + def str2int(s) return s.to_i(16) if s[0..1].downcase == '0x' return s.to_i(8) if s[0..0].downcase == '0' @@ -39,6 +50,8 @@ def binary_strings(object) object.map { |o| binary_strings(o) } when String BinaryString.new(object) + when RegexString + object when Numeric, Range, nil object else @@ -65,6 +78,8 @@ def get_matches(mime, parent) offset = offset.size == 2 ? offset[0]..offset[1] : offset[0] case type + when 'regex' + value = RegexString.new(value) when 'string', 'stringignorecase' value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') } value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") } @@ -231,6 +246,7 @@ def get_matches(mime, parent) end puts " }" puts " b = Hash.new { |h, k| h[k] = k.b.freeze }" +puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }" puts " # @private" puts " # :nodoc:" puts " MAGIC = [" diff --git a/test/fixtures/magic/application/x-bzip2/bzip2.bz2 b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..b56f3b974d6a345462b5a64b15a84c9b23bb40ec GIT binary patch literal 14 TcmZ>Y%CHnKa Ruby: \x00 (null byte) + pattern = '\\\\x00\\\\x41\\\\x42' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB" + end + + test "converts Java double-escaped octal sequences" do + # Java XML: \\000 -> Ruby: \000 (null byte) + pattern = '\\\\000\\\\101\\\\102' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB (octal)" + end + + test "converts Java double-escaped unicode sequences" do + # Java XML: \\u0041 -> Ruby: \u0041 (letter A) + pattern = '\\\\u0041\\\\u0042\\\\u0043' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("ABC"), "Should match ABC" + end + + test "converts Java double-escaped character classes" do + # \\d -> \d (digit) + pattern = 'JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("JAVA PROFILE 1.0.2"), "Should match version pattern" + refute result.match?("JAVA PROFILE X.Y.Z"), "Should not match non-digits" + end + + test "converts multiple escape types in one pattern" do + pattern = '\\\\d+\\\\x00\\\\s\\\\w+' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("123\x00 test"), "Should match digits, null, whitespace, word chars" + end + + test "removes multiple dotall flags" do + pattern = '(?s)first(?s)second' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert_equal 'firstsecond', result.source + assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE + end + + test "returns nil for incompatible pattern" do + # Variable-length lookbehind is not supported in Ruby + pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_nil result, "Incompatible pattern should return nil" + end + + test "returns nil for nil input" do + result = Marcel::TikaRegex.to_ruby_regexp(nil) + assert_nil result + end + + test "returns nil for empty string" do + result = Marcel::TikaRegex.to_ruby_regexp('') + assert_nil result + end + + test "handles character class overlaps silently" do + pattern = '[a-zA-Z][A-Za-z0-9_]' + + # Capture stderr to check for warnings + old_stderr = $stderr + $stderr = StringIO.new + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + warnings = $stderr.string + $stderr = old_stderr + + assert_instance_of Regexp, result + assert_equal '', warnings, "Should not produce warnings" + end + + test "handles multiple flags" do + pattern = '(?i)(?s).*' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\n"), "Should be case-insensitive and multiline" + assert result.match?("\ntest\n"), "Should match content across lines" + end + + test "compiles all regex patterns from tika.xml" do + # MIME types with known incompatible patterns + # These patterns use Java-specific regex features not supported by Ruby + ignore_list = %w( application/x-dbf ) + + doc = Nokogiri::XML(File.new('data/tika.xml')) + patterns_by_type = {} + + # Extract all regex patterns from tika.xml + (doc/'mime-info/mime-type').each do |mime| + type = mime['type'] + + (mime/'magic/match[@type="regex"]').each do |match| + patterns_by_type[type] ||= [] + patterns_by_type[type] << match['value'] + end + end + + patterns_by_type.each do |mime_type, patterns| + patterns.each do |pattern| + next if ignore_list.include?(mime_type) + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}" + end + end + end +end From 96504479a45268cc2a2d2fabb724a45c9c289cd5 Mon Sep 17 00:00:00 2001 From: Alexander ADAM Date: Tue, 14 Oct 2025 11:39:36 +0200 Subject: [PATCH 2/2] simplify regex logic and allow only tested types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplifying everything by just allowing what we are actively testing. 🎉 refs #128 --- data/custom.xml | 8 --- lib/marcel/magic.rb | 23 +------ lib/marcel/tables.rb | 39 +++-------- lib/marcel/tika_regex.rb | 13 +--- script/generate_tables.rb | 7 ++ test/magic_test.rb | 130 ------------------------------------ test/tika_regex_test.rb | 137 -------------------------------------- 7 files changed, 19 insertions(+), 338 deletions(-) delete mode 100644 test/tika_regex_test.rb diff --git a/data/custom.xml b/data/custom.xml index 3786640..1c6b3eb 100644 --- a/data/custom.xml +++ b/data/custom.xml @@ -1,12 +1,4 @@ - - - - - - - - diff --git a/lib/marcel/magic.rb b/lib/marcel/magic.rb index 453545d..56bfa86 100644 --- a/lib/marcel/magic.rb +++ b/lib/marcel/magic.rb @@ -126,9 +126,6 @@ def self.magic_match(io, method) def self.magic_match_io(io, matches, buffer) matches.any? do |offset, value, children| - # Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all) - next false if value.nil? - match = if value if value.is_a?(Regexp) @@ -154,25 +151,7 @@ def self.match_regex(io, offset, regexp, buffer) data = io.read(256, buffer) return false unless data - # I know, I know... this is awful, but the patterns come from Apache Tika - # and we are getting warnings about character class overlaps, so we'll - # suppress warnings for this match call. - # I'm open to better ideas. - begin - old_verbose = $VERBOSE - $VERBOSE = nil - - # For regex patterns, simply match within the data buffer - # The patterns themselves should be designed to match appropriately - data.match?(regexp) - ensure - $VERBOSE = old_verbose - end - - # we need to catch all exceptions here because TruffleRuby raises Polyglot::ForeignException - rescue Exception => e - warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}" - false + data.match?(regexp) end private_class_method :magic_match, :magic_match_io, :match_regex diff --git a/lib/marcel/tables.rb b/lib/marcel/tables.rb index 2ff46df..34ca187 100644 --- a/lib/marcel/tables.rb +++ b/lib/marcel/tables.rb @@ -2522,7 +2522,6 @@ module Marcel ['text/html', [[0, r['(?i)<(html|head|body|title|div)[ >]']], [0, r['(?i)]']]]], ['image/svg+xml', [[0..4096, b['']]]], ['application/x-tar', [[257, b["ustar\000"]]]], ['application/x-tika-msoffice', [[0..8, b["\320\317\021\340\241\261\032\341"]]]], - ['application/x-x509-key;format=der', []], ['application/xhtml+xml', [[0..8192, b[' \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby) # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes) # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters) - # + # # We process these specifically to avoid breaking the regex structure processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH @@ -44,18 +44,7 @@ def self.to_ruby_regexp(pattern) # Force binary encoding to handle binary escape sequences like \xff processed = processed.force_encoding(Encoding::BINARY) - # I know, I know... this is awful, but the patterns come from Apache Tika - # and we are getting warnings about character class overlaps, so we'll - # suppress warnings for this Regexp compilation. - # I'm open to better ideas. - old_verbose = $VERBOSE - $VERBOSE = nil - Regexp.new(processed, flags).freeze - rescue RegexpError - nil - ensure - $VERBOSE = old_verbose end end end diff --git a/script/generate_tables.rb b/script/generate_tables.rb index ef43a3e..629d520 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -60,6 +60,8 @@ def binary_strings(object) end def get_matches(mime, parent) + well_known_regex_types = %w( application/x-bzip2 text/html ) + parent.elements.map {|match| children = get_matches(mime, match) @@ -79,6 +81,8 @@ def get_matches(mime, parent) offset = offset.size == 2 ? offset[0]..offset[1] : offset[0] case type when 'regex' + next nil unless well_known_regex_types.include?(mime['type']) + value = RegexString.new(value) when 'string', 'stringignorecase' value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') } @@ -130,6 +134,7 @@ def get_matches(mime, parent) nil else warn "#{mime['type']}: unsupported #{type} match: #{match.to_s}" + next nil end children.empty? ? [offset, value] : [offset, value, children] @@ -251,6 +256,8 @@ def get_matches(mime, parent) puts " # :nodoc:" puts " MAGIC = [" magics.each do |priority, type, matches| + next if matches.nil? || matches.empty? + puts " ['#{type}', #{binary_strings(matches).inspect}]," end puts " ]" diff --git a/test/magic_test.rb b/test/magic_test.rb index 1ef5f73..14d11e2 100644 --- a/test/magic_test.rb +++ b/test/magic_test.rb @@ -25,134 +25,4 @@ class Marcel::MimeType::MagicTest < Marcel::TestCase assert Marcel::Magic.child?('text/csv', 'text/plain') refute Marcel::Magic.child?('text/plain', 'text/csv') end - - test "none of the regex patterns should match random test data" do - ignore_list = %w( application/x-dbf ) - - extract_regexes = lambda do |matching_rules, collected = []| - matching_rules.each do |offset, value, children| - collected << [offset, value] if value.is_a?(Regexp) - extract_regexes.call(children, collected) if children - end - collected - end - - # Use a test string that's very unlikely to match any file format regex - # Using only high Unicode characters and very specific patterns - test_data = "🇨🇭 \xFF\xFE\x03\x05\x06🧀 cheese\x06\x07\x03" - - Marcel::MAGIC.each do |type, matching_rules| - next if ignore_list.include?(type) - regexes = extract_regexes.call(matching_rules) - - regexes.each do |offset, regex| - buffer = (+"").encode(Encoding::BINARY) - - result = Marcel::Magic.send(:match_regex, StringIO.new(test_data), offset, regex, buffer) - - assert_equal false, result, "Test data unexpectedly matched a file format regexp (#{type}, #{regex.inspect})" - end - end - end - - test "nested match: parent AND child must both match" do - # Rule: offset 0 matches "AAA" AND offset 3 matches "BBB" - # This should match "AAABBB" but not "AAA" alone - test_rules = [ - [0, "AAA".b, [[3, "BBB".b]]] - ] - - buffer = (+"").encode(Encoding::BINARY) - - # Should match when both parent and child match - io1 = StringIO.new("AAABBB") - assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), - "Should match when parent and child both match" - - # Should NOT match when parent matches but child doesn't - io2 = StringIO.new("AAAXXX") - refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), - "Should not match when parent matches but child doesn't" - end - - test "sibling matches use OR logic" do - # Two sibling rules: either can match - # Rule 1: offset 0 matches "XXX" - # Rule 2: offset 0 matches "YYY" - test_rules = [ - [0, "XXX".b], - [0, "YYY".b] - ] - - buffer = (+"").encode(Encoding::BINARY) - - # Should match via first sibling - io1 = StringIO.new("XXX") - assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), - "Should match via first sibling rule" - - # Should match via second sibling - io2 = StringIO.new("YYY") - assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), - "Should match via second sibling rule" - - # Should NOT match when no sibling matches - io3 = StringIO.new("ZZZ") - refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer), - "Should not match when no sibling rule matches" - end - - test "parent with multiple child alternatives (OR)" do - # Test complex nested structure: parent AND (child1 OR child2) - # Parent at offset 0 matches "ROOT" - # Child option 1: offset 4 matches "OPT1" - # Child option 2: offset 4 matches "OPT2" - test_rules = [ - [0, "ROOT".b, [ - [4, "OPT1".b], # First child option - [4, "OPT2".b] # Second child option (sibling OR) - ]] - ] - - buffer = (+"").encode(Encoding::BINARY) - - # Should match when parent and first child match - io1 = StringIO.new("ROOTOPT1") - assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), - "Should match when parent and first child match" - - # Should match when parent and second child match - io2 = StringIO.new("ROOTOPT2") - assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), - "Should match when parent and second child match" - - # Should NOT match when parent matches but no child matches - io3 = StringIO.new("ROOTXXXX") - refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer), - "Should not match when parent matches but no child matches" - end - - test "complex nested structure with multiple levels" do - # Parent AND (Child AND Grandchild) - # offset 0: "AAA", offset 3: "BBB", offset 6: "CCC" - test_rules = [ - [0, "AAA".b, [ - [3, "BBB".b, [ - [6, "CCC".b] - ]] - ]] - ] - - buffer = (+"").encode(Encoding::BINARY) - - # Should match when all levels match - io1 = StringIO.new("AAABBBCCC") - assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), - "Should match when all nested levels match" - - # Should NOT match when grandchild doesn't match - io2 = StringIO.new("AAABBBXXX") - refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), - "Should not match when deepest child doesn't match" - end end diff --git a/test/tika_regex_test.rb b/test/tika_regex_test.rb deleted file mode 100644 index d8ab2a3..0000000 --- a/test/tika_regex_test.rb +++ /dev/null @@ -1,137 +0,0 @@ -require 'test_helper' -require 'nokogiri' - -class TikaRegexTest < Marcel::TestCase - test "converts simple pattern" do - pattern = '^BZh[1-9]' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert_equal(/^BZh[1-9]/, result) - end - - test "converts Java double-escaped hex sequences" do - # Java XML: \\x00 -> Ruby: \x00 (null byte) - pattern = '\\\\x00\\\\x41\\\\x42' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("\x00AB"), "Should match null byte followed by AB" - end - - test "converts Java double-escaped octal sequences" do - # Java XML: \\000 -> Ruby: \000 (null byte) - pattern = '\\\\000\\\\101\\\\102' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("\x00AB"), "Should match null byte followed by AB (octal)" - end - - test "converts Java double-escaped unicode sequences" do - # Java XML: \\u0041 -> Ruby: \u0041 (letter A) - pattern = '\\\\u0041\\\\u0042\\\\u0043' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("ABC"), "Should match ABC" - end - - test "converts Java double-escaped character classes" do - # \\d -> \d (digit) - pattern = 'JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("JAVA PROFILE 1.0.2"), "Should match version pattern" - refute result.match?("JAVA PROFILE X.Y.Z"), "Should not match non-digits" - end - - test "converts multiple escape types in one pattern" do - pattern = '\\\\d+\\\\x00\\\\s\\\\w+' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("123\x00 test"), "Should match digits, null, whitespace, word chars" - end - - test "removes multiple dotall flags" do - pattern = '(?s)first(?s)second' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert_equal 'firstsecond', result.source - assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE - end - - test "returns nil for incompatible pattern" do - # Variable-length lookbehind is not supported in Ruby - pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_nil result, "Incompatible pattern should return nil" - end - - test "returns nil for nil input" do - result = Marcel::TikaRegex.to_ruby_regexp(nil) - assert_nil result - end - - test "returns nil for empty string" do - result = Marcel::TikaRegex.to_ruby_regexp('') - assert_nil result - end - - test "handles character class overlaps silently" do - pattern = '[a-zA-Z][A-Za-z0-9_]' - - # Capture stderr to check for warnings - old_stderr = $stderr - $stderr = StringIO.new - - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - warnings = $stderr.string - $stderr = old_stderr - - assert_instance_of Regexp, result - assert_equal '', warnings, "Should not produce warnings" - end - - test "handles multiple flags" do - pattern = '(?i)(?s).*' - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - - assert_instance_of Regexp, result - assert result.match?("\n"), "Should be case-insensitive and multiline" - assert result.match?("\ntest\n"), "Should match content across lines" - end - - test "compiles all regex patterns from tika.xml" do - # MIME types with known incompatible patterns - # These patterns use Java-specific regex features not supported by Ruby - ignore_list = %w( application/x-dbf ) - - doc = Nokogiri::XML(File.new('data/tika.xml')) - patterns_by_type = {} - - # Extract all regex patterns from tika.xml - (doc/'mime-info/mime-type').each do |mime| - type = mime['type'] - - (mime/'magic/match[@type="regex"]').each do |match| - patterns_by_type[type] ||= [] - patterns_by_type[type] << match['value'] - end - end - - patterns_by_type.each do |mime_type, patterns| - patterns.each do |pattern| - next if ignore_list.include?(mime_type) - - result = Marcel::TikaRegex.to_ruby_regexp(pattern) - assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}" - end - end - end -end