From c70d3d3e64c36a0c1e042d5e7362bb0e26a4ce40 Mon Sep 17 00:00:00 2001 From: Alexander ADAM Date: Sat, 11 Oct 2025 01:54:43 +0200 Subject: [PATCH 1/2] add tika regex support This will work for simple things only because we're using a different regex engine. But out of all the current regular expressions, only the one for `application/x-dbf` fails. So I guess we're good. And we can get rid of that html definition now. --- data/custom.xml | 8 + lib/marcel.rb | 1 + lib/marcel/magic.rb | 36 ++++- lib/marcel/mime_type/definitions.rb | 1 - lib/marcel/tables.rb | 50 ++++--- lib/marcel/tika_regex.rb | 61 ++++++++ script/generate_tables.rb | 16 ++ .../magic/application/x-bzip2/bzip2.bz2 | Bin 0 -> 14 bytes test/magic_test.rb | 130 +++++++++++++++++ test/tika_regex_test.rb | 137 ++++++++++++++++++ 10 files changed, 413 insertions(+), 27 deletions(-) create mode 100644 lib/marcel/tika_regex.rb create mode 100644 test/fixtures/magic/application/x-bzip2/bzip2.bz2 create mode 100644 test/tika_regex_test.rb diff --git a/data/custom.xml b/data/custom.xml index 1c6b3eb..3786640 100644 --- a/data/custom.xml +++ b/data/custom.xml @@ -1,4 +1,12 @@ + + + + + + + + diff --git a/lib/marcel.rb b/lib/marcel.rb index 151ebab..dbecee5 100644 --- a/lib/marcel.rb +++ b/lib/marcel.rb @@ -2,6 +2,7 @@ module Marcel require "marcel/version" + require "marcel/tika_regex" require "marcel/magic" require "marcel/mime_type" end diff --git a/lib/marcel/magic.rb b/lib/marcel/magic.rb index 60e3387..453545d 100644 --- a/lib/marcel/magic.rb +++ b/lib/marcel/magic.rb @@ -126,9 +126,14 @@ def self.magic_match(io, method) def self.magic_match_io(io, matches, buffer) matches.any? do |offset, value, children| + # Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all) + next false if value.nil? + match = if value - if Range === offset + if value.is_a?(Regexp) + match_regex(io, offset, value, buffer) + elsif Range === offset io.read(offset.begin, buffer) x = io.read(offset.end - offset.begin + value.bytesize, buffer) x && x.include?(value) @@ -143,6 +148,33 @@ def self.magic_match_io(io, matches, buffer) end end - private_class_method :magic_match, :magic_match_io + def self.match_regex(io, offset, regexp, buffer) + start = offset.is_a?(Range) ? offset.begin : offset + io.read(start, buffer) if start > 0 + data = io.read(256, buffer) + return false unless data + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this match call. + # I'm open to better ideas. + begin + old_verbose = $VERBOSE + $VERBOSE = nil + + # For regex patterns, simply match within the data buffer + # The patterns themselves should be designed to match appropriately + data.match?(regexp) + ensure + $VERBOSE = old_verbose + end + + # we need to catch all exceptions here because TruffleRuby raises Polyglot::ForeignException + rescue Exception => e + warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}" + false + end + + private_class_method :magic_match, :magic_match_io, :match_regex end end diff --git a/lib/marcel/mime_type/definitions.rb b/lib/marcel/mime_type/definitions.rb index 09ce72b..65db8bb 100644 --- a/lib/marcel/mime_type/definitions.rb +++ b/lib/marcel/mime_type/definitions.rb @@ -1,7 +1,6 @@ # frozen_string_literal: true Marcel::MimeType.extend "text/plain", extensions: %w( txt asc ) -Marcel::MimeType.extend "text/html", magic: [[0..64, " %w(video/ogg), } b = Hash.new { |h, k| h[k] = k.b.freeze } + r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) } # @private # :nodoc: MAGIC = [ @@ -2518,7 +2519,7 @@ module Marcel ['image/bmp', [[0, b['BM'], [[26, b["\001\000"], [[28, b["\000\000"]], [28, b["\001\000"]], [28, b["\004\000"]], [28, b["\b\000"]], [28, b["\020\000"]], [28, b["\030\000"]], [28, b[" \000"]]]]]]]], ['image/vnd.adobe.photoshop', [[0, b["8BPS\000\001"]], [0, b["8BPS\000\002"]]]], ['image/webp', [[0, b['RIFF'], [[8, b['WEBP']]]]]], - ['text/html', [[0, b['(?i)<(html|head|body|title|div)[ >]']], [0, b['(?i)]']]]], + ['text/html', [[0, r['(?i)<(html|head|body|title|div)[ >]']], [0, r['(?i)]']]]], ['image/svg+xml', [[0..4096, b[' \xHH (hex byte) + # - \\uHHHH -> \uHHHH (unicode) + # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby) + # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes) + # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters) + # + # We process these specifically to avoid breaking the regex structure + processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH + .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH + .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH (octal to hex so that TruffleRuby doesn't think it's a backreference) + .gsub(/\\\\([WDS])/i, '\\\\\1') # \\d etc. -> \d + .gsub(/\\\\([farbentv])/, '\\\\\1') # \\n etc. -> \n + .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[ + + # Force binary encoding to handle binary escape sequences like \xff + processed = processed.force_encoding(Encoding::BINARY) + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this Regexp compilation. + # I'm open to better ideas. + old_verbose = $VERBOSE + $VERBOSE = nil + + Regexp.new(processed, flags).freeze + rescue RegexpError + nil + ensure + $VERBOSE = old_verbose + end + end +end diff --git a/script/generate_tables.rb b/script/generate_tables.rb index 5ff10e3..ef43a3e 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -4,6 +4,7 @@ # Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic. require 'nokogiri' +require_relative '../lib/marcel/tika_regex' class String alias inspect_old inspect @@ -27,6 +28,16 @@ def inspect end end +class RegexString + def initialize(pattern) + @pattern = pattern + end + + def inspect + "r[#{@pattern.inspect}]" + end +end + def str2int(s) return s.to_i(16) if s[0..1].downcase == '0x' return s.to_i(8) if s[0..0].downcase == '0' @@ -39,6 +50,8 @@ def binary_strings(object) object.map { |o| binary_strings(o) } when String BinaryString.new(object) + when RegexString + object when Numeric, Range, nil object else @@ -65,6 +78,8 @@ def get_matches(mime, parent) offset = offset.size == 2 ? offset[0]..offset[1] : offset[0] case type + when 'regex' + value = RegexString.new(value) when 'string', 'stringignorecase' value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') } value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") } @@ -231,6 +246,7 @@ def get_matches(mime, parent) end puts " }" puts " b = Hash.new { |h, k| h[k] = k.b.freeze }" +puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }" puts " # @private" puts " # :nodoc:" puts " MAGIC = [" diff --git a/test/fixtures/magic/application/x-bzip2/bzip2.bz2 b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..b56f3b974d6a345462b5a64b15a84c9b23bb40ec GIT binary patch literal 14 TcmZ>Y%CHnKa Ruby: \x00 (null byte) + pattern = '\\\\x00\\\\x41\\\\x42' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB" + end + + test "converts Java double-escaped octal sequences" do + # Java XML: \\000 -> Ruby: \000 (null byte) + pattern = '\\\\000\\\\101\\\\102' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB (octal)" + end + + test "converts Java double-escaped unicode sequences" do + # Java XML: \\u0041 -> Ruby: \u0041 (letter A) + pattern = '\\\\u0041\\\\u0042\\\\u0043' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("ABC"), "Should match ABC" + end + + test "converts Java double-escaped character classes" do + # \\d -> \d (digit) + pattern = 'JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("JAVA PROFILE 1.0.2"), "Should match version pattern" + refute result.match?("JAVA PROFILE X.Y.Z"), "Should not match non-digits" + end + + test "converts multiple escape types in one pattern" do + pattern = '\\\\d+\\\\x00\\\\s\\\\w+' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("123\x00 test"), "Should match digits, null, whitespace, word chars" + end + + test "removes multiple dotall flags" do + pattern = '(?s)first(?s)second' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert_equal 'firstsecond', result.source + assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE + end + + test "returns nil for incompatible pattern" do + # Variable-length lookbehind is not supported in Ruby + pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_nil result, "Incompatible pattern should return nil" + end + + test "returns nil for nil input" do + result = Marcel::TikaRegex.to_ruby_regexp(nil) + assert_nil result + end + + test "returns nil for empty string" do + result = Marcel::TikaRegex.to_ruby_regexp('') + assert_nil result + end + + test "handles character class overlaps silently" do + pattern = '[a-zA-Z][A-Za-z0-9_]' + + # Capture stderr to check for warnings + old_stderr = $stderr + $stderr = StringIO.new + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + warnings = $stderr.string + $stderr = old_stderr + + assert_instance_of Regexp, result + assert_equal '', warnings, "Should not produce warnings" + end + + test "handles multiple flags" do + pattern = '(?i)(?s).*' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\n"), "Should be case-insensitive and multiline" + assert result.match?("\ntest\n"), "Should match content across lines" + end + + test "compiles all regex patterns from tika.xml" do + # MIME types with known incompatible patterns + # These patterns use Java-specific regex features not supported by Ruby + ignore_list = %w( application/x-dbf ) + + doc = Nokogiri::XML(File.new('data/tika.xml')) + patterns_by_type = {} + + # Extract all regex patterns from tika.xml + (doc/'mime-info/mime-type').each do |mime| + type = mime['type'] + + (mime/'magic/match[@type="regex"]').each do |match| + patterns_by_type[type] ||= [] + patterns_by_type[type] << match['value'] + end + end + + patterns_by_type.each do |mime_type, patterns| + patterns.each do |pattern| + next if ignore_list.include?(mime_type) + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}" + end + end + end +end From e40f7a4dc57ec90accb863ef2f003fd9ebaae741 Mon Sep 17 00:00:00 2001 From: Alexander ADAM Date: Sat, 11 Oct 2025 15:26:50 +0200 Subject: [PATCH 2/2] remove trailing mime type spaces & hprof fixture fixes #112 --- lib/marcel/tables.rb | 2 +- script/generate_tables.rb | 2 +- .../magic/application/vnd.java.hprof/minimal.hprof | Bin 0 -> 19 bytes 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 test/fixtures/magic/application/vnd.java.hprof/minimal.hprof diff --git a/lib/marcel/tables.rb b/lib/marcel/tables.rb index 2ff46df..927213c 100644 --- a/lib/marcel/tables.rb +++ b/lib/marcel/tables.rb @@ -2677,7 +2677,7 @@ module Marcel ['application/vnd.fdf', [[0, b['%FDF-']]]], ['application/vnd.iccprofile', [[36, b['acsp']]]], ['application/vnd.isac.fcs', [[0, r["FCS[1-3]\\\\.[0-9] "]]]], - ['application/vnd.java.hprof ', [[0, r["JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d\\\\u0000"]]]], + ['application/vnd.java.hprof', [[0, r["JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d\\\\u0000"]]]], ['application/vnd.java.hprof.text', [[0, r["JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d,"]]]], ['application/vnd.lotus-1-2-3;version=1', [[0, b["\000\000\002\000\004\004"]]]], ['application/vnd.lotus-1-2-3;version=2', [[0, b["\000\000\002\000\006\004\006\000\b\000"]]]], diff --git a/script/generate_tables.rb b/script/generate_tables.rb index ef43a3e..8822559 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -251,7 +251,7 @@ def get_matches(mime, parent) puts " # :nodoc:" puts " MAGIC = [" magics.each do |priority, type, matches| - puts " ['#{type}', #{binary_strings(matches).inspect}]," + puts " ['#{type.strip}', #{binary_strings(matches).inspect}]," end puts " ]" puts "end" diff --git a/test/fixtures/magic/application/vnd.java.hprof/minimal.hprof b/test/fixtures/magic/application/vnd.java.hprof/minimal.hprof new file mode 100644 index 0000000000000000000000000000000000000000..25c87fe82eddc4de38522edbe9fc42c1cb565240 GIT binary patch literal 19 acmeZr40BWn2=aIH^l?=%)HBdCWB>p$PXuBB literal 0 HcmV?d00001