diff --git a/data/custom.xml b/data/custom.xml index 1c6b3eb..3786640 100644 --- a/data/custom.xml +++ b/data/custom.xml @@ -1,4 +1,12 @@ + + + + + + + + diff --git a/lib/marcel.rb b/lib/marcel.rb index 151ebab..dbecee5 100644 --- a/lib/marcel.rb +++ b/lib/marcel.rb @@ -2,6 +2,7 @@ module Marcel require "marcel/version" + require "marcel/tika_regex" require "marcel/magic" require "marcel/mime_type" end diff --git a/lib/marcel/magic.rb b/lib/marcel/magic.rb index 60e3387..453545d 100644 --- a/lib/marcel/magic.rb +++ b/lib/marcel/magic.rb @@ -126,9 +126,14 @@ def self.magic_match(io, method) def self.magic_match_io(io, matches, buffer) matches.any? do |offset, value, children| + # Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all) + next false if value.nil? + match = if value - if Range === offset + if value.is_a?(Regexp) + match_regex(io, offset, value, buffer) + elsif Range === offset io.read(offset.begin, buffer) x = io.read(offset.end - offset.begin + value.bytesize, buffer) x && x.include?(value) @@ -143,6 +148,33 @@ def self.magic_match_io(io, matches, buffer) end end - private_class_method :magic_match, :magic_match_io + def self.match_regex(io, offset, regexp, buffer) + start = offset.is_a?(Range) ? offset.begin : offset + io.read(start, buffer) if start > 0 + data = io.read(256, buffer) + return false unless data + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this match call. + # I'm open to better ideas. + begin + old_verbose = $VERBOSE + $VERBOSE = nil + + # For regex patterns, simply match within the data buffer + # The patterns themselves should be designed to match appropriately + data.match?(regexp) + ensure + $VERBOSE = old_verbose + end + + # we need to catch all exceptions here because TruffleRuby raises Polyglot::ForeignException + rescue Exception => e + warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}" + false + end + + private_class_method :magic_match, :magic_match_io, :match_regex end end diff --git a/lib/marcel/mime_type/definitions.rb b/lib/marcel/mime_type/definitions.rb index 09ce72b..65db8bb 100644 --- a/lib/marcel/mime_type/definitions.rb +++ b/lib/marcel/mime_type/definitions.rb @@ -1,7 +1,6 @@ # frozen_string_literal: true Marcel::MimeType.extend "text/plain", extensions: %w( txt asc ) -Marcel::MimeType.extend "text/html", magic: [[0..64, " %w(video/ogg), } b = Hash.new { |h, k| h[k] = k.b.freeze } + r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) } # @private # :nodoc: MAGIC = [ @@ -2518,7 +2519,7 @@ module Marcel ['image/bmp', [[0, b['BM'], [[26, b["\001\000"], [[28, b["\000\000"]], [28, b["\001\000"]], [28, b["\004\000"]], [28, b["\b\000"]], [28, b["\020\000"]], [28, b["\030\000"]], [28, b[" \000"]]]]]]]], ['image/vnd.adobe.photoshop', [[0, b["8BPS\000\001"]], [0, b["8BPS\000\002"]]]], ['image/webp', [[0, b['RIFF'], [[8, b['WEBP']]]]]], - ['text/html', [[0, b['(?i)<(html|head|body|title|div)[ >]']], [0, b['(?i)]']]]], + ['text/html', [[0, r['(?i)<(html|head|body|title|div)[ >]']], [0, r['(?i)]']]]], ['image/svg+xml', [[0..4096, b[' \xHH (hex byte) + # - \\uHHHH -> \uHHHH (unicode) + # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby) + # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes) + # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters) + # + # We process these specifically to avoid breaking the regex structure + processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH + .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH + .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH (octal to hex so that TruffleRuby doesn't think it's a backreference) + .gsub(/\\\\([WDS])/i, '\\\\\1') # \\d etc. -> \d + .gsub(/\\\\([farbentv])/, '\\\\\1') # \\n etc. -> \n + .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[ + + # Force binary encoding to handle binary escape sequences like \xff + processed = processed.force_encoding(Encoding::BINARY) + + # I know, I know... this is awful, but the patterns come from Apache Tika + # and we are getting warnings about character class overlaps, so we'll + # suppress warnings for this Regexp compilation. + # I'm open to better ideas. + old_verbose = $VERBOSE + $VERBOSE = nil + + Regexp.new(processed, flags).freeze + rescue RegexpError + nil + ensure + $VERBOSE = old_verbose + end + end +end diff --git a/script/generate_tables.rb b/script/generate_tables.rb index 5ff10e3..8822559 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -4,6 +4,7 @@ # Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic. require 'nokogiri' +require_relative '../lib/marcel/tika_regex' class String alias inspect_old inspect @@ -27,6 +28,16 @@ def inspect end end +class RegexString + def initialize(pattern) + @pattern = pattern + end + + def inspect + "r[#{@pattern.inspect}]" + end +end + def str2int(s) return s.to_i(16) if s[0..1].downcase == '0x' return s.to_i(8) if s[0..0].downcase == '0' @@ -39,6 +50,8 @@ def binary_strings(object) object.map { |o| binary_strings(o) } when String BinaryString.new(object) + when RegexString + object when Numeric, Range, nil object else @@ -65,6 +78,8 @@ def get_matches(mime, parent) offset = offset.size == 2 ? offset[0]..offset[1] : offset[0] case type + when 'regex' + value = RegexString.new(value) when 'string', 'stringignorecase' value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') } value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") } @@ -231,11 +246,12 @@ def get_matches(mime, parent) end puts " }" puts " b = Hash.new { |h, k| h[k] = k.b.freeze }" +puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }" puts " # @private" puts " # :nodoc:" puts " MAGIC = [" magics.each do |priority, type, matches| - puts " ['#{type}', #{binary_strings(matches).inspect}]," + puts " ['#{type.strip}', #{binary_strings(matches).inspect}]," end puts " ]" puts "end" diff --git a/test/fixtures/magic/application/vnd.java.hprof/minimal.hprof b/test/fixtures/magic/application/vnd.java.hprof/minimal.hprof new file mode 100644 index 0000000..25c87fe Binary files /dev/null and b/test/fixtures/magic/application/vnd.java.hprof/minimal.hprof differ diff --git a/test/fixtures/magic/application/x-bzip2/bzip2.bz2 b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 new file mode 100644 index 0000000..b56f3b9 Binary files /dev/null and b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 differ diff --git a/test/magic_test.rb b/test/magic_test.rb index 14d11e2..1ef5f73 100644 --- a/test/magic_test.rb +++ b/test/magic_test.rb @@ -25,4 +25,134 @@ class Marcel::MimeType::MagicTest < Marcel::TestCase assert Marcel::Magic.child?('text/csv', 'text/plain') refute Marcel::Magic.child?('text/plain', 'text/csv') end + + test "none of the regex patterns should match random test data" do + ignore_list = %w( application/x-dbf ) + + extract_regexes = lambda do |matching_rules, collected = []| + matching_rules.each do |offset, value, children| + collected << [offset, value] if value.is_a?(Regexp) + extract_regexes.call(children, collected) if children + end + collected + end + + # Use a test string that's very unlikely to match any file format regex + # Using only high Unicode characters and very specific patterns + test_data = "🇨🇭 \xFF\xFE\x03\x05\x06🧀 cheese\x06\x07\x03" + + Marcel::MAGIC.each do |type, matching_rules| + next if ignore_list.include?(type) + regexes = extract_regexes.call(matching_rules) + + regexes.each do |offset, regex| + buffer = (+"").encode(Encoding::BINARY) + + result = Marcel::Magic.send(:match_regex, StringIO.new(test_data), offset, regex, buffer) + + assert_equal false, result, "Test data unexpectedly matched a file format regexp (#{type}, #{regex.inspect})" + end + end + end + + test "nested match: parent AND child must both match" do + # Rule: offset 0 matches "AAA" AND offset 3 matches "BBB" + # This should match "AAABBB" but not "AAA" alone + test_rules = [ + [0, "AAA".b, [[3, "BBB".b]]] + ] + + buffer = (+"").encode(Encoding::BINARY) + + # Should match when both parent and child match + io1 = StringIO.new("AAABBB") + assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), + "Should match when parent and child both match" + + # Should NOT match when parent matches but child doesn't + io2 = StringIO.new("AAAXXX") + refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), + "Should not match when parent matches but child doesn't" + end + + test "sibling matches use OR logic" do + # Two sibling rules: either can match + # Rule 1: offset 0 matches "XXX" + # Rule 2: offset 0 matches "YYY" + test_rules = [ + [0, "XXX".b], + [0, "YYY".b] + ] + + buffer = (+"").encode(Encoding::BINARY) + + # Should match via first sibling + io1 = StringIO.new("XXX") + assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), + "Should match via first sibling rule" + + # Should match via second sibling + io2 = StringIO.new("YYY") + assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), + "Should match via second sibling rule" + + # Should NOT match when no sibling matches + io3 = StringIO.new("ZZZ") + refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer), + "Should not match when no sibling rule matches" + end + + test "parent with multiple child alternatives (OR)" do + # Test complex nested structure: parent AND (child1 OR child2) + # Parent at offset 0 matches "ROOT" + # Child option 1: offset 4 matches "OPT1" + # Child option 2: offset 4 matches "OPT2" + test_rules = [ + [0, "ROOT".b, [ + [4, "OPT1".b], # First child option + [4, "OPT2".b] # Second child option (sibling OR) + ]] + ] + + buffer = (+"").encode(Encoding::BINARY) + + # Should match when parent and first child match + io1 = StringIO.new("ROOTOPT1") + assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), + "Should match when parent and first child match" + + # Should match when parent and second child match + io2 = StringIO.new("ROOTOPT2") + assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), + "Should match when parent and second child match" + + # Should NOT match when parent matches but no child matches + io3 = StringIO.new("ROOTXXXX") + refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer), + "Should not match when parent matches but no child matches" + end + + test "complex nested structure with multiple levels" do + # Parent AND (Child AND Grandchild) + # offset 0: "AAA", offset 3: "BBB", offset 6: "CCC" + test_rules = [ + [0, "AAA".b, [ + [3, "BBB".b, [ + [6, "CCC".b] + ]] + ]] + ] + + buffer = (+"").encode(Encoding::BINARY) + + # Should match when all levels match + io1 = StringIO.new("AAABBBCCC") + assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer), + "Should match when all nested levels match" + + # Should NOT match when grandchild doesn't match + io2 = StringIO.new("AAABBBXXX") + refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer), + "Should not match when deepest child doesn't match" + end end diff --git a/test/tika_regex_test.rb b/test/tika_regex_test.rb new file mode 100644 index 0000000..d8ab2a3 --- /dev/null +++ b/test/tika_regex_test.rb @@ -0,0 +1,137 @@ +require 'test_helper' +require 'nokogiri' + +class TikaRegexTest < Marcel::TestCase + test "converts simple pattern" do + pattern = '^BZh[1-9]' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert_equal(/^BZh[1-9]/, result) + end + + test "converts Java double-escaped hex sequences" do + # Java XML: \\x00 -> Ruby: \x00 (null byte) + pattern = '\\\\x00\\\\x41\\\\x42' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB" + end + + test "converts Java double-escaped octal sequences" do + # Java XML: \\000 -> Ruby: \000 (null byte) + pattern = '\\\\000\\\\101\\\\102' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\x00AB"), "Should match null byte followed by AB (octal)" + end + + test "converts Java double-escaped unicode sequences" do + # Java XML: \\u0041 -> Ruby: \u0041 (letter A) + pattern = '\\\\u0041\\\\u0042\\\\u0043' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("ABC"), "Should match ABC" + end + + test "converts Java double-escaped character classes" do + # \\d -> \d (digit) + pattern = 'JAVA PROFILE \\\\d\\\\.\\\\d\\\\.\\\\d' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("JAVA PROFILE 1.0.2"), "Should match version pattern" + refute result.match?("JAVA PROFILE X.Y.Z"), "Should not match non-digits" + end + + test "converts multiple escape types in one pattern" do + pattern = '\\\\d+\\\\x00\\\\s\\\\w+' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("123\x00 test"), "Should match digits, null, whitespace, word chars" + end + + test "removes multiple dotall flags" do + pattern = '(?s)first(?s)second' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert_equal 'firstsecond', result.source + assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE + end + + test "returns nil for incompatible pattern" do + # Variable-length lookbehind is not supported in Ruby + pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_nil result, "Incompatible pattern should return nil" + end + + test "returns nil for nil input" do + result = Marcel::TikaRegex.to_ruby_regexp(nil) + assert_nil result + end + + test "returns nil for empty string" do + result = Marcel::TikaRegex.to_ruby_regexp('') + assert_nil result + end + + test "handles character class overlaps silently" do + pattern = '[a-zA-Z][A-Za-z0-9_]' + + # Capture stderr to check for warnings + old_stderr = $stderr + $stderr = StringIO.new + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + warnings = $stderr.string + $stderr = old_stderr + + assert_instance_of Regexp, result + assert_equal '', warnings, "Should not produce warnings" + end + + test "handles multiple flags" do + pattern = '(?i)(?s).*' + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + + assert_instance_of Regexp, result + assert result.match?("\n"), "Should be case-insensitive and multiline" + assert result.match?("\ntest\n"), "Should match content across lines" + end + + test "compiles all regex patterns from tika.xml" do + # MIME types with known incompatible patterns + # These patterns use Java-specific regex features not supported by Ruby + ignore_list = %w( application/x-dbf ) + + doc = Nokogiri::XML(File.new('data/tika.xml')) + patterns_by_type = {} + + # Extract all regex patterns from tika.xml + (doc/'mime-info/mime-type').each do |mime| + type = mime['type'] + + (mime/'magic/match[@type="regex"]').each do |match| + patterns_by_type[type] ||= [] + patterns_by_type[type] << match['value'] + end + end + + patterns_by_type.each do |mime_type, patterns| + patterns.each do |pattern| + next if ignore_list.include?(mime_type) + + result = Marcel::TikaRegex.to_ruby_regexp(pattern) + assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}" + end + end + end +end