diff --git a/lib/marcel.rb b/lib/marcel.rb index 151ebab..dbecee5 100644 --- a/lib/marcel.rb +++ b/lib/marcel.rb @@ -2,6 +2,7 @@ module Marcel require "marcel/version" + require "marcel/tika_regex" require "marcel/magic" require "marcel/mime_type" end diff --git a/lib/marcel/magic.rb b/lib/marcel/magic.rb index 60e3387..56bfa86 100644 --- a/lib/marcel/magic.rb +++ b/lib/marcel/magic.rb @@ -128,7 +128,9 @@ def self.magic_match_io(io, matches, buffer) matches.any? do |offset, value, children| match = if value - if Range === offset + if value.is_a?(Regexp) + match_regex(io, offset, value, buffer) + elsif Range === offset io.read(offset.begin, buffer) x = io.read(offset.end - offset.begin + value.bytesize, buffer) x && x.include?(value) @@ -143,6 +145,15 @@ def self.magic_match_io(io, matches, buffer) end end - private_class_method :magic_match, :magic_match_io + def self.match_regex(io, offset, regexp, buffer) + start = offset.is_a?(Range) ? offset.begin : offset + io.read(start, buffer) if start > 0 + data = io.read(256, buffer) + return false unless data + + data.match?(regexp) + end + + private_class_method :magic_match, :magic_match_io, :match_regex end end diff --git a/lib/marcel/mime_type/definitions.rb b/lib/marcel/mime_type/definitions.rb index 09ce72b..65db8bb 100644 --- a/lib/marcel/mime_type/definitions.rb +++ b/lib/marcel/mime_type/definitions.rb @@ -1,7 +1,6 @@ # frozen_string_literal: true Marcel::MimeType.extend "text/plain", extensions: %w( txt asc ) -Marcel::MimeType.extend "text/html", magic: [[0..64, " %w(video/ogg), } b = Hash.new { |h, k| h[k] = k.b.freeze } + r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) } # @private # :nodoc: MAGIC = [ @@ -2518,10 +2519,9 @@ module Marcel ['image/bmp', [[0, b['BM'], [[26, b["\001\000"], [[28, b["\000\000"]], [28, b["\001\000"]], [28, b["\004\000"]], [28, b["\b\000"]], [28, b["\020\000"]], [28, b["\030\000"]], [28, b[" \000"]]]]]]]], ['image/vnd.adobe.photoshop', [[0, b["8BPS\000\001"]], [0, b["8BPS\000\002"]]]], ['image/webp', [[0, b['RIFF'], [[8, b['WEBP']]]]]], - ['text/html', [[0, b['(?i)<(html|head|body|title|div)[ >]']], [0, b['(?i)]']]]], + ['text/html', [[0, r['(?i)<(html|head|body|title|div)[ >]']], [0, r['(?i)]']]]], ['image/svg+xml', [[0..4096, b['']]]], ['application/x-tar', [[257, b["ustar\000"]]]], ['application/x-tika-msoffice', [[0..8, b["\320\317\021\340\241\261\032\341"]]]], - ['application/x-x509-key;format=der', []], ['application/xhtml+xml', [[0..8192, b[' \xHH (hex byte) + # - \\uHHHH -> \uHHHH (unicode) + # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby) + # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes) + # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters) + # + # We process these specifically to avoid breaking the regex structure + processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH + .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH + .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH (octal to hex so that TruffleRuby doesn't think it's a backreference) + .gsub(/\\\\([WDS])/i, '\\\\\1') # \\d etc. -> \d + .gsub(/\\\\([farbentv])/, '\\\\\1') # \\n etc. -> \n + .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[ + + # Force binary encoding to handle binary escape sequences like \xff + processed = processed.force_encoding(Encoding::BINARY) + + Regexp.new(processed, flags).freeze + end + end +end diff --git a/script/generate_tables.rb b/script/generate_tables.rb index 5ff10e3..629d520 100755 --- a/script/generate_tables.rb +++ b/script/generate_tables.rb @@ -4,6 +4,7 @@ # Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic. require 'nokogiri' +require_relative '../lib/marcel/tika_regex' class String alias inspect_old inspect @@ -27,6 +28,16 @@ def inspect end end +class RegexString + def initialize(pattern) + @pattern = pattern + end + + def inspect + "r[#{@pattern.inspect}]" + end +end + def str2int(s) return s.to_i(16) if s[0..1].downcase == '0x' return s.to_i(8) if s[0..0].downcase == '0' @@ -39,6 +50,8 @@ def binary_strings(object) object.map { |o| binary_strings(o) } when String BinaryString.new(object) + when RegexString + object when Numeric, Range, nil object else @@ -47,6 +60,8 @@ def binary_strings(object) end def get_matches(mime, parent) + well_known_regex_types = %w( application/x-bzip2 text/html ) + parent.elements.map {|match| children = get_matches(mime, match) @@ -65,6 +80,10 @@ def get_matches(mime, parent) offset = offset.size == 2 ? offset[0]..offset[1] : offset[0] case type + when 'regex' + next nil unless well_known_regex_types.include?(mime['type']) + + value = RegexString.new(value) when 'string', 'stringignorecase' value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') } value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") } @@ -115,6 +134,7 @@ def get_matches(mime, parent) nil else warn "#{mime['type']}: unsupported #{type} match: #{match.to_s}" + next nil end children.empty? ? [offset, value] : [offset, value, children] @@ -231,10 +251,13 @@ def get_matches(mime, parent) end puts " }" puts " b = Hash.new { |h, k| h[k] = k.b.freeze }" +puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }" puts " # @private" puts " # :nodoc:" puts " MAGIC = [" magics.each do |priority, type, matches| + next if matches.nil? || matches.empty? + puts " ['#{type}', #{binary_strings(matches).inspect}]," end puts " ]" diff --git a/test/fixtures/magic/application/x-bzip2/bzip2.bz2 b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 new file mode 100644 index 0000000..b56f3b9 Binary files /dev/null and b/test/fixtures/magic/application/x-bzip2/bzip2.bz2 differ