Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/marcel.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Top-level namespace for the library; requiring this file loads each component below.
module Marcel
require "marcel/version"
# NOTE(review): required ahead of marcel/magic — presumably so Marcel::TikaRegex is
# already defined when regex-based magic entries are loaded; confirm before reordering.
require "marcel/tika_regex"
require "marcel/magic"
require "marcel/mime_type"
end
15 changes: 13 additions & 2 deletions lib/marcel/magic.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ def self.magic_match_io(io, matches, buffer)
matches.any? do |offset, value, children|
match =
if value
if Range === offset
if value.is_a?(Regexp)
match_regex(io, offset, value, buffer)
elsif Range === offset
io.read(offset.begin, buffer)
x = io.read(offset.end - offset.begin + value.bytesize, buffer)
x && x.include?(value)
Expand All @@ -143,6 +145,15 @@ def self.magic_match_io(io, matches, buffer)
end
end

private_class_method :magic_match, :magic_match_io
# Tests a regular-expression magic entry against the contents of +io+.
#
# The leading bytes up to the entry's offset are consumed and discarded, then a
# window of at most 256 bytes is read and checked against +regexp+.
#
# @param io [#read] stream positioned where offsets are counted from
# @param offset [Integer, Range] start offset; for a Range only its beginning is used
# @param regexp [Regexp] pattern to look for inside the window
# @param buffer [String] reusable output buffer handed to +io.read+
# @return [Boolean] true when the window matches; false on no match or when
#   nothing could be read
def self.match_regex(io, offset, regexp, buffer)
  skip = Range === offset ? offset.begin : offset
  # Advance the stream to the start offset by reading (and ignoring) the prefix.
  io.read(skip, buffer) if skip > 0

  window = io.read(256, buffer)
  window ? window.match?(regexp) : false
end

private_class_method :magic_match, :magic_match_io, :match_regex
end
end
1 change: 0 additions & 1 deletion lib/marcel/mime_type/definitions.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# frozen_string_literal: true

Marcel::MimeType.extend "text/plain", extensions: %w( txt asc )
Marcel::MimeType.extend "text/html", magic: [[0..64, "<!DOCTYPE HTML"], [0..64, "<!DOCTYPE html"], [0..64, "<!doctype HTML"], [0..64, "<!doctype html"]]

Marcel::MimeType.extend "application/illustrator", parents: "application/pdf"
Marcel::MimeType.extend "image/vnd.adobe.photoshop", magic: [[0, "8BPS"]], extensions: %w( psd psb )
Expand Down
43 changes: 13 additions & 30 deletions lib/marcel/tables.rb

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions lib/marcel/tika_regex.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true

module Marcel
  # Helpers for turning Apache Tika (Java-syntax) regex patterns into Ruby Regexps.
  module TikaRegex
    # Apache Tika uses Java regex syntax, which has some differences from Ruby:
    # - The (?s) flag in Java is a mode which makes . match newlines.
    #   In Ruby, this is equivalent to the multiline flag.
    # - Java uses double-escaped sequences like \\d, \\x00, \\u0041 in XML.
    #   These need to be converted to Ruby's single-escaped format: \d, \x00, \u0041.
    # - Some Java regex features are not supported in Ruby (e.g., variable-length
    #   lookbehinds).
    #
    # This method handles the conversion and returns nil, rather than raising,
    # when the converted pattern cannot be compiled by Ruby.
    #
    # @param pattern [String, nil] The Tika regex pattern string
    # @return [Regexp, nil] The frozen, compiled Ruby Regexp, or nil if the pattern
    #   is nil, empty, or incompatible with Ruby's regex engine
    def self.to_ruby_regexp(pattern)
      return nil if pattern.nil? || pattern.empty?

      processed = pattern.dup
      flags = 0

      # Convert Java's (?s) dotall flag to Ruby's multiline flag
      if processed.include?('(?s)')
        processed = processed.gsub('(?s)', '')
        flags |= Regexp::MULTILINE
      end

      # Convert Java-style double-escaped sequences to Ruby single-escaped format.
      # This is more complex than a simple gsub because we need to handle:
      # - \\xHH -> \xHH (hex byte)
      # - \\uHHHH -> \uHHHH (unicode)
      # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby)
      # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes)
      # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters)
      #
      # We process these specifically to avoid breaking the regex structure.
      processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1') # \\xHH -> \xHH
                           .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1') # \\uHHHH -> \uHHHH
                           .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH
                           .gsub(/\\\\([WDS])/i, '\\\\\1') # \\d etc. -> \d
                           .gsub(/\\\\([farbentv])/, '\\\\\1') # \\n etc. -> \n
                           .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[

      # Force binary encoding to handle binary escape sequences like \xff
      processed = processed.force_encoding(Encoding::BINARY)

      Regexp.new(processed, flags).freeze
    rescue RegexpError
      # The pattern uses syntax Ruby's engine rejects; honour the documented
      # contract by signalling "incompatible" with nil instead of raising.
      nil
    end
  end
end
23 changes: 23 additions & 0 deletions script/generate_tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic.

require 'nokogiri'
require_relative '../lib/marcel/tika_regex'

class String
alias inspect_old inspect
Expand All @@ -27,6 +28,16 @@ def inspect
end
end

# Marker wrapper around a regex pattern taken from the Tika definitions.
#
# Its #inspect renders as `r[<pattern>]`, which is the form written into the
# generated tables, where `r` is the lookup hash emitted by this script.
class RegexString
  # @param pattern [String] the raw regex pattern text
  def initialize(pattern)
    @pattern = pattern
  end

  # @return [String] the pattern's own inspect output wrapped in `r[...]`
  def inspect
    format("r[%s]", @pattern.inspect)
  end
end

def str2int(s)
return s.to_i(16) if s[0..1].downcase == '0x'
return s.to_i(8) if s[0..0].downcase == '0'
Expand All @@ -39,6 +50,8 @@ def binary_strings(object)
object.map { |o| binary_strings(o) }
when String
BinaryString.new(object)
when RegexString
object
when Numeric, Range, nil
object
else
Expand All @@ -47,6 +60,8 @@ def binary_strings(object)
end

def get_matches(mime, parent)
well_known_regex_types = %w( application/x-bzip2 text/html )

parent.elements.map {|match|
children = get_matches(mime, match)

Expand All @@ -65,6 +80,10 @@ def get_matches(mime, parent)

offset = offset.size == 2 ? offset[0]..offset[1] : offset[0]
case type
when 'regex'
next nil unless well_known_regex_types.include?(mime['type'])

value = RegexString.new(value)
when 'string', 'stringignorecase'
value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') }
value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") }
Expand Down Expand Up @@ -115,6 +134,7 @@ def get_matches(mime, parent)
nil
else
warn "#{mime['type']}: unsupported #{type} match: #{match.to_s}"
next nil
end

children.empty? ? [offset, value] : [offset, value, children]
Expand Down Expand Up @@ -231,10 +251,13 @@ def get_matches(mime, parent)
end
puts " }"
puts " b = Hash.new { |h, k| h[k] = k.b.freeze }"
puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }"
puts " # @private"
puts " # :nodoc:"
puts " MAGIC = ["
# Emit one MAGIC row per type: the type name followed by its match list,
# with string values wrapped by binary_strings before being inspected.
magics.each do |priority, type, matches|
# Skip types that have no match list at all or an empty one, so no
# empty rows are printed into the MAGIC array.
next if matches.nil? || matches.empty?

puts " ['#{type}', #{binary_strings(matches).inspect}],"
end
puts " ]"
Expand Down
Binary file added test/fixtures/magic/application/x-bzip2/bzip2.bz2
Binary file not shown.