Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor: Use manual disambiguators for shared filenames #763

Merged
merged 8 commits into from
Sep 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 9 additions & 7 deletions lib/rouge.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ def highlight(text, lexer, formatter, &b)
load load_dir.join('rouge/text_analyzer.rb')
load load_dir.join('rouge/token.rb')

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/glob_mapping.rb')
load load_dir.join('rouge/guessers/modeline.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')

load load_dir.join('rouge/lexer.rb')
load load_dir.join('rouge/regex_lexer.rb')
load load_dir.join('rouge/template_lexer.rb')
Expand All @@ -57,6 +50,15 @@ def highlight(text, lexer, formatter, &b)
Rouge::Lexers.load_lexer(Pathname.new(f).relative_path_from(lexers_dir).to_s)
end

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/util.rb')
load load_dir.join('rouge/guessers/glob_mapping.rb')
load load_dir.join('rouge/guessers/modeline.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')
load load_dir.join('rouge/guessers/disambiguation.rb')

load load_dir.join('rouge/formatter.rb')
load load_dir.join('rouge/formatters/html.rb')
load load_dir.join('rouge/formatters/html_table.rb')
Expand Down
88 changes: 88 additions & 0 deletions lib/rouge/guessers/disambiguation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
module Rouge
  module Guessers
    # Guesser that chooses between lexers sharing a filename glob by
    # running hand-written rules against the source text.
    #
    # Rules are registered at class-body level with {.disambiguate}. Each
    # rule block is evaluated in the context of a guesser instance, so it
    # can call {#contains?} and {#matches?} and refer to lexer classes by
    # their bare names (via `include Lexers`).
    class Disambiguation < Guesser
      include Util
      include Lexers

      # @param [String] filename
      #   path of the file being guessed; only its basename is kept
      # @param [String, #read] source
      #   the source text, or an object it can be read from
      def initialize(filename, source)
        @source = source
        @filename = File.basename(filename)
      end

      # Narrow down the candidate lexers with the first applicable rule.
      #
      # The candidates are returned untouched when there is exactly one
      # of them, or when there are as many of them as `Lexer.all`.
      # Otherwise the rules are tried in registration order; the first
      # rule whose glob matches the filename and whose block yields a
      # non-nil result decides the outcome. If no rule decides, the
      # candidates are returned unchanged.
      #
      # @param [Array] lexers the current candidate lexers
      # @return [Array] the decided lexers, or `lexers` itself
      def filter(lexers)
        return lexers if lexers.size == 1 || lexers.size == Lexer.all.size

        @analyzer = TextAnalyzer.new(get_source(@source))

        self.class.disambiguators.each do |rule|
          if rule.match?(@filename)
            decision = rule.decide!(self)
            return decision if decision
          end
        end

        lexers
      end

      # Rule helper: does the analyzed source include +text+?
      def contains?(text)
        @analyzer.include?(text)
      end

      # Rule helper: does the analyzed source match +re+?
      # Always answers with a strict true/false.
      def matches?(re)
        (@analyzer =~ re) ? true : false
      end

      # Registry of rules; must be set up before any `disambiguate` call
      # further down in this class body.
      @disambiguators = []

      class << self
        # @return [Array<Disambiguator>] the rules, in registration order
        attr_reader :disambiguators

        # Register a rule for the given filename globs.
        #
        # @param [Array<String>] patterns globs the rule applies to
        # @yield block run via `instance_eval` on a guesser instance; it
        #   may return a lexer, an array of lexers, or nil for "no opinion"
        def disambiguate(*patterns, &decider)
          disambiguators.push(Disambiguator.new(patterns, &decider))
        end
      end

      # A single registered rule: a list of filename globs plus the block
      # that makes the decision.
      class Disambiguator
        include Util

        def initialize(patterns, &decider)
          @decider = decider
          @patterns = patterns
        end

        # Run the rule's block in the context of +guesser+.
        #
        # @return [Array, nil] nil when the block returned nil, the array
        #   itself when it returned one, otherwise the result wrapped in
        #   a one-element array
        def decide!(guesser)
          verdict = guesser.instance_eval(&@decider)
          return verdict if verdict.nil? || verdict.is_a?(Array)

          [verdict]
        end

        # @return [Boolean] whether any of this rule's globs matches
        #   +filename+
        def match?(filename)
          @patterns.any? { |glob| test_glob(glob, filename) }
        end
      end

      # *.pl: Perl vs. Prolog
      disambiguate '*.pl' do
        if contains?('my $')
          Perl
        elsif contains?(':-')
          Prolog
        elsif matches?(/\A\w+(\(\w+\,\s*\w+\))*\./)
          Prolog
        end
      end

      # *.h: Objective-C markers win, otherwise plain C
      disambiguate '*.h' do
        if matches?(/@(end|implementation|protocol|property)\b/) || contains?('@"')
          ObjectiveC
        else
          C
        end
      end

      # *.m: Objective-C vs. Matlab; no opinion when neither is evident
      disambiguate '*.m' do
        if matches?(/@(end|implementation|protocol|property)\b/) || contains?('@"')
          ObjectiveC
        elsif matches?(/^\s*?%/)
          Matlab
        end
      end
    end
  end
end
9 changes: 3 additions & 6 deletions lib/rouge/guessers/glob_mapping.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ module Guessers
# This class allows for custom behavior
# with glob -> lexer name mappings
class GlobMapping < Guesser
include Util

def self.by_pairs(mapping, filename)
glob_map = {}
mapping.each do |(glob, lexer_name)|
Expand All @@ -29,18 +31,13 @@ def filter(lexers)

collect_best(lexers) do |lexer|
score = (@glob_map[lexer.name] || []).map do |pattern|
if test_pattern(pattern, basename)
if test_glob(pattern, basename)
# specificity is better the fewer wildcards there are
-pattern.scan(/[*?\[]/).size
end
end.compact.min
end
end

private
def test_pattern(pattern, path)
File.fnmatch?(pattern, path, File::FNM_DOTMATCH | File::FNM_CASEFOLD)
end
end
end
end
22 changes: 6 additions & 16 deletions lib/rouge/guessers/source.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module Rouge
module Guessers
class Source < Guesser
include Util

attr_reader :source
def initialize(source)
@source = source
Expand All @@ -11,27 +13,15 @@ def filter(lexers)
# we've already filtered to 1
return lexers if lexers.size == 1

# If we're filtering against *all* lexers, we only use confident return
# values from analyze_text. But if we've filtered down already, we can trust
# the analysis more.
threshold = lexers.size < 10 ? 0 : 0.5

source_text = case @source
when String
@source
when ->(s){ s.respond_to? :read }
@source.read
else
raise 'invalid source'
end
source_text = get_source(@source)

Lexer.assert_utf8!(source_text)

source_text = TextAnalyzer.new(source_text)

collect_best(lexers, threshold: threshold) do |lexer|
next unless lexer.methods(false).include? :analyze_text
lexer.analyze_text(source_text)
collect_best(lexers) do |lexer|
next unless lexer.methods(false).include? :detect?
lexer.detect?(source_text) ? 1 : nil
end
end
end
Expand Down
20 changes: 20 additions & 0 deletions lib/rouge/guessers/util.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
module Rouge
  module Guessers
    # Small helpers mixed into the guesser classes.
    module Util
      # Check a filename glob against a path.
      #
      # Matching ignores case, and wildcards are allowed to match a
      # leading dot.
      #
      # @param [String] pattern the glob
      # @param [String] path the path to test
      # @return [Boolean]
      def test_glob(pattern, path)
        flags = File::FNM_DOTMATCH | File::FNM_CASEFOLD
        File.fnmatch?(pattern, path, flags)
      end

      # Obtain the source text from a string or a readable object.
      #
      # @param [String, #read] source
      # @return the string itself, or the result of `source.read`
      # @raise [RuntimeError] 'invalid source' when +source+ is neither a
      #   String nor responds to `read`
      def get_source(source)
        return source if String === source
        return source.read if source.respond_to? :read

        raise 'invalid source'
      end
    end
  end
end
13 changes: 6 additions & 7 deletions lib/rouge/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def guesses(info={})
guessers << Guessers::Filename.new(filename) if filename
guessers << Guessers::Modeline.new(source) if source
guessers << Guessers::Source.new(source) if source
guessers << Guessers::Disambiguation.new(filename, source) if source && filename

Guesser.guess(guessers, Lexer.all)
end
Expand All @@ -148,7 +149,7 @@ def guesses(info={})
# fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
# other hints.
#
# @see Lexer.analyze_text
# @see Lexer.detect?
# @see Lexer.guesses
def guess(info={})
lexers = guesses(info)
Expand Down Expand Up @@ -425,16 +426,14 @@ def stream_tokens(stream, &b)

# @abstract
#
# Return a number between 0 and 1 indicating the likelihood that
# the text given should be lexed with this lexer. The default
# implementation returns 0. Values under 0.5 will only be used
# to disambiguate filename or mimetype matches.
# Return true if there is an in-text indication (such as a shebang
# or DOCTYPE declaration) that this lexer should be used.
#
# @param [TextAnalyzer] text
# the text to be analyzed, with a couple of handy methods on it,
# like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
def self.analyze_text(text)
0
def self.detect?(text)
false
end
end

Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/apiblueprint.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class APIBlueprint < Markdown
filenames '*.apib'
mimetypes 'text/vnd.apiblueprint'

def self.analyze_text(text)
return 1 if text.start_with?('FORMAT: 1A\n')
end

prepend :root do
# Metadata
rule(/(\S+)(:\s*)(.*)$/) do
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/awk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class Awk < RegexLexer
filenames '*.awk'
mimetypes 'application/x-awk'

def self.analyze_text(text)
return 1 if text.shebang?('awk')
def self.detect?(text)
return true if text.shebang?('awk')
end

id = /[$a-zA-Z_][a-zA-Z0-9_]*/
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/biml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ class BIML < XML
tag 'biml'
filenames '*.biml'

def self.analyze_text(text)
return 1 if text =~ /<\s*Biml\b/
def self.detect?(text)
return true if text =~ /<\s*Biml\b/
end

prepend :root do
Expand Down
5 changes: 0 additions & 5 deletions lib/rouge/lexers/c.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def self.reserved
)
end

# high priority for filename matches
def self.analyze_text(*)
0.3
end

def self.builtins
@builtins ||= []
end
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/coffeescript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class Coffeescript < RegexLexer
title "CoffeeScript"
desc 'The Coffeescript programming language (coffeescript.org)'

def self.analyze_text(text)
return 1 if text.shebang? 'coffee'
def self.detect?(text)
return true if text.shebang? 'coffee'
end

def self.keywords
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/coq.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ class Coq < RegexLexer
tag 'coq'
mimetypes 'text/x-coq'

def self.analyze_text(text)
return 0.3 if text.include? "Require"
end

def self.gallina
@gallina ||= Set.new %w(
as fun if in let match then else return end Type Set Prop
Expand Down
8 changes: 4 additions & 4 deletions lib/rouge/lexers/diff.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ class Diff < RegexLexer
filenames '*.diff', '*.patch'
mimetypes 'text/x-diff', 'text/x-patch'

def self.analyze_text(text)
return 1 if text.start_with?('Index: ')
return 1 if text.start_with?('diff ')
return 0.9 if text.start_with?('--- ')
def self.detect?(text)
return true if text.start_with?('Index: ')
return true if text =~ %r(\Adiff[^\n]*?\ba/[^\n]*\bb/)
return true if text =~ /(---|[+][+][+]).*?\n(---|[+][+][+])/
end

state :root do
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/digdag.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Digdag < YAML

mimetypes 'application/x-digdag'

def self.analyze_text(text)
# disable YAML.analyze_text
end

# http://docs.digdag.io/operators.html
# as of digdag v0.9.10
KEYWORD_PATTERN = Regexp.union(%w(
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/erb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class ERB < TemplateLexer

filenames '*.erb', '*.erubis', '*.rhtml', '*.eruby'

def self.analyze_text(text)
return 0.4 if text =~ /<%.*%>/
end

def initialize(opts={})
@ruby_lexer = Ruby.new(opts)

Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/erlang.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Erlang < RegexLexer

mimetypes 'text/x-erlang', 'application/x-erlang'

def self.analyze_text(text)
return 0.3 if text =~ /^-module[(]\w+[)][.]/
end

keywords = %w(
after begin case catch cond end fun if
let of query receive try when
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/factor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ class Factor < RegexLexer
filenames '*.factor'
mimetypes 'text/x-factor'

def self.analyze_text(text)
return 1 if text.shebang? 'factor'
def self.detect?(text)
return true if text.shebang? 'factor'
end

def self.builtins
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/gherkin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class Gherkin < RegexLexer
filenames '*.feature'
mimetypes 'text/x-gherkin'

def self.analyze_text(text)
return 1 if text.shebang? 'cucumber'
def self.detect?(text)
return true if text.shebang? 'cucumber'
end

# self-modifying method that loads the keywords file
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/go.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Go < RegexLexer

mimetypes 'text/x-go', 'application/x-go'

def self.analyze_text(text)
return 0
end

# Characters

WHITE_SPACE = /[\s\t\r\n]+/
Expand Down