From 2f15dfd414d8121135e3653866b02a0cfc7b4012 Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Mon, 8 Jul 2019 19:30:07 -0700 Subject: [PATCH 1/2] Add lexer for Ada 2012. This implements Ada as described in the [Consolidated Ada 2012 Language Reference Manual](http://ada-auth.org/standards/ada12_w_tc1.html). Follow the pygments example in terms of mime-type and file extensions detected, but allow for full Unicode identifiers. Ada has specific rules about the use of underscores in identifiers and number literals, and we flag any errors. Resolves #824. --- lib/rouge/demos/ada | 26 +++++++ lib/rouge/lexers/ada.rb | 146 ++++++++++++++++++++++++++++++++++++++++ spec/lexers/ada_spec.rb | 107 +++++++++++++++++++++++++++++ spec/visual/samples/ada | 129 +++++++++++++++++++++++++++++++++++ 4 files changed, 408 insertions(+) create mode 100644 lib/rouge/demos/ada create mode 100644 lib/rouge/lexers/ada.rb create mode 100644 spec/lexers/ada_spec.rb create mode 100644 spec/visual/samples/ada diff --git a/lib/rouge/demos/ada b/lib/rouge/demos/ada new file mode 100644 index 0000000000..36182fbc4c --- /dev/null +++ b/lib/rouge/demos/ada @@ -0,0 +1,26 @@ +with Ada.Directories; +with Ada.Direct_IO; +with Ada.Text_IO; + +procedure Extra_IO.Read_File (Name : String) is + + package Dirs renames Ada.Directories; + package Text_IO renames Ada.Text_IO; + + -- Get the size of the file for a new string. + Size : Natural := Natural (Dirs.Size (Name)); + subtype File_String is String (1 .. Size); + + -- Instantiate Direct_IO for our file type. + package FIO is new Ada.Direct_IO (File_String); + + File : FIO.File_Type; + Contents : File_String; + +begin + FIO.Open (File, FIO.In_File, Name); + FIO.Read (File, Contents); + FIO.Close (File); + + Text_IO.Put (Contents); +end Extra_IO.Read_File; diff --git a/lib/rouge/lexers/ada.rb b/lib/rouge/lexers/ada.rb new file mode 100644 index 0000000000..ae04cb423b --- /dev/null +++ b/lib/rouge/lexers/ada.rb @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true + +module Rouge + module Lexers + class Ada < RegexLexer + tag 'ada' + filenames '*.ada', '*.ads', '*.adb', '*.gpr' + mimetypes 'text/x-ada' + + title 'Ada' + desc 'The Ada 2012 programming language' + + # Ada identifiers are Unicode with underscores only allowed as separators. + ID = /\b[[:alpha:]](?:\p{Pc}?[[:alnum:]])*\b/ + + # Numerals can also contain underscores. + NUM = /\d(_?\d)*/ + XNUM = /\h(_?\h)*/ + EXP = /(E[-+]?#{NUM})?/i + + # Return a hash mapping lower-case identifiers to token classes. + def self.idents + @idents ||= Hash.new(Name).tap do |h| + %w( + abort abstract accept access aliased all array at begin body + case constant declare delay delta digits do else elsif end + exception exit for generic goto if in interface is limited + loop new null of others out overriding pragma private + protected raise range record renames requeue return reverse + select separate some synchronized tagged task terminate then + until use when while with + ).each {|w| h[w] = Keyword} + + %w(abs and mod not or rem xor).each {|w| h[w] = Operator::Word} + + %w( + entry function package procedure subtype type + ).each {|w| h[w] = Keyword::Declaration} + + %w( + boolean character constraint_error duration float integer + natural positive long_float long_integer long_long_float + long_long_integer program_error short_float short_integer + short_short_integer storage_error string tasking_error + wide_character wide_string wide_wide_character + wide_wide_string + ).each {|w| h[w] = Name::Builtin} + end + end + + state :whitespace do + rule %r{\s+}m, Text + rule %r{--.*$}, Comment::Single + end + + state :dquote_string do + rule %r{[^"\n]+}, Literal::String::Double + rule %r{""}, Literal::String::Escape + rule %r{"}, Literal::String::Double, :pop! + rule %r{\n}, Error, :pop! + end + + state :attr do + mixin :whitespace + rule ID, Name::Attribute, :pop! + rule %r{}, Text, :pop! + end + + # Handle a dotted name immediately following a declaration keyword. + state :decl_name do + mixin :whitespace + rule %r{body\b}i, Keyword::Declaration # package body Foo.Bar is... + rule %r{(#{ID})(\.)} do + groups Name::Namespace, Punctuation + end + # function "<=" (Left, Right: Type) is ... + rule %r{#{ID}|"(and|or|xor|/?=|<=?|>=?|\+|–|&\|/|mod|rem|\*?\*|abs|not)"}, + Name::Function, :pop! + rule %r{}, Text, :pop! + end + + # Handle a sequence of library unit names: with Ada.Foo, Ada.Bar; + state :libunit_name do + mixin :whitespace + rule ID do + token Name::Namespace + goto :libunit_sep + end + rule %r{}, Text, :pop! + end + + state :libunit_sep do + mixin :whitespace + rule %r{[.,]} do + token Punctuation + goto :libunit_name + end + rule %r{}, Text, :pop! + end + + state :root do + mixin :whitespace + + # String literals. + rule %r{'.'}, Literal::String::Char + rule %r{"[^"\n]*}, Literal::String::Double, :dquote_string + + # Real literals. + rule %r{#{NUM}\.#{NUM}#{EXP}}, Literal::Number::Float + rule %r{#{NUM}##{XNUM}\.#{XNUM}##{EXP}}, Literal::Number::Float + + # Integer literals. + rule %r{2#[01](_?[01])*##{EXP}}, Literal::Number::Bin + rule %r{8#[0-7](_?[0-7])*##{EXP}}, Literal::Number::Oct + rule %r{16##{XNUM}*##{EXP}}, Literal::Number::Hex + rule %r{#{NUM}##{XNUM}##{EXP}}, Literal::Number::Integer + rule %r{#{NUM}#\w+#}, Error + rule %r{#{NUM}#{EXP}}, Literal::Number::Integer + + # Special constructs. + rule %r{'}, Punctuation, :attr + rule %r{<<#{ID}>>}, Name::Label + + # Context clauses are tricky because the 'with' keyword is used + # for many purposes. Detect at beginning of the line only. + rule %r{^(limited\s+)?(private\s+)?with\b}i, Keyword::Namespace, :libunit_name + + # Operators and punctuation characters. + rule %r{[+*/&<=>|]|-|=>|\.\.|\*\*|[:>>|<>}, Operator + rule %r{[.,:;()]}, Punctuation + + rule ID do |m| + t = self.class.idents[m[0].downcase] + token t + if t == Keyword::Declaration + push :decl_name + end + end + + # Flag word-like things that don't match the ID pattern. + rule %r{\b(\p{Pc}|[[alpha]])\p{Word}*}, Error + end + end + end +end diff --git a/spec/lexers/ada_spec.rb b/spec/lexers/ada_spec.rb new file mode 100644 index 0000000000..7b3899d1ee --- /dev/null +++ b/spec/lexers/ada_spec.rb @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true + +describe Rouge::Lexers::Ada do + let(:subject) { Rouge::Lexers::Ada.new } + + describe 'guessing' do + include Support::Guessing + + it 'guesses by filename' do + assert_guess :filename => 'foo.ada' + assert_guess :filename => 'foo.adb' + assert_guess :filename => 'foo.ads' + end + + it 'guesses by mimetype' do + assert_guess :mimetype => 'text/x-ada' + end + end + + describe 'lexing' do + include Support::Lexing + + it 'classifies identifiers' do + assert_tokens_equal 'constant Boolean := A and B', + ['Keyword', 'constant'], + ['Text', ' '], + ['Name.Builtin', 'Boolean'], + ['Text', ' '], + ['Operator', ':='], + ['Text', ' '], + ['Name', 'A'], + ['Text', ' '], + ['Operator.Word', 'and'], + ['Text', ' '], + ['Name', 'B'] + end + + it 'accepts Unicode identifiers' do + assert_tokens_equal '東京', ['Name', '東京'] + assert_tokens_equal '0東京', ['Error', '0'], ['Name', '東京'] + assert_tokens_equal '東京0', ['Name', '東京0'] + end + + it 'rejects identifiers with double or trailing underscores' do + assert_tokens_equal '_ab', ['Error', '_ab'] + assert_tokens_equal 'a__b', ['Error', 'a__b'] + assert_tokens_equal 'a_b', ['Name', 'a_b'] + assert_tokens_equal 'ab_', ['Error', 'ab_'] + end + + it 'understands other connecting punctuation' do + assert_tokens_equal 'a﹏b', ['Name', 'a﹏b'] + assert_tokens_equal '﹏ab', ['Error', '﹏ab'] + assert_tokens_equal 'a﹏﹏b', ['Error', 'a﹏﹏b'] + assert_tokens_equal 'ab﹏', ['Error', 'ab﹏'] + end + + it 'classifies based number literals' do + assert_tokens_equal '2#0001_1110#', ['Literal.Number.Bin', '2#0001_1110#'] + assert_tokens_equal '2#0001__1110#', ['Error', '2#0001__1110#'] + assert_tokens_equal '8#1234_0000#', ['Literal.Number.Oct', '8#1234_0000#'] + assert_tokens_equal '16#abc_BBB_12#', ['Literal.Number.Hex', '16#abc_BBB_12#'] + assert_tokens_equal '4#1230000#e+5', ['Literal.Number.Integer', '4#1230000#e+5'] + assert_tokens_equal '2#0001_1110#e3', ['Literal.Number.Bin', '2#0001_1110#e3'] + + assert_tokens_equal '16#abc_BBB.12#', ['Literal.Number.Float', '16#abc_BBB.12#'] + end + + it 'recognizes exponents in integers and reals' do + assert_tokens_equal '1e6', ['Literal.Number.Integer', '1e6'] + assert_tokens_equal '123_456', ['Literal.Number.Integer', '123_456'] + assert_tokens_equal '3.14159_26', ['Literal.Number.Float', '3.14159_26'] + assert_tokens_equal '3.141_592e-20', ['Literal.Number.Float', '3.141_592e-20'] + end + + it 'highlights escape sequences inside doubly quoted strings' do + assert_tokens_equal '"Archimedes said ""Εύρηκα"""', + ['Literal.String.Double', '"Archimedes said '], + ['Literal.String.Escape', '""'], + ['Literal.String.Double', 'Εύρηκα'], + ['Literal.String.Escape', '""'], + ['Literal.String.Double', '"'] + end + + it 'marks function names in declarations' do + assert_tokens_equal 'Entry Foo IS', + ['Keyword.Declaration', 'Entry'], + ['Text', ' '], + ['Name.Function', 'Foo'], + ['Text', ' '], + ['Keyword', 'IS'] + + assert_tokens_equal 'package body Ada.Foo IS', + ['Keyword.Declaration', 'package'], + ['Text', ' '], + ['Keyword.Declaration', 'body'], + ['Text', ' '], + ['Name.Namespace', 'Ada'], + ['Punctuation', '.'], + ['Name.Function', 'Foo'], + ['Text', ' '], + ['Keyword', 'IS'] + end + end +end + diff --git a/spec/visual/samples/ada b/spec/visual/samples/ada new file mode 100644 index 0000000000..17587fb99d --- /dev/null +++ b/spec/visual/samples/ada @@ -0,0 +1,129 @@ +generic + + type Element_Type is private; + type Array_Type is array (Integer range <>) of Element_Type; + with function ">" (Left, Right : Element_Type) return Boolean is <>; + +procedure Algorithms.Best_Sort (A : in out Array_Type); + + +package Scanner is + + type Token_Type is + (Whitespace, New_Line, Ident, Equals, Left_Par, Right_Par, Pipe, Question, + Asterisk, Plus); + + -- Scanning for a token can fail, or it can return a token. + type Result (Ok : Boolean) is record + Last : Positive; + case Ok is + when True => + Token : Token_Type; + when False => + null; + end case; + end record; + + -- Identify the token at the beginning of Text. + function Scan_Token (Text : String) return Result with + Pre => Text'Length > 0; + +end Scanner; + + +package 動物園 is + type 動物 is (犬, 猫); + function いう (だれ : 動物) return Wide_Wide_String is + (case だれ is when 犬 => """ワン"", ""ワン""", when 猫 => """にゃん"""); +end 動物園; + + +procedure Algorithms.Best_Sort (A : in out Array_Type) is +begin + + if A'Length <= 1 then + return; + end if; + + <> + for I in A'First .. A'Last - 1 loop + if A (I) > A (I + 1) then + Exchange (A (I), A (I + 1)); + goto Try_Again; + end if; + end loop; + +end Algorithms.Best_Sort; + + +with Ada.Characters.Latin_1; +package body Scanner is + + package Lat1 renames Ada.Characters.Latin_1; + + function Scan_Token (Text : String) return Result is + + Pos : Positive := Text'First; + + -- Shortcut functions for returning a token or an error at the + -- current position. + function Ok (Token : Token_Type) return Result is (True, Pos, Token); + function Error return Result is (False, Pos); + + begin + case Text (Pos) is + when Lat1.LF => + return Ok (New_Line); + when '=' => + return Ok (Equals); + when '(' => + return Ok (Left_Par); + when ')' => + return Ok (Right_Par); + when '|' => + return Ok (Pipe); + when '?' => + return Ok (Question); + when '*' => + return Ok (Asterisk); + when '+' => + return Ok (Plus); + + when ' ' | Lat1.HT | Lat1.CR => + while Pos < Text'Last and Text (Pos + 1) in ' ' | Lat1.HT | Lat1.CR + loop + Pos := Pos + 1; + end loop; + return Ok (Whitespace); + + when 'A' .. 'Z' | 'a' .. 'z' => + while Pos < Text'Last and + Text (Pos + 1) in 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' + loop + Pos := Pos + 1; + end loop; + return Ok (Ident); + + when others => + return Error; + end case; + end Scan_Token; +end Scanner; + + +Package MACHINE_CODE Is + Type REGISTER Is Range 0 .. 16#F#; + Type DISPLACEMENT Is Range 0 .. 16#FFF#; + + Type SI Is Record + CODE : OPCODE; + B : REGISTER; + D : DISPLACEMENT; + End Record; + + for SI Use Record + CODE at 0 Range 0 .. 7; + B at 0 Range 16 .. 19; -- Bits 8 .. 15 Unused + D at 0 Range 20 .. 31; + End Record; +End MACHINE_CODE; From 3a7d5bc7d59e6faeb969976e1f1c83754327c75f Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Wed, 10 Jul 2019 08:28:29 -0700 Subject: [PATCH 2/2] Update Ada lexer after review by @pyrmont. The rule entering state :libunit_name was incorrectly classifying whitespace as Keyword::Namespace in `limited with`. Add a spec to make sure this is marked as Text. In the :libunit_name state, detect keywords that shouldn't be there and use those to recover quickly from the incorrectly entered state. This recovery is more robust that the previous ping-pong with :libunit_sep, so merge that state into libunit_name for simplicity. Add specs to make sure we're recovering well. --- lib/rouge/lexers/ada.rb | 40 +++++++++++++++++++--------- spec/lexers/ada_spec.rb | 59 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/lib/rouge/lexers/ada.rb b/lib/rouge/lexers/ada.rb index ae04cb423b..0e47ec71f0 100644 --- a/lib/rouge/lexers/ada.rb +++ b/lib/rouge/lexers/ada.rb @@ -81,21 +81,34 @@ def self.idents end # Handle a sequence of library unit names: with Ada.Foo, Ada.Bar; + # + # There's a chance we entered this state mistakenly since 'with' + # has multiple other uses in Ada (none of which are likely to + # appear at the beginning of a line). Try to bail as soon as + # possible if we see something suspicious like keywords. + # + # See ada_spec.rb for some examples. state :libunit_name do mixin :whitespace - rule ID do - token Name::Namespace - goto :libunit_sep - end - rule %r{}, Text, :pop! - end - state :libunit_sep do - mixin :whitespace - rule %r{[.,]} do - token Punctuation - goto :libunit_name + rule ID do |m| + t = self.class.idents[m[0].downcase] + if t <= Name + # Convert all kinds of Name to namespaces in this context. + token Name::Namespace + else + # Yikes, we're not supposed to get a keyword in a library unit name! + # We probably entered this state by mistake, so try to fix it. + token t + if t == Keyword::Declaration + goto :decl_name + else + pop! + end + end end + + rule %r{[.,]}, Punctuation rule %r{}, Text, :pop! end @@ -124,7 +137,10 @@ def self.idents # Context clauses are tricky because the 'with' keyword is used # for many purposes. Detect at beginning of the line only. - rule %r{^(limited\s+)?(private\s+)?with\b}i, Keyword::Namespace, :libunit_name + rule %r{^(?:(limited)(\s+))?(?:(private)(\s+))?(with)\b}i do + groups Keyword::Namespace, Text, Keyword::Namespace, Text, Keyword::Namespace + push :libunit_name + end # Operators and punctuation characters. rule %r{[+*/&<=>|]|-|=>|\.\.|\*\*|[:>>|<>}, Operator diff --git a/spec/lexers/ada_spec.rb b/spec/lexers/ada_spec.rb index 7b3899d1ee..e0ba1e8261 100644 --- a/spec/lexers/ada_spec.rb +++ b/spec/lexers/ada_spec.rb @@ -102,6 +102,63 @@ ['Text', ' '], ['Keyword', 'IS'] end + + it 'allows both names and builtin names in context clauses' do + assert_tokens_equal 'limited with Math.Integer;', + ['Keyword.Namespace', 'limited'], + ['Text', ' '], + ['Keyword.Namespace', 'with'], + ['Text', ' '], + ['Name.Namespace', 'Math'], + ['Punctuation', '.'], + ['Name.Namespace', 'Integer'], + ['Punctuation', ';'] + end + + it 'recovers quickly after mistakenly entering :libunit_name' do + # A `with` keyword at the beginning of a line is 99.9% sure to be + # a context clause, but there are things that could be mistaken if + # they are indented strangely: + # + # generic + # with function Random return Integer is <>; + # procedure Foo; + # + # If that `with` is not indented, we should recover immediately: + assert_tokens_equal 'with function Random', + ['Keyword.Namespace', 'with'], + ['Text', ' '], + ['Keyword.Declaration', 'function'], + ['Text', ' '], + ['Name.Function', 'Random'] + + # Another case that's even less likely to be at BOL: + # + # type Painted_Point is new Point with + # record + # Paint : Color := White; + # end record; + # + # type Addition is new Binary_Operation with null record; + assert_tokens_equal 'with record Paint', + ['Keyword.Namespace', 'with'], + ['Text', ' '], + ['Keyword', 'record'], + ['Text', ' '], + ['Name', 'Paint'] + assert_tokens_equal 'with null record;', + ['Keyword.Namespace', 'with'], + ['Text', ' '], + ['Keyword', 'null'], + ['Text', ' '], + ['Keyword', 'record'], + ['Punctuation', ';'] + + # Finally: raise Runtime_Error with "NO!" + assert_tokens_equal 'with "NO!"', + ['Keyword.Namespace', 'with'], + ['Text', ' '], + ['Literal.String.Double', '"NO!"'] + end end end -