Add Clean lexer (#1305)

This commit adds a lexer for the Clean language.
rouge-ruby · Aug 22, 2019 · 6ad8346 · 6ad8346
1 parent 8533fea
commit 6ad8346
Show file tree

Hide file tree

Showing 4 changed files with 308 additions and 0 deletions.
diff --git a/lib/rouge/demos/clean b/lib/rouge/demos/clean
@@ -0,0 +1,6 @@
+delete :: !a !.(Set a) -> Set a | < a
+delete x Tip = Tip
+delete x (Bin _ y l r)
+  | x < y     = balanceR y (delete x l) r
+  | x > y     = balanceL y l (delete x r)
+  | otherwise = glue l r
diff --git a/lib/rouge/lexers/clean.rb b/lib/rouge/lexers/clean.rb
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*- #
+# frozen_string_literal: true
+
+module Rouge
+  module Lexers
+    class Clean < RegexLexer
+      title "Clean"
+      desc "The Clean programming language (clean.cs.ru.nl)"
+
+      tag 'clean'
+      filenames '*.dcl', '*.icl'
+
+      def self.keywords
+        @keywords ||= Set.new %w(
+          if otherwise
+          let in
+          with where
+          case of
+          infix infixl infixr
+          class instance
+          generic derive
+          special
+          implementation definition system module
+          from import qualified as
+          dynamic
+          code inline foreign export ccall stdcall
+        )
+      end
+
+      # These are literal patterns common to the ABC intermediate language and
+      # Clean. Clean has more extensive literal patterns (see :basic below).
+      state :common_literals do
+        rule %r/'(?:[^'\\]|\\(?:x[0-9a-fA-F]+|\d+|.))'/, Str::Char
+
+        rule %r/[+~-]?\d+\.\d+(?:E[+-]?\d+)?\b/, Num::Float
+        rule %r/[+~-]?\d+E[+-]?\d+\b/, Num::Float
+        rule %r/[+~-]?\d+/, Num::Integer
+
+        rule %r/"/, Str::Double, :string
+      end
+
+      state :basic do
+        rule %r/\s+/m, Text::Whitespace
+
+        rule %r/\/\/\*.*/, Comment::Doc
+        rule %r/\/\/.*/, Comment::Single
+        rule %r/\/\*\*/, Comment::Doc, :comment_doc
+        rule %r/\/\*/, Comment::Multiline, :comment
+
+        rule %r/[+~-]?0[0-7]+/, Num::Oct
+        rule %r/[+~-]?0x[0-9a-fA-F]+/, Num::Hex
+        mixin :common_literals
+        rule %r/(\[)(\s*)(')(?=.*?'\])/ do
+          groups Punctuation, Text::Whitespace, Str::Single, Punctuation
+          push :charlist
+        end
+      end
+
+      # nested commenting
+      state :comment_doc do
+        rule %r/\*\//, Comment::Doc, :pop!
+        rule %r/\/\/.*/, Comment::Doc # Singleline comments in multiline comments are skipped
+        rule %r/\/\*/, Comment::Doc, :comment
+        rule %r/[^*\/]+/, Comment::Doc
+        rule %r/[*\/]/, Comment::Doc
+      end
+
+      # This is the same as the above, but with Multiline instead of Doc
+      state :comment do
+        rule %r/\*\//, Comment::Multiline, :pop!
+        rule %r/\/\/.*/, Comment::Multiline # Singleline comments in multiline comments are skipped
+        rule %r/\/\*/, Comment::Multiline, :comment
+        rule %r/[^*\/]+/, Comment::Multiline
+        rule %r/[*\/]/, Comment::Multiline
+      end
+
+      state :root do
+        mixin :basic
+
+        rule %r/code(\s+inline)?\s*{/, Comment::Preproc, :abc
+
+        rule %r/_*[a-z][\w_`]*/ do |m|
+          if self.class.keywords.include?(m[0])
+            token Keyword
+          else
+            token Name
+          end
+        end
+
+        rule %r/_*[A-Z][\w_`]*/ do |m|
+          if m[0]=='True' || m[0]=='False'
+            token Keyword::Constant
+          else
+            token Keyword::Type
+          end
+        end
+
+        rule %r/[^\w_\s`]/, Punctuation
+        rule %r/_\b/, Punctuation
+      end
+
+      state :escapes do
+        rule %r/\\x[0-9a-fA-F]{1,2}/i, Str::Escape
+        rule %r/\\d\d{0,3}/i, Str::Escape
+        rule %r/\\0[0-7]{0,3}/, Str::Escape
+        rule %r/\\[0-7]{1,3}/, Str::Escape
+        rule %r/\\[nrfbtv\\"']/, Str::Escape
+      end
+
+      state :string do
+        rule %r/"/, Str::Double, :pop!
+        mixin :escapes
+        rule %r/[^\\"]+/, Str::Double
+      end
+
+      state :charlist do
+        rule %r/(')(\])/ do
+          groups Str::Single, Punctuation
+          pop!
+        end
+        mixin :escapes
+        rule %r/[^\\']/, Str::Single
+      end
+
+      state :abc_basic do
+        rule %r/\s+/, Text::Whitespace
+        rule %r/\|.*/, Comment::Single
+        mixin :common_literals
+      end
+
+      # The ABC intermediate language can be included, similar to C's inline
+      # assembly. For some information about ABC, see:
+      # https://en.wikipedia.org/wiki/Clean_(programming_language)#The_ABC-Machine
+      state :abc do
+        mixin :abc_basic
+
+        rule %r/}/, Comment::Preproc, :pop!
+        rule %r/\.\w*/, Keyword, :abc_rest_of_line
+        rule %r/[\w_]+/, Name::Builtin, :abc_rest_of_line
+      end
+
+      state :abc_rest_of_line do
+        rule %r/\n/, Text::Whitespace, :pop!
+        rule %r/}/ do
+          token Comment::Preproc
+          pop!
+          pop!
+        end
+
+        mixin :abc_basic
+
+        rule %r/\S+/, Name
+      end
+    end
+  end
+end
diff --git a/spec/lexers/clean_spec.rb b/spec/lexers/clean_spec.rb
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*- #
+# frozen_string_literal: true
+
+describe Rouge::Lexers::Clean do
+  let(:subject) { Rouge::Lexers::Clean.new }
+
+  describe 'guessing' do
+    include Support::Guessing
+
+    it 'guesses by filename' do
+      assert_guess :filename => 'foo.dcl'
+      assert_guess :filename => 'foo.icl'
+    end
+  end
+end
diff --git a/spec/visual/samples/clean b/spec/visual/samples/clean
@@ -0,0 +1,131 @@
+implementation module Data.GenCompress
+
+// Samples for testing were taken from Clean Platform ...
+// https://gitlab.science.ru.nl/clean-and-itasks/clean-platform
+// ... but cut down for brevity.
+
+// -- The following are some additions to test features not included in the sample:
+
+literals =
+	( 'a', '\x61', '\0141', '\n'
+	, 17.42E3 // lower-case e or no digits before . are invalid!
+	, 17.42
+	, 0x11, 052, 3, 0x3ff
+	, ['charlist sugar'], [   'may start with spaces']
+	,  "escapes:       \x11 \X11 \052 \d17 \D42 \n \" \' ' \\ "
+	, ['in a charlist: \x11 \X11 \052 \d17 \D42 \n \" \' " \\ ']
+	, "unicode: ° →"
+	)
+
+/* Clean comments can be nested: /* like so */
+ * Furthermore, // in multiline comments ignore closing */ on the remainder
+ * Hence this is still part of the comment, and it only closes here: */
+
+/**
+ * By convention, documentation is in comments starting with two asterisks ...
+ */
+
+//* ... or starting with an asterisk in the case of singleline documentation.
+
+// -- end additions
+
+import StdGeneric, StdEnv
+from Data.Maybe import :: Maybe(..)
+import Data._Array, Data.Func
+
+//--------------------------------------------------
+// uncompressor monad
+
+ret :: !.a !u:CompressSt -> (!Maybe .a,!u:CompressSt)
+ret a st = (Just a, st)
+(>>=) infixl 5
+(>>=) pa pb = bind pa pb
+where
+	bind pa pb st
+		#! (ma, st) = pa st
+		= case ma of
+			Nothing -> (Nothing, st)
+			Just x  -> pb x st
+
+//--------------------------------------------------
+
+:: BitVector :== {#Int}
+:: BitPos :== Int
+
+:: CompressSt = { cs_pos :: !Int, cs_bits :: !.{#Int} }
+mkCompressSt arr = { cs_pos = 0, cs_bits = arr}
+
+
+:: Compress a :== a -> *CompressSt -> *CompressSt
+:: Uncompress a :== .CompressSt -> .(.(Maybe a), .CompressSt)
+
+compressBool :: !Bool !*CompressSt -> *CompressSt
+compressBool bit {cs_pos = pos, cs_bits = bits}
+	#! s = size bits
+	#! int_pos = pos >> (IF_INT_64_OR_32 6 5)
+	#! bit_pos = pos bitand (IF_INT_64_OR_32 63 31)
+	| s == int_pos
+		= abort "reallocate"
+		#! int = bits.[int_pos]
+		#! bit_mask = 1 << bit_pos
+		#! new_int = if bit (int bitor bit_mask) (int bitand (bitnot bit_mask))
+		= {cs_pos = inc pos, cs_bits = {bits & [int_pos] = new_int}}
+
+realToBinary32 :: !Real -> (!Int,!Int);
+realToBinary32 _ = code {
+	.d 0 1 r
+	pop_b 0 | don't do anything
+	.o 0 2 ii
+};
+// Alternatively, with inline code:
+realToBinary32 _ = code inline {
+	no_op
+};
+
+uncompressArray :: (u:CompressSt -> ((Maybe v:a),w:CompressSt)) -> .(x:CompressSt -> ((Maybe y:(b v:a)),z:CompressSt)) | Array b a, [x w <= u,y <= v,x w <= z]
+uncompressArray f
+	=	uncompressInt >>= \s -> uncompress_array 0 s (unsafeCreateArray s)
+where
+	uncompress_array i s arr
+		| i == s
+			= ret arr
+			= f >>= \x -> uncompress_array (inc i) s {arr & [i] = x}
+
+compressList :: (a *CompressSt -> *CompressSt) ![a] -> *CompressSt -> *CompressSt
+compressList c xs = compressArray c (list_to_arr xs)
+where
+	list_to_arr :: [b] -> {b} | Array {} b
+	list_to_arr xs = {x \\ x <- xs}
+
+generic gCompress a :: !a -> *CompressSt -> *CompressSt
+gCompress{|Int|} x = compressInt x
+gCompress{|EITHER|} cl cr (LEFT x) = cl x o compressBool False
+gCompress{|{}|} c xs = compressArray c xs
+gCompress{|{!}|} c xs = compressArray c xs
+gCompress{|[]|} c xs = compressList c xs
+
+
+generic gCompressedSize a :: a -> Int
+gCompressedSize{|Int|} _ = IF_INT_64_OR_32 64 32
+gCompressedSize{|PAIR|} cx cy (PAIR x y) = cx x + cy y
+gCompressedSize{|[]|} c xs = foldSt (\x st -> c x + st) xs (IF_INT_64_OR_32 64 32)
+gCompressedSize{|{}|} c xs = foldSt (\x st -> c x + st) [x\\x<-:xs] (IF_INT_64_OR_32 64 32)
+gCompressedSize{|{!}|} c xs = foldSt (\x st -> c x + st) [x\\x<-:xs] (IF_INT_64_OR_32 64 32)
+
+generic gUncompress a :: (u:CompressSt -> ((Maybe a),u:CompressSt))
+gUncompress{|PAIR|} fx fy = fx >>= \x -> fy >>= \y -> ret (PAIR x y)
+gUncompress{|CONS|} f = f >>= ret o CONS
+gUncompress{|FIELD|} f = f >>= \x -> ret $ FIELD x
+gUncompress{|OBJECT|} f = f >>= \x -> ret $ OBJECT x
+
+//-------------------------------------------------------------------------------------
+
+uncompress :: (BitVector -> Maybe a) | gUncompress{|*|} a
+uncompress = fst o gUncompress{|*|} o mkCompressSt
+
+compress :: !a -> BitVector | gCompressedSize{|*|} a & gCompress{|*|} a
+compress x
+	#! compressed_size = gCompressedSize{|*|} x
+	#! arr_size = (compressed_size + (IF_INT_64_OR_32 63 31)) >> (IF_INT_64_OR_32 6 5)
+	#! bits = createArray arr_size 0
+	= (gCompress{|*|} x (mkCompressSt bits)).cs_bits