This repository has been archived by the owner on Dec 19, 2023. It is now read-only.

Remove Unicode hack from Go/Java/JavaScript/Python parsers #502

Merged
merged 2 commits on Jan 14, 2022
34 changes: 3 additions & 31 deletions h_program-lang/Parse_info.ml
@@ -637,8 +637,6 @@ let complete_token_location_large filename table x =
column = snd (table (x.charpos));
}

let unicode_hack_replacement_byte = 'Z'

(* Why is it better to first get all the tokens?
* Why not lex on-demand as yacc requires more tokens?
* There are a few reasons:
@@ -648,36 +648,10 @@ let unicode_hack_replacement_byte = 'Z'
* - we can have comments as tokens (useful for codemap/efuns) and
* skip them easily with one Common.exclude
*)
let tokenize_all_and_adjust_pos ?(unicode_hack=false)
file tokenizer visitor_tok is_eof =
let tokenize_all_and_adjust_pos file tokenizer visitor_tok is_eof =
Common.with_open_infile file (fun chan ->
let lexbuf =
if unicode_hack then
let string =
Common.profile_code "Unicode.input_and_replace_non_ascii" (fun () ->
(*
We replace all unicode characters by Zs as a hack to avoid
invalid locations due to assumptions that one byte = character.
This causes any non-ascii character to be represented by a
sequence of Zs, resulting in false positives such
as '"😀"' matching '"🚀"'.
See https://github.com/returntocorp/semgrep/issues/2111
TODO: get rid of this hack and fix location problems properly

This breaks Java (and perhaps other) character constants.
UTF-8 characters become something invalid like this:
char c = 'ZZZ'
There's a hack in the java lexer to support such invalid
character literals.
*)
Unicode.input_and_replace_non_ascii
~replacement_byte:unicode_hack_replacement_byte chan
) in
Lexing.from_string string
else
Lexing.from_channel chan
in
let table = full_charpos_to_pos_large file in
let lexbuf = Lexing.from_channel chan in
let table = full_charpos_to_pos_large file in
let adjust_info ii =
{ ii with token =
(* could assert pinfo.filename = file ? *)
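
For illustration, a minimal OCaml sketch (not part of this change) of what the removed hack did: every non-ASCII byte was overwritten with the replacement byte 'Z', which kept byte offsets stable but erased the actual characters, so distinct non-ASCII literals became indistinguishable. The helper name and the inline byte strings below are hypothetical; the real replacement lived in Unicode.input_and_replace_non_ascii.

let replace_non_ascii ~replacement_byte s =
  String.map (fun c -> if Char.code c > 127 then replacement_byte else c) s

let () =
  (* Both 4-byte emoji collapse to "ZZZZ", which is why a pattern for
     "😀" could match "🚀" (see semgrep issue #2111). *)
  assert (replace_non_ascii ~replacement_byte:'Z' "\xF0\x9F\x98\x80"
          = replace_non_ascii ~replacement_byte:'Z' "\xF0\x9F\x9A\x80")
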
9 changes: 0 additions & 9 deletions h_program-lang/Parse_info.mli
@@ -235,17 +235,8 @@ val combine_infos: t -> t list -> t
* line, otherwise the line/col of the result might be wrong *)
val split_info_at_pos: int -> t -> t * t

(*
Any non-ascii byte is replaced by this ascii character as a hack to
work around our lack of support for Unicode-aware code locations.
This character is 'Z'.
Any sequence of one or more Zs may be the result of such replacement.
*)
val unicode_hack_replacement_byte : char

(* to be used by the lexer *)
val tokenize_all_and_adjust_pos:
?unicode_hack:bool ->
Common.filename ->
(Lexing.lexbuf -> 'tok) (* tokenizer *) ->
((t -> t) -> 'tok -> 'tok) (* token visitor *) ->
2 changes: 1 addition & 1 deletion lang_cpp/parsing/parse_cpp.ml
@@ -109,7 +109,7 @@ let is_same_line_or_close line tok =

(* called by parse below *)
let tokens file =
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:false
Parse_info.tokenize_all_and_adjust_pos
file Lexer.token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

76 changes: 71 additions & 5 deletions lang_go/parsing/lexer_go.mll
@@ -40,20 +40,86 @@ let error = Parse_info.lexical_error

}

(*****************************************************************************)
(* UTF-8 boilerplate *)
(*****************************************************************************)
(*
Generic UTF-8 boilerplate.

See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4
let utf8_nonascii = utf8_2 | utf8_3 | utf8_4

(*****************************************************************************)
(* Regexp aliases *)
(*****************************************************************************)

let newline = ('\n' | "\r\n")
let whitespace = [' ' '\t']

(* todo: *)
(* TODO: unicode digits *)
let unicode_digit = ['0'-'9']

(* TODO: unicode letters *)
let unicode_letter = ['a'-'z' 'A'-'Z']
let unicode_char = [^ '\n' '\r']
let unicode_char_no_quote = [^ '\n' '\r' '\'' '\\']
let unicode_char_no_double_quote = [^ '\n' '\r' '"' '\\']
let unicode_char_no_backquote = [^ '\n' '\r' '`' ]

let unicode_char =
ascii # ['\n' '\r']
| utf8_nonascii

let unicode_char_no_quote =
ascii # ['\n' '\r' '\'' '\\']
| utf8_nonascii

let unicode_char_no_double_quote =
ascii # ['\n' '\r' '"' '\\']
| utf8_nonascii

let unicode_char_no_backquote =
ascii # ['\n' '\r' '`' ]
| utf8_nonascii

let letter = unicode_letter | '_'

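
As a rough OCaml sketch (not part of this diff) of the property the ocamllex byte ranges above rely on: the head byte of a UTF-8 sequence alone determines how many 10xxxxxx tail bytes follow, so code-point boundaries can be found without decoding anything.

let utf8_length_of_head_byte b =
  match Char.code b with
  | c when c < 0x80 -> 1                 (* 0xxxxxxx: ASCII, utf8_1 *)
  | c when c >= 0xC0 && c <= 0xDF -> 2   (* 110xxxxx: utf8_head_byte2 *)
  | c when c >= 0xE0 && c <= 0xEF -> 3   (* 1110xxxx: utf8_head_byte3 *)
  | c when c >= 0xF0 && c <= 0xF7 -> 4   (* 11110xxx: utf8_head_byte4 *)
  | _ -> invalid_arg "not a UTF-8 head byte"

let () =
  (* "é" is the two bytes 0xC3 0xA9: a utf8_2 sequence. *)
  assert (utf8_length_of_head_byte "\xC3\xA9".[0] = 2)
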
2 changes: 1 addition & 1 deletion lang_go/parsing/parse_go.ml
@@ -43,7 +43,7 @@ let tokens2 file =
let token lexbuf =
Lexer.token lexbuf
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof

let tokens a =
6 changes: 1 addition & 5 deletions lang_go/parsing/unit_parsing_go.ml
@@ -11,11 +11,7 @@ let tests =
let dir = Config_pfff.tests_path "go/parsing" in
let files = Common2.glob (spf "%s/*.go" dir)in
files |> List.iter (fun file ->
try
let _ = Parse_go.parse_program file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_go.parse_program file |> ignore)
)
);
]
53 changes: 50 additions & 3 deletions lang_java/parsing/lexer_java.mll
@@ -216,12 +216,59 @@ let EscapeSequence =
let EscapeSequence_semgrep =
'\\' _

(************************ UTF-8 boilerplate ************************)
(*
Generic UTF-8 boilerplate.

(* ugly: see unicode_hack in Parse_info.ml *)
let UnicodeX = "Z"+
See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4

(************************ end of UTF-8 boilerplate ************************)

let SingleCharacter = [^ '\'' '\\' '\n' '\r']
let CharacterLiteral = '\'' (SingleCharacter | EscapeSequence | UnicodeX ) '\''
let CharacterLiteral = '\'' (SingleCharacter | EscapeSequence | utf8 ) '\''


let StringCharacter = [^ '"' '\\' '\n' '\r']
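
A hypothetical OCaml check of what the new CharacterLiteral rule accepts: the raw bytes of a literal such as 'é' (0x27 0xC3 0xA9 0x27) now lex as quote, utf8 sequence, quote, whereas previously the lexer only saw such literals after the input had been rewritten to runs of Zs.

let is_two_byte_char_literal s =
  String.length s = 4
  && s.[0] = '\'' && s.[3] = '\''
  && Char.code s.[1] land 0xE0 = 0xC0   (* 110xxxxx head byte *)
  && Char.code s.[2] land 0xC0 = 0x80   (* 10xxxxxx tail byte *)

let () = assert (is_two_byte_char_literal "'\xC3\xA9'")
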
2 changes: 1 addition & 1 deletion lang_java/parsing/parse_java.ml
@@ -38,7 +38,7 @@ let error_msg_tok tok =

let tokens2 file =
let token = Lexer_java.token in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
let tokens a =
Common.profile_code "Java parsing.tokens" (fun () -> tokens2 a)
6 changes: 1 addition & 5 deletions lang_java/parsing/unit_parsing_java.ml
@@ -15,11 +15,7 @@ let tests =
let dir = Config_pfff.tests_path "java/parsing" in
let files = Common2.glob (spf "%s/*.java" dir) in
files |> List.iter (fun file ->
try
let _ = Parse_java.parse file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_java.parse file |> ignore)
)
);

2 changes: 0 additions & 2 deletions lang_js/parsing/lexer_js.mll
@@ -273,8 +273,6 @@ rule initial = parse
* The right solution would be to switch to a unicode-aware lexer generator,
* like ulex or sedlex.
* todo: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
* update: with Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
* the hack below is redundant.
*)
| "\xc2\xa0" (* non-breaking-space \u{00A0} *)
| "\xef\xbb\xbf" (* byte-order-mark \u{FEFF} *)
2 changes: 1 addition & 1 deletion lang_js/parsing/parse_js.ml
@@ -182,7 +182,7 @@ let tokens file =
then Lexer_js._last_non_whitespace_like_token := Some tok;
tok
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

71 changes: 69 additions & 2 deletions lang_python/parsing/Lexer_python.mll
@@ -205,9 +205,76 @@ let escapeseq = '\\' _
(* for raw fstring *)
let escapeseq2 = '\\' [^ '{']

let identifier = ['a'-'z' 'A'-'Z' '_'] ['a'-'z' 'A'-'Z' '0'-'9' '_']*
(************************ UTF-8 boilerplate ************************)
(*
Generic UTF-8 boilerplate.

let nonidchar = [^ 'a'-'z' 'A'-'Z' '0'-'9' '_']
See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4

let utf8_nonascii = utf8_2 | utf8_3 | utf8_4

(************************ end of UTF-8 boilerplate ************************)

(*
https://www.python.org/dev/peps/pep-3131/ says:
The identifier syntax is <XID_Start> <XID_Continue>*.

TODO: use the correct character set for nonascii identifiers
For now, we don't have an implementation of the Unicode character classes
XID_Start and XID_Continue. We incorrectly assume that any nonascii
code point is valid as part of an identifier. This should be fine
as long as non-ascii characters aren't used for anything else than
identifiers and quoted strings.
*)
let identifier =
(* keeping the all-ascii case separate hoping it's faster this way *)
['a'-'z' 'A'-'Z' '_']['a'-'z' 'A'-'Z' '0'-'9' '_']*

| (['a'-'z' 'A'-'Z' '_'] | utf8_nonascii)
(['a'-'z' 'A'-'Z' '0'-'9' '_'] | utf8_nonascii)*

(*****************************************************************************)
(* Rule initial *)
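
For what the new identifier rule implies, a hypothetical byte-level OCaml approximation (not part of the diff): any non-ASCII byte is accepted as part of an identifier instead of checking the XID_Start/XID_Continue classes, so "café" lexes as a single identifier while the usual ASCII rules still apply otherwise. The helper names are made up for illustration; String.for_all assumes OCaml >= 4.13.

let is_ascii_ident_start c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_'

let is_ascii_ident_continue c =
  is_ascii_ident_start c || (c >= '0' && c <= '9')

let is_identifier s =
  String.length s > 0
  && (is_ascii_ident_start s.[0] || Char.code s.[0] > 127)
  && String.for_all
       (fun c -> is_ascii_ident_continue c || Char.code c > 127) s

let () =
  assert (is_identifier "caf\xC3\xA9");   (* café *)
  assert (not (is_identifier "1bad"))
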
2 changes: 1 addition & 1 deletion lang_python/parsing/Parse_python.ml
@@ -90,7 +90,7 @@ let tokens parsing_mode file =
Parse_info.lexical_error s lexbuf;
T.EOF (Parse_info.tokinfo lexbuf)
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

6 changes: 1 addition & 5 deletions lang_python/parsing/Unit_parsing_python.ml
@@ -11,11 +11,7 @@ let tests =
let dir = Config_pfff.tests_path "python/parsing" in
let files = Common2.glob (spf "%s/*.py" dir)in
files |> List.iter (fun file ->
try
let _ = Parse_python.parse_program file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_python.parse_program file |> ignore)
)
);
]