This repository has been archived by the owner on Dec 19, 2023. It is now read-only.

Remove Unicode hack from Go/Java/JavaScript/Python parsers #502

Merged
merged 2 commits on Jan 14, 2022
34 changes: 3 additions & 31 deletions h_program-lang/Parse_info.ml
@@ -637,8 +637,6 @@ let complete_token_location_large filename table x =
column = snd (table (x.charpos));
}

let unicode_hack_replacement_byte = 'Z'

(* Why is it better to first get all the tokens?
* Why not lex on-demand as yacc requires more tokens?
* There are a few reasons:
@@ -648,36 +648,10 @@ let unicode_hack_replacement_byte = 'Z'
* - we can have comments as tokens (useful for codemap/efuns) and
* skip them easily with one Common.exclude
*)
let tokenize_all_and_adjust_pos ?(unicode_hack=false)
file tokenizer visitor_tok is_eof =
let tokenize_all_and_adjust_pos file tokenizer visitor_tok is_eof =
Common.with_open_infile file (fun chan ->
let lexbuf =
if unicode_hack then
let string =
Common.profile_code "Unicode.input_and_replace_non_ascii" (fun () ->
(*
We replace all unicode characters by Zs as a hack to avoid
invalid locations due to assumptions that one byte = character.
This causes any non-ascii character to be represented by a
sequence of Zs, resulting in false positives such
as '"😀"' matching '"🚀"'.
See https://github.com/returntocorp/semgrep/issues/2111
TODO: get rid of this hack and fix location problems properly

This breaks Java (and perhaps other) character constants.
UTF-8 characters become something invalid like this:
char c = 'ZZZ'
There's a hack in the java lexer to support such invalid
character literals.
*)
Unicode.input_and_replace_non_ascii
~replacement_byte:unicode_hack_replacement_byte chan
) in
Lexing.from_string string
else
Lexing.from_channel chan
in
let table = full_charpos_to_pos_large file in
let lexbuf = Lexing.from_channel chan in
let table = full_charpos_to_pos_large file in
let adjust_info ii =
{ ii with token =
(* could assert pinfo.filename = file ? *)
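
For illustration, a minimal OCaml sketch (not part of this change) of what the removed hack did: every non-ASCII byte was overwritten with the replacement byte 'Z', which kept byte offsets stable but erased the actual characters, so distinct non-ASCII literals became indistinguishable. The helper name and the inline byte strings below are hypothetical; the real replacement lived in Unicode.input_and_replace_non_ascii.

let replace_non_ascii ~replacement_byte s =
  String.map (fun c -> if Char.code c > 127 then replacement_byte else c) s

let () =
  (* Both 4-byte emoji collapse to "ZZZZ", which is why a pattern for
     "😀" could match "🚀" (see semgrep issue #2111). *)
  assert (replace_non_ascii ~replacement_byte:'Z' "\xF0\x9F\x98\x80"
          = replace_non_ascii ~replacement_byte:'Z' "\xF0\x9F\x9A\x80")
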
9 changes: 0 additions & 9 deletions h_program-lang/Parse_info.mli
@@ -235,17 +235,8 @@ val combine_infos: t -> t list -> t
* line, otherwise the line/col of the result might be wrong *)
val split_info_at_pos: int -> t -> t * t

(*
Any non-ascii byte is replaced by this ascii character as a hack to
work around our lack of support for Unicode-aware code locations.
This character is 'Z'.
Any sequence of one or more Zs may be the result of such replacement.
*)
val unicode_hack_replacement_byte : char

(* to be used by the lexer *)
val tokenize_all_and_adjust_pos:
?unicode_hack:bool ->
Common.filename ->
(Lexing.lexbuf -> 'tok) (* tokenizer *) ->
((t -> t) -> 'tok -> 'tok) (* token visitor *) ->
2 changes: 1 addition & 1 deletion lang_cpp/parsing/parse_cpp.ml
@@ -109,7 +109,7 @@ let is_same_line_or_close line tok =

(* called by parse below *)
let tokens file =
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:false
Parse_info.tokenize_all_and_adjust_pos
file Lexer.token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

76 changes: 71 additions & 5 deletions lang_go/parsing/lexer_go.mll
@@ -40,20 +40,86 @@ let error = Parse_info.lexical_error

}

(*****************************************************************************)
(* UTF-8 boilerplate *)
(*****************************************************************************)
(*
Generic UTF-8 boilerplate.

See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4
let utf8_nonascii = utf8_2 | utf8_3 | utf8_4

(*****************************************************************************)
(* Regexp aliases *)
(*****************************************************************************)

let newline = ('\n' | "\r\n")
let whitespace = [' ' '\t']

(* todo: *)
(* TODO: unicode digits *)
let unicode_digit = ['0'-'9']

(* TODO: unicode letters *)
let unicode_letter = ['a'-'z' 'A'-'Z']
let unicode_char = [^ '\n' '\r']
let unicode_char_no_quote = [^ '\n' '\r' '\'' '\\']
let unicode_char_no_double_quote = [^ '\n' '\r' '"' '\\']
let unicode_char_no_backquote = [^ '\n' '\r' '`' ]

let unicode_char =
ascii # ['\n' '\r']
| utf8_nonascii

let unicode_char_no_quote =
ascii # ['\n' '\r' '\'' '\\']
| utf8_nonascii

let unicode_char_no_double_quote =
ascii # ['\n' '\r' '"' '\\']
| utf8_nonascii

let unicode_char_no_backquote =
ascii # ['\n' '\r' '`' ]
| utf8_nonascii

let letter = unicode_letter | '_'

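
As a rough OCaml sketch (not part of this diff) of the property the ocamllex byte ranges above rely on: the head byte of a UTF-8 sequence alone determines how many 10xxxxxx tail bytes follow, so code-point boundaries can be found without decoding anything.

let utf8_length_of_head_byte b =
  match Char.code b with
  | c when c < 0x80 -> 1                 (* 0xxxxxxx: ASCII, utf8_1 *)
  | c when c >= 0xC0 && c <= 0xDF -> 2   (* 110xxxxx: utf8_head_byte2 *)
  | c when c >= 0xE0 && c <= 0xEF -> 3   (* 1110xxxx: utf8_head_byte3 *)
  | c when c >= 0xF0 && c <= 0xF7 -> 4   (* 11110xxx: utf8_head_byte4 *)
  | _ -> invalid_arg "not a UTF-8 head byte"

let () =
  (* "é" is the two bytes 0xC3 0xA9: a utf8_2 sequence. *)
  assert (utf8_length_of_head_byte "\xC3\xA9".[0] = 2)
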
2 changes: 1 addition & 1 deletion lang_go/parsing/parse_go.ml
@@ -43,7 +43,7 @@ let tokens2 file =
let token lexbuf =
Lexer.token lexbuf
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof

let tokens a =
6 changes: 1 addition & 5 deletions lang_go/parsing/unit_parsing_go.ml
@@ -11,11 +11,7 @@ let tests =
let dir = Config_pfff.tests_path "go/parsing" in
let files = Common2.glob (spf "%s/*.go" dir)in
files |> List.iter (fun file ->
try
let _ = Parse_go.parse_program file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_go.parse_program file |> ignore)
)
);
]
53 changes: 50 additions & 3 deletions lang_java/parsing/lexer_java.mll
@@ -216,12 +216,59 @@ let EscapeSequence =
let EscapeSequence_semgrep =
'\\' _

(************************ UTF-8 boilerplate ************************)
(*
Generic UTF-8 boilerplate.

(* ugly: see unicode_hack in Parse_info.ml *)
let UnicodeX = "Z"+
See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4

(************************ end of UTF-8 boilerplate ************************)

let SingleCharacter = [^ '\'' '\\' '\n' '\r']
let CharacterLiteral = '\'' (SingleCharacter | EscapeSequence | UnicodeX ) '\''
let CharacterLiteral = '\'' (SingleCharacter | EscapeSequence | utf8 ) '\''


let StringCharacter = [^ '"' '\\' '\n' '\r']
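
A hypothetical OCaml check of what the new CharacterLiteral rule accepts: the raw bytes of a literal such as 'é' (0x27 0xC3 0xA9 0x27) now lex as quote, utf8 sequence, quote, whereas previously the lexer only saw such literals after the input had been rewritten to runs of Zs.

let is_two_byte_char_literal s =
  String.length s = 4
  && s.[0] = '\'' && s.[3] = '\''
  && Char.code s.[1] land 0xE0 = 0xC0   (* 110xxxxx head byte *)
  && Char.code s.[2] land 0xC0 = 0x80   (* 10xxxxxx tail byte *)

let () = assert (is_two_byte_char_literal "'\xC3\xA9'")
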
2 changes: 1 addition & 1 deletion lang_java/parsing/parse_java.ml
@@ -38,7 +38,7 @@ let error_msg_tok tok =

let tokens2 file =
let token = Lexer_java.token in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
let tokens a =
Common.profile_code "Java parsing.tokens" (fun () -> tokens2 a)
6 changes: 1 addition & 5 deletions lang_java/parsing/unit_parsing_java.ml
@@ -15,11 +15,7 @@ let tests =
let dir = Config_pfff.tests_path "java/parsing" in
let files = Common2.glob (spf "%s/*.java" dir) in
files |> List.iter (fun file ->
try
let _ = Parse_java.parse file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_java.parse file |> ignore)
)
);

2 changes: 0 additions & 2 deletions lang_js/parsing/lexer_js.mll
@@ -273,8 +273,6 @@ rule initial = parse
* The right solution would be to switch to a unicode-aware lexer generator,
* like ulex or sedlex.
* todo: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
* update: with Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
* the hack below is redundant.
*)
| "\xc2\xa0" (* non-breaking-space \u{00A0} *)
| "\xef\xbb\xbf" (* byte-order-mark \u{FEFF} *)
2 changes: 1 addition & 1 deletion lang_js/parsing/parse_js.ml
@@ -182,7 +182,7 @@ let tokens file =
then Lexer_js._last_non_whitespace_like_token := Some tok;
tok
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

71 changes: 69 additions & 2 deletions lang_python/parsing/Lexer_python.mll
@@ -205,9 +205,76 @@ let escapeseq = '\\' _
(* for raw fstring *)
let escapeseq2 = '\\' [^ '{']

let identifier = ['a'-'z' 'A'-'Z' '_'] ['a'-'z' 'A'-'Z' '0'-'9' '_']*
(************************ UTF-8 boilerplate ************************)
(*
Generic UTF-8 boilerplate.

let nonidchar = [^ 'a'-'z' 'A'-'Z' '0'-'9' '_']
See https://erratique.ch/software/uucp/doc/unicode.html
for a good explanation of how this works.

We don't convert UTF-8-encoded data to code points. We only do the minimum
to ensure the correct identification of the boundaries between scalar
code points.
*)

(* 0xxxxxxx *)
let ascii = ['\000'-'\127']

(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']

(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']

(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']

(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* 7 bits of payload *)
let utf8_1 = ascii

(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte

(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte

(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point. This set includes more than it should
for simplicity.

- This includes encodings of the so-called surrogate code points
used by UTF-16 and not permitted by UTF-8.
- This includes the range 0x110000 to 0x1FFFFF which are beyond the
range of valid Unicode code points.
*)
let utf8 = utf8_1 | utf8_2 | utf8_3 | utf8_4

let utf8_nonascii = utf8_2 | utf8_3 | utf8_4

(************************ end of UTF-8 boilerplate ************************)

(*
https://www.python.org/dev/peps/pep-3131/ says:
The identifier syntax is <XID_Start> <XID_Continue>*.

TODO: use the correct character set for nonascii identifiers
For now, we don't have an implementation of the Unicode character classes
XID_Start and XID_Continue. We incorrectly assume that any nonascii
code point is valid as part of an identifier. This should be fine
as long as non-ascii characters aren't used for anything else than
identifiers and quoted strings.
*)
let identifier =
(* keeping the all-ascii case separate hoping it's faster this way *)
['a'-'z' 'A'-'Z' '_']['a'-'z' 'A'-'Z' '0'-'9' '_']*

| (['a'-'z' 'A'-'Z' '_'] | utf8_nonascii)
(['a'-'z' 'A'-'Z' '0'-'9' '_'] | utf8_nonascii)*

(*****************************************************************************)
(* Rule initial *)
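
For what the new identifier rule implies, a hypothetical byte-level OCaml approximation (not part of the diff): any non-ASCII byte is accepted as part of an identifier instead of checking the XID_Start/XID_Continue classes, so "café" lexes as a single identifier while the usual ASCII rules still apply otherwise. The helper names are made up for illustration; String.for_all assumes OCaml >= 4.13.

let is_ascii_ident_start c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_'

let is_ascii_ident_continue c =
  is_ascii_ident_start c || (c >= '0' && c <= '9')

let is_identifier s =
  String.length s > 0
  && (is_ascii_ident_start s.[0] || Char.code s.[0] > 127)
  && String.for_all
       (fun c -> is_ascii_ident_continue c || Char.code c > 127) s

let () =
  assert (is_identifier "caf\xC3\xA9");   (* café *)
  assert (not (is_identifier "1bad"))
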
2 changes: 1 addition & 1 deletion lang_python/parsing/Parse_python.ml
@@ -90,7 +90,7 @@ let tokens parsing_mode file =
Parse_info.lexical_error s lexbuf;
T.EOF (Parse_info.tokinfo lexbuf)
in
Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:true
Parse_info.tokenize_all_and_adjust_pos
file token TH.visitor_info_of_tok TH.is_eof
[@@profiling]

6 changes: 1 addition & 5 deletions lang_python/parsing/Unit_parsing_python.ml
@@ -11,11 +11,7 @@ let tests =
let dir = Config_pfff.tests_path "python/parsing" in
let files = Common2.glob (spf "%s/*.py" dir)in
files |> List.iter (fun file ->
try
let _ = Parse_python.parse_program file in
()
with Parse_info.Parsing_error _ ->
Alcotest.failf "it should correctly parse %s" file
Testutil.run file (fun () -> Parse_python.parse_program file |> ignore)
)
);
]