Skip to content

Commit

Permalink
Test tree-sitter c++ parser (#1996)
Browse files Browse the repository at this point in the history
This will help #1952

test plan:
pad@yrax:~/work/lang-cpp/Cataclysm-DDA$ yy -lang cpp -test_parse_tree_sitter .
+ /home/pad/semgrep/_build/default/cli/Main.exe -lang cpp -test_parse_tree_sitter .
11 / 852/OVERLOAD/work/lang-cpp/Cataclysm-DDA/src/activity_actor.cpp: exn = Tree_sitter_run.Tree_sitter_error.Error(_)
...

NB total files = 852; NB total lines = 474254; perfect = 662; pbs = 190; timeout = 0; =========> 77%
nb good = 174878,  nb passed = 0 =========> 0.000000%
nb good = 174878,  nb bad = 299376 =========> 36.874333%

For comparison, the C/C++ parser in pfff gives the following results on the same files:
NB total files = 852; NB total lines = 473336; perfect = 110; pbs = 742; timeout = 0; =========> 12%
nb good = 160267,  nb passed = 187 =========> 0.116680%
nb good = 160267,  nb bad = 313069 =========> 33.859035%
pad@yrax:~/work/lang-cpp/Cataclysm-DDA$
  • Loading branch information
aryx committed Nov 10, 2020
1 parent 37f79bb commit 1777183
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 51 deletions.
2 changes: 2 additions & 0 deletions semgrep-core/cli/Main.ml
Expand Up @@ -924,6 +924,8 @@ let all_actions () = [

"-test_parse_lang", " <files or dirs>",
Common.mk_action_n_arg (Test_parsing.test_parse_lang !lang get_final_files);
"-test_parse_tree_sitter", " <files or dirs>",
Common.mk_action_n_arg (Test_parsing.test_parse_tree_sitter !lang);
"-dump_tree_sitter_cst", " <file>",
Common.mk_action_1_arg Test_parsing.dump_tree_sitter_cst;
"-dump_ast_pfff", " <file>",
Expand Down
2 changes: 1 addition & 1 deletion semgrep-core/parsing/Check_semgrep.ml
Expand Up @@ -6,7 +6,7 @@ let lang_has_no_dollar_ids = Lang.(function
| Python | Python2 | Python3
| Java
| Go
| C
| C | Cplusplus
| OCaml
| JSON
| Csharp
Expand Down
150 changes: 112 additions & 38 deletions semgrep-core/parsing/Test_parsing.ml
Expand Up @@ -18,48 +18,53 @@ module G = AST_generic

let logger = Logging.get_logger [__MODULE__]

(*
TODO: Maybe instead of this we should print all the errors, print the CST
if there's one, only then fail (or succeed).
*)
let fail_on_error = function
(* less: could infer lang from filename *)
let dump_tree_sitter_cst_lang lang file =

(* TODO: Maybe instead of this we should print all the errors, print the CST
* if there's one, only then fail (or succeed).
*)
let fail_on_error = function
| Some cst, [] -> cst
| _, err :: _ -> raise (Tree_sitter_run.Tree_sitter_error.Error err)
| None, [] -> failwith "unknown error from tree-sitter parser"
in
match lang with
| Lang.Ruby ->
Tree_sitter_ruby.Parse.file file
|> fail_on_error
|> Tree_sitter_ruby.CST.dump_tree
| Lang.Java ->
Tree_sitter_java.Parse.file file
|> fail_on_error
|> Tree_sitter_java.CST.dump_tree
| Lang.Go ->
Tree_sitter_go.Parse.file file
|> fail_on_error
|> Tree_sitter_go.CST.dump_tree
| Lang.Csharp ->
Tree_sitter_csharp.Parse.file file
|> fail_on_error
|> Tree_sitter_csharp.CST.dump_tree
| Lang.Kotlin ->
Tree_sitter_kotlin.Parse.file file
|> fail_on_error
|> Tree_sitter_kotlin.CST.dump_tree
| Lang.Javascript ->
Tree_sitter_javascript.Parse.file file
|> fail_on_error
|> Tree_sitter_javascript.CST.dump_tree
| Lang.Typescript ->
Tree_sitter_typescript.Parse.file file
|> fail_on_error
|> Tree_sitter_typescript.CST.dump_tree

(* less: could infer lang from filename *)
let dump_tree_sitter_cst_lang lang file =
match lang with
| Lang.Ruby ->
Tree_sitter_ruby.Parse.file file
|> fail_on_error
|> Tree_sitter_ruby.CST.dump_tree
| Lang.Java ->
Tree_sitter_java.Parse.file file
|> fail_on_error
|> Tree_sitter_java.CST.dump_tree
| Lang.Go ->
Tree_sitter_go.Parse.file file
|> fail_on_error
|> Tree_sitter_go.CST.dump_tree
| Lang.Csharp ->
Tree_sitter_csharp.Parse.file file
|> fail_on_error
|> Tree_sitter_csharp.CST.dump_tree
| Lang.Kotlin ->
Tree_sitter_kotlin.Parse.file file
|> fail_on_error
|> Tree_sitter_kotlin.CST.dump_tree
| Lang.Javascript ->
Tree_sitter_javascript.Parse.file file
|> fail_on_error
|> Tree_sitter_javascript.CST.dump_tree
| Lang.Typescript ->
Tree_sitter_typescript.Parse.file file
|> fail_on_error
|> Tree_sitter_typescript.CST.dump_tree

| _ -> failwith "lang not supported by ocaml-tree-sitter"
| Lang.C ->
Tree_sitter_c.Parse.file file
|> fail_on_error
|> Tree_sitter_c.CST.dump_tree

| _ -> failwith "lang not supported by ocaml-tree-sitter"

let dump_tree_sitter_cst file =
match Lang.langs_of_filename file with
Expand Down Expand Up @@ -126,6 +131,75 @@ let test_parse_lang lang get_final_files xs =
Parse_info.print_parsing_stat_list !stat_list;
()


(* Run the tree-sitter parser for [lang] on every file found under [xs]
 * and print the resulting parsing statistics.
 *
 * [lang] is the language name as passed to -lang; [xs] is a list of files
 * or directories. A file counts as correctly parsed when tree-sitter
 * returns a CST together with at most two errors. Any other outcome, or
 * any exception raised while parsing a file, is reported with pr2 and the
 * file is counted as bad.
 *
 * Raises Failure if [lang] is not a known language or has no tree-sitter
 * parser listed below. The parser is selected once, before the loop, so
 * that an unsupported language aborts the run instead of being caught by
 * the per-file handler and reported as a parse failure of every file.
 *)
let test_parse_tree_sitter lang xs =
  let lang =
    match Lang.lang_of_string_opt lang with
    | Some l -> l
    | None -> failwith "no language or unsupported language; use correct -lang"
  in
  (* Tolerate up to two tree-sitter errors as long as a CST was produced. *)
  let fail_on_error = function
    | Some cst, ([] | [_] | [_; _]) -> cst
    | _, err :: _ -> raise (Tree_sitter_run.Tree_sitter_error.Error err)
    | None, [] -> failwith "unknown error from tree-sitter parser"
  in
  (* less: factorize with dump_tree_sitter_cst_lang *)
  let parse_file =
    match lang with
    | Lang.Ruby ->
      (fun file ->
         Tree_sitter_ruby.Parse.file file |> fail_on_error |> ignore)
    | Lang.Java ->
      (fun file ->
         Tree_sitter_java.Parse.file file |> fail_on_error |> ignore)
    | Lang.Go ->
      (fun file ->
         Tree_sitter_go.Parse.file file |> fail_on_error |> ignore)
    | Lang.Csharp ->
      (fun file ->
         Tree_sitter_csharp.Parse.file file |> fail_on_error |> ignore)
    | Lang.Kotlin ->
      (fun file ->
         Tree_sitter_kotlin.Parse.file file |> fail_on_error |> ignore)
    | Lang.Javascript ->
      (fun file ->
         Tree_sitter_javascript.Parse.file file |> fail_on_error |> ignore)
    | Lang.Typescript ->
      (fun file ->
         Tree_sitter_typescript.Parse.file file |> fail_on_error |> ignore)
    | Lang.C ->
      (fun file ->
         Tree_sitter_c.Parse.file file |> fail_on_error |> ignore)
    | Lang.Cplusplus ->
      (fun file ->
         Tree_sitter_cpp.Parse.file file |> fail_on_error |> ignore)
    | _ ->
      failwith (spf "lang %s not supported with tree-sitter"
                  (Lang.string_of_lang lang))
  in
  let xs = List.map Common.fullpath xs in
  let fullxs =
    Lang.files_of_dirs_or_files lang xs
    |> Skip_code.filter_files_if_skip_list ~root:xs
  in
  let stat_list = ref [] in
  fullxs |> Console.progress (fun k -> List.iter (fun file ->
    k();
    logger#info "processing %s" file;
    let stat =
      (try
         parse_file file;
         PI.correct_stat file
       with exn ->
         (* Keep going: one failing file must not stop the whole run. *)
         pr2 (spf "%s: exn = %s" file (Common.exn_to_s exn));
         PI.bad_stat file
      )
    in
    Common.push stat stat_list
  ));
  flush stdout; flush stderr;

  Parse_info.print_parsing_stat_list !stat_list;
  ()

let diff_pfff_tree_sitter xs =
pr2 "NOTE: consider using -full_token_info to get also diff on tokens";
xs |> List.iter (fun file ->
Expand Down
1 change: 1 addition & 0 deletions semgrep-core/parsing/Test_parsing.mli
Expand Up @@ -2,6 +2,7 @@
val test_parse_lang: string ->
(Common.filename list -> Common.filename list) -> Common.filename list ->
unit
(** [test_parse_tree_sitter lang files_or_dirs] runs the tree-sitter parser
    for [lang] on the files of that language found under [files_or_dirs]
    and prints parsing statistics. [lang] is the language name as a string.
    @raise Failure if [lang] is not a recognized language. *)
val test_parse_tree_sitter: string -> Common.filename list -> unit

val dump_tree_sitter_cst: Common.filename -> unit
val dump_ast_pfff: Common.filename -> unit
Expand Down
2 changes: 2 additions & 0 deletions semgrep-core/parsing/dune
Expand Up @@ -12,6 +12,8 @@
tree-sitter-lang.javascript
tree-sitter-lang.typescript
tree-sitter-lang.tsx
tree-sitter-lang.c
tree-sitter-lang.cpp

commons commons_core
pfff-config
Expand Down
2 changes: 1 addition & 1 deletion semgrep-core/pfff
Submodule pfff updated from f0b5b5 to d9b500
23 changes: 12 additions & 11 deletions semgrep-core/synthesizing/Pretty_print_generic.ml
Expand Up @@ -68,15 +68,16 @@ let print_bool env = function
(match env.lang with
| Lang.Python | Lang.Python2 | Lang.Python3
-> "True"
| Lang.Java | Lang.Go | Lang.C | Lang.JSON | Lang.Javascript
| Lang.Java | Lang.Go | Lang.C | Lang.Cplusplus
| Lang.JSON | Lang.Javascript
| Lang.OCaml | Lang.Ruby | Lang.Typescript
| Lang.Csharp | Lang.PHP | Lang.Kotlin
-> "true")
| false ->
(match env.lang with
| Lang.Python | Lang.Python2 | Lang.Python3
-> "False"
| Lang.Java | Lang.Go | Lang.C | Lang.JSON | Lang.Javascript
| Lang.Java | Lang.Go | Lang.C | Lang.Cplusplus | Lang.JSON | Lang.Javascript
| Lang.OCaml | Lang.Ruby | Lang.Typescript
| Lang.Csharp | Lang.PHP | Lang.Kotlin
-> "false")
Expand Down Expand Up @@ -167,7 +168,7 @@ and if_stmt env level (tok, e, s, sopt) =
let (format_cond, elseif_str, format_block) =
(match env.lang with
| Lang.Python | Lang.Python2 | Lang.Python3 -> (no_paren_cond, "elif", colon_body)
| Lang.Java | Lang.Go | Lang.C | Lang.Csharp
| Lang.Java | Lang.Go | Lang.C | Lang.Cplusplus | Lang.Csharp
| Lang.JSON | Lang.Javascript | Lang.Typescript
| Lang.Kotlin
-> (paren_cond, "else if", bracket_body)
Expand Down Expand Up @@ -197,7 +198,7 @@ and while_stmt env level (tok, e, s) =
let while_format =
(match env.lang with
| Lang.Python | Lang.Python2 | Lang.Python3 -> python_while
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
| Lang.JSON | Lang.Javascript | Lang.Typescript -> c_while
| Lang.Go -> go_while
| Lang.Ruby -> ruby_while
Expand All @@ -211,7 +212,7 @@ and do_while stmt env level (s, e) =
let c_do_while = F.sprintf "do %s\nwhile(%s)" in
let do_while_format =
(match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
| Lang.Javascript | Lang.Typescript -> c_do_while
| Lang.Python | Lang.Python2 | Lang.Python3
| Lang.Go | Lang.JSON | Lang.OCaml -> failwith "impossible; no do while"
Expand All @@ -224,7 +225,7 @@ and do_while stmt env level (s, e) =
and for_stmt env level (for_tok, hdr, s) =
let for_format =
(match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
| Lang.Javascript | Lang.Typescript -> F.sprintf "%s (%s) %s"
| Lang.Go -> F.sprintf "%s %s %s"
| Lang.Python | Lang.Python2 | Lang.Python3 -> F.sprintf "%s %s:\n%s"
Expand Down Expand Up @@ -258,7 +259,7 @@ and def_stmt env (entity, def_kind) =
let var_def (ent, def) =
let (no_val, with_val) =
(match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
-> (fun typ id _e -> F.sprintf "%s %s;" typ id),
(fun typ id e -> F.sprintf "%s %s = %s;" typ id e)
| Lang.Javascript | Lang.Typescript
Expand Down Expand Up @@ -294,7 +295,7 @@ and return env (tok, eopt) =
| Some e -> expr env e
in
match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
-> F.sprintf "%s %s;" (token "return" tok) to_return
| Lang.Python | Lang.Python2 | Lang.Python3
| Lang.Go | Lang.Ruby | Lang.OCaml
Expand All @@ -311,7 +312,7 @@ and break env (tok, lbl) =
| LDynamic e -> F.sprintf " %s" (expr env e)
in
match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
-> F.sprintf "%s%s;" (token "break" tok) lbl_str
| Lang.Python | Lang.Python2 | Lang.Python3
| Lang.Go | Lang.Ruby | Lang.OCaml
Expand All @@ -328,7 +329,7 @@ and continue env (tok, lbl) =
| LDynamic e -> F.sprintf " %s" (expr env e)
in
match env.lang with
| Lang.Java | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
-> F.sprintf "%s%s;" (token "continue" tok) lbl_str
| Lang.Python | Lang.Python2 | Lang.Python3
| Lang.Go | Lang.Ruby | Lang.OCaml
Expand Down Expand Up @@ -401,7 +402,7 @@ and literal env = function
(match env.lang with
| Lang.Python | Lang.Python2 | Lang.Python3 ->
"'" ^ s ^ "'"
| Lang.Java | Lang.Go | Lang.C | Lang.Csharp | Lang.Kotlin
| Lang.Java | Lang.Go | Lang.C | Lang.Cplusplus | Lang.Csharp | Lang.Kotlin
| Lang.JSON | Lang.Javascript
| Lang.OCaml | Lang.Ruby | Lang.Typescript ->
"\"" ^ s ^ "\""
Expand Down

0 comments on commit 1777183

Please sign in to comment.