Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize deep statement matching #852

Merged
merged 5 commits into from May 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
54 changes: 54 additions & 0 deletions docs/development.md
Expand Up @@ -97,6 +97,60 @@ Set the OCAMLRUNPARAM environment variable to 'b' for backtrace. You will get be
export OCAMLRUNPARAM=b
```

## Profiling code

You can pass the -profile command-line argument to semgrep-core to get
a short profile of the code, for example:
``` bash
cd semgrep-core
./_build/default/bin/Main.exe -profile -e foo tests/python
---------------------
profiling result
---------------------
Main total : 1.975 sec 1 count
Parse_python.parse : 0.828 sec 1 count
...
```

Alternatively, you can set the environment variable SEMGREP_CORE_PROFILE to 1 to get the same information:

``` bash
cd semgrep-core
export SEMGREP_CORE_PROFILE=1
./_build/default/bin/Main.exe -e foo tests/python
---------------------
profiling result
---------------------
Main total : 1.975 sec 1 count
Parse_python.parse : 0.828 sec 1 count
...
```

This is especially useful when you don't call semgrep-core directly, but
instead use the Python wrapper, semgrep.

You can also use the SEMGREP_CORE_DEBUG environment variable to add debugging
information, for example:
```bash
export SEMGREP_CORE_DEBUG=1
export SEMGREP_CORE_PROFILE=1
pipenv run semgrep -f ../semgrep-core/tests/PERF/ajin.yaml ../semgrep-core/tests/PERF/three.js
Debug mode On
Executed as: semgrep-core -lang javascript -rules_file /tmp/tmpy5pzp3p_ -j 8 ../semgrep-core/tests/PERF/three.js
Profile mode On
disabling -j when in profiling mode
PARSING: ../semgrep-core/tests/PERF/three.js
saving rules file for debugging in: /tmp/semgrep_core_rule-97ae74.yaml
---------------------
profiling result
---------------------
Main total : 1.975 sec 1 count
Parse_js.parse : 0.828 sec 1 count
Semgrep.check : 0.791 sec 1 count
Semgrep.match_sts_sts : 0.559 sec 185064 count
...
```

## Testing

### semgrep-core
Expand Down
58 changes: 50 additions & 8 deletions semgrep-core/bin/Main.ml
Expand Up @@ -48,12 +48,20 @@ module J = Json_type
(* Flags *)
(*****************************************************************************)

(* You can set these environment variables to enable debugging/profiling
* instead of using -debug or -profile. This is useful when you don't call
* semgrep-core directly but instead use the semgrep Python wrapper.
*)
let env_debug = "SEMGREP_CORE_DEBUG"
let env_profile = "SEMGREP_CORE_PROFILE"

(*s: constant [[Main_semgrep_core.verbose]] *)
let verbose = ref false
(*e: constant [[Main_semgrep_core.verbose]] *)
(*s: constant [[Main_semgrep_core.debug]] *)
let debug = ref false
(*e: constant [[Main_semgrep_core.debug]] *)
let profile = ref false
(*s: constant [[Main_semgrep_core.error_recovery]] *)
(* try to continue processing files, even if one has a parse error with -e/f.
* note that -rules_file does its own error recovery.
Expand Down Expand Up @@ -420,6 +428,7 @@ let iter_generic_ast_of_files_and_get_matches_and_exn_to_errors f files =
| [] ->
failwith (spf "can not extract generic AST from %s" file)
in
if !debug then pr2 (spf "PARSING: %s" file);
let ast = parse_generic lang file in

(* calling the hook *)
Expand Down Expand Up @@ -540,6 +549,16 @@ let sgrep_with_rules rules_file xs =
print_matches_and_errors files matches errs
(*e: function [[Main_semgrep_core.sgrep_with_rules]] *)

(* Copy the rules file named by !rules_file into a fresh temporary
 * "semgrep_core_rule-*.yaml" file and announce its location with pr2.
 *
 * Why: when semgrep-core is driven by semgrep-python, error messages and
 * some profiling statistics mention rule ids that semgrep-python generated,
 * which makes it hard to tell what the problem is. Keeping a copy of the
 * generated rules file at least helps debugging.
 *)
let save_rules_file_in_tmp () =
  let saved_path = Filename.temp_file "semgrep_core_rule-" ".yaml" in
  (* announce the destination before copying, so the path is reported even
   * if reading or writing fails afterwards *)
  saved_path |> spf "saving rules file for debugging in: %s" |> pr2;
  !rules_file |> Common.read_file |> Common.write_file ~file:saved_path

(*****************************************************************************)
(* Semgrep -tainting_rules_file *)
(*****************************************************************************)
Expand Down Expand Up @@ -786,7 +805,15 @@ let options () =
(*s: [[Main_semgrep_core.options]] concatenated flags *)
Flag_parsing_cpp.cmdline_flags_macrofile () @
(*x: [[Main_semgrep_core.options]] concatenated flags *)
Common2.cmdline_flags_devel () @
(* inlining of: Common2.cmdline_flags_devel () @ *)
[ "-debugger", Arg.Set Common.debugger,
" option to set if launched inside ocamldebug";
"-profile", Arg.Unit (fun () ->
Common.profile := Common.ProfAll;
profile := true;
),
" output profiling information";
] @
(*x: [[Main_semgrep_core.options]] concatenated flags *)
Meta_parse_info.cmdline_flags_precision () @
(*x: [[Main_semgrep_core.options]] concatenated flags *)
Expand Down Expand Up @@ -843,8 +870,25 @@ let main () =
spf "Usage: %s [options] <pattern> <files_or_dirs> \nOptions:"
(Filename.basename Sys.argv.(0))
in

(* The environment variables named by env_debug and env_profile act as
 * implicit command-line flags: when one of them is set (to any value), the
 * corresponding -debug / -profile flag is appended to the real arguments
 * before option parsing. Using the env_* constants keeps the variable names
 * defined in a single place.
 *)
let argv =
  (Array.to_list Sys.argv) @
  (if Sys.getenv_opt env_debug <> None then ["-debug"] else []) @
  (if Sys.getenv_opt env_profile <> None then ["-profile"] else [])
in

(* does side effect on many global flags *)
let args = Common.parse_options (options()) usage_msg Sys.argv in
let args = Common.parse_options (options()) usage_msg (Array.of_list argv) in

if !debug then begin
pr2 "Debug mode On";
pr2 (spf "Executed as: %s" (Sys.argv|>Array.to_list|> String.concat " "));
end;
if !profile then begin
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rcoh maybe this was the issue. Maybe you were running the ocaml programs with profiling information but
because of -j the job was actually done in another process ...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that occurred to me after I read that multi threading in OCaml is actually multiprocessing

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, OCaml has concurrent threads (Xavier Leroy, the author of OCaml, actually added the first POSIX C thread library for Linux a long time ago, and he did it because he wanted threads in OCaml), but it does not yet have multi-core threads. There is ongoing work to support that.
Note that neither Python/PHP/Ruby/... have multi-core threads either.

pr2 "Profile mode On";
pr2 "disabling -j when in profiling mode";
ncores := 1;
end;

(* must be done after Arg.parse, because Common.profile is set by it *)
Common.profile_code "Main total" (fun () ->
Expand All @@ -868,13 +912,11 @@ let main () =
(match () with
(*s: [[Main_semgrep_core.main()]] main entry match cases *)
| _ when !rules_file <> "" ->
(try sgrep_with_rules !rules_file (x::xs)
(try
sgrep_with_rules !rules_file (x::xs);
if !profile then save_rules_file_in_tmp ();
with exn -> begin
if Sys.getenv_opt "SEMGREP_CORE_DEBUG" <> None then begin
let tmp = Filename.temp_file "semgrep_core_rule-" ".yaml" in
pr2 (spf "saving rule file leading to the error in: %s" tmp);
Common.write_file ~file:tmp (Common.read_file !rules_file);
end;
if !debug then save_rules_file_in_tmp ();
pr (format_output_exception exn);
exit 2
end
Expand Down
75 changes: 68 additions & 7 deletions semgrep-core/matching/Generic_vs_generic.ml
Expand Up @@ -1259,13 +1259,74 @@ and m_other_attribute_operator = m_other_xxx

(*s: function [[Generic_vs_generic.m_stmts_deep]] *)
and m_stmts_deep (xsa: A.stmt list) (xsb: A.stmt list) =
if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa)
then
m_list__m_stmt xsa xsb >!> (fun () ->
let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in
m_list__m_stmt xsa xsb'
)
else m_list__m_stmt xsa xsb
(* opti: this was the old code:
* if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa)
* then
* m_list__m_stmt xsa xsb >!> (fun () ->
* let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in
* m_list__m_stmt xsa xsb'
* )
* else m_list__m_stmt xsa xsb
*
* but this was really slow on huge files because with a pattern like
* 'foo(); ...; bar();' we would call flatten_substmts_of_stmts
* on each sequence in the program, even though foo(); was not
* matched first.
* Better to first match the first element, and if it matches and
* we have a '...' that was not matched on the current sequence,
* then we try with flatten_substmts_of_stmts.
*
* The code below is mostly a copy paste of m_list__m_stmt. We could
* factorize, but I prefer to control and limit the number of places
* where we call m_stmts_deep. Once we call m_list__m_stmt, we
* are in a simpler world where the list of stmts will not grow.
*)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for explaining the context and the intent

match xsa, xsb with
| [], [] ->
return ()
(*s: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *)
(* less-is-ok:
* it's ok to have statements after in the concrete code as long as we
* matched all the statements in the pattern (there is an implicit
* '...' at the end, in addition to implicit '...' at the beginning
* handled by kstmts calling the pattern for each subsequences).
* TODO: sgrep_generic though then display the whole sequence as a match
* instead of just the relevant part.
*)
| [], _::_ ->
return ()
(*e: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *)
Comment on lines +1285 to +1298
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the motivation to separate these two cases for documentation?

(vs. | [], _ ->)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just more precise. There is already a case above for [], [], so [], _ below would be more general than it needs to be.


(* dots: '...', can also match no statement *)
| [A.ExprStmt (A.Ellipsis _i)], [] ->
return ()

| (A.ExprStmt (A.Ellipsis i))::xsa, xb::xsb ->
(* let's first try without going deep *)
(
(* can match nothing *)
(m_list__m_stmt xsa (xb::xsb)) >||>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how does one get documentation on >||>?

>>= seems common enough that it's nicer than Monad.bind, but I'm struggling to grok >||> and >!>. Maybe use the Googleable version instead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

following

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's defined in Matching_generic.ml, which is 'open'ed at the beginning of the file.
Neither >>= nor >||> are predefined OCaml operators. I've defined those operators
for the purpose of the matching process.

(* can match more *)
(env_add_matched_stmt xb >>= (fun () ->
(m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb)
))
) >!> (fun () ->
if !Flag.go_deeper_stmt
then
let xsb' = SubAST_generic.flatten_substmts_of_stmts (xb::xsb) in
m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb'
else fail ()
)

(* the general case *)
| xa::aas, xb::bbs ->
m_stmt xa xb >>= (fun () ->
env_add_matched_stmt xb >>= (fun () ->
m_stmts_deep aas bbs
))
| _::_, _ ->
fail ()

(*e: function [[Generic_vs_generic.m_stmts_deep]] *)

and _m_stmts (xsa: A.stmt list) (xsb: A.stmt list) =
Expand Down
43 changes: 28 additions & 15 deletions semgrep-core/matching/Semgrep_generic.ml
Expand Up @@ -58,19 +58,27 @@ type ('a, 'b) matcher = 'a -> 'b ->
(*****************************************************************************)

(*s: function [[Semgrep_generic.match_e_e]] *)
let match_e_e pattern e =
let match_e_e2 pattern e =
let env = Matching_generic.empty_environment () in
GG.m_expr pattern e env
(*e: function [[Semgrep_generic.match_e_e]] *)
let match_e_e ruleid a b =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about naming the wrapper match_e_e_profiled (etc. for other profiled calls)?

I know I was rather confused by the ...2 naming scheme when I first met this code base.

Common.profile_code "Semgrep.match_e_e" (fun () ->
Common.profile_code ("rule:" ^ ruleid) (fun () ->
match_e_e2 a b))

(*s: function [[Semgrep_generic.match_st_st]] *)
let match_st_st pattern e =
let match_st_st2 pattern e =
let env = Matching_generic.empty_environment () in
GG.m_stmt pattern e env
(*e: function [[Semgrep_generic.match_st_st]] *)
(* Profiled entry point around match_st_st2: the call is wrapped in
 * Common.profile_code twice, once under the shared "Semgrep.match_st_st"
 * key and once under a per-rule "rule:<ruleid>" key.
 *)
let match_st_st ruleid a b =
  let run_matcher () = match_st_st2 a b in
  let run_per_rule () =
    Common.profile_code ("rule:" ^ ruleid) run_matcher
  in
  Common.profile_code "Semgrep.match_st_st" run_per_rule

(*s: function [[Semgrep_generic.match_sts_sts]] *)
let match_sts_sts pattern e =
let match_sts_sts2 pattern e =
let env = Matching_generic.empty_environment () in
(* When matching statements, we need not only to report whether
* there is match, but also the actual statements that were matched.
Expand Down Expand Up @@ -106,6 +114,10 @@ let match_sts_sts pattern e =
| _ -> raise Impossible
)
(*e: function [[Semgrep_generic.match_sts_sts]] *)
(* Profiled entry point around match_sts_sts2: the call is wrapped in
 * Common.profile_code twice, once under the shared "Semgrep.match_sts_sts"
 * key and once under a per-rule "rule:<ruleid>" key.
 *)
let match_sts_sts ruleid a b =
  let run_matcher () = match_sts_sts2 a b in
  let run_per_rule () =
    Common.profile_code ("rule:" ^ ruleid) run_matcher
  in
  Common.profile_code "Semgrep.match_sts_sts" run_per_rule

(*s: function [[Semgrep_generic.match_any_any]] *)
(* for unit testing *)
Expand All @@ -119,11 +131,11 @@ let match_any_any pattern e =
(*****************************************************************************)

(*s: function [[Semgrep_generic.match_e_e_for_equivalences]] *)
let match_e_e_for_equivalences a b =
let match_e_e_for_equivalences ruleid a b =
Common.save_excursion Flag.equivalence_mode true (fun () ->
Common.save_excursion Flag.go_deeper_expr false (fun () ->
Common.save_excursion Flag.go_deeper_stmt false (fun () ->
match_e_e a b
match_e_e ruleid a b
)))
(*e: function [[Semgrep_generic.match_e_e_for_equivalences]] *)

Expand Down Expand Up @@ -159,7 +171,7 @@ let subst_e (bindings: MV.metavars_binding) e =
(*****************************************************************************)

(*s: function [[Semgrep_generic.apply_equivalences]] *)
let apply_equivalences equivs any =
let apply_equivalences2 equivs any =
let expr_rules = ref [] in
let stmt_rules = ref [] in

Expand Down Expand Up @@ -191,7 +203,8 @@ let apply_equivalences equivs any =
| [] -> x'
| (l, r)::xs ->
(* look for a match on original x, not x' *)
let matches_with_env = match_e_e_for_equivalences l x in
let matches_with_env = match_e_e_for_equivalences "<equivalence>"
l x in
(match matches_with_env with
(* todo: should generate a Disj for each possibilities? *)
| env::_xs ->
Expand All @@ -215,7 +228,9 @@ let apply_equivalences equivs any =
} in
visitor.M.vany any
(*e: function [[Semgrep_generic.apply_equivalences]] *)

(* Profiled entry point: runs apply_equivalences2 inside
 * Common.profile_code under the "Semgrep.apply_equivalences" key.
 *)
let apply_equivalences a b =
  let run () = apply_equivalences2 a b in
  Common.profile_code "Semgrep.apply_equivalences" run

(*****************************************************************************)
(* Main entry point *)
Expand Down Expand Up @@ -257,7 +272,7 @@ let check2 ~hook rules equivs file lang ast =
* against an expression recursively
*)
!expr_rules |> List.iter (fun (pattern, rule) ->
let matches_with_env = match_e_e pattern x in
let matches_with_env = match_e_e rule.R.id pattern x in
if matches_with_env <> []
then (* Found a match *)
matches_with_env |> List.iter (fun env ->
Expand All @@ -275,7 +290,7 @@ let check2 ~hook rules equivs file lang ast =
(* mostly copy paste of expr code but with the _st functions *)
V.kstmt = (fun (k, _) x ->
!stmt_rules |> List.iter (fun (pattern, rule) ->
let matches_with_env = match_st_st pattern x in
let matches_with_env = match_st_st rule.R.id pattern x in
if matches_with_env <> []
then (* Found a match *)
matches_with_env |> List.iter (fun env ->
Expand All @@ -295,7 +310,7 @@ let check2 ~hook rules equivs file lang ast =
* the heavy stuff (e.g., handling '...' between statements) rarely.
*)
!stmts_rules |> List.iter (fun (pattern, rule) ->
let matches_with_env = match_sts_sts pattern x in
let matches_with_env = match_sts_sts rule.R.id pattern x in
if matches_with_env <> []
then (* Found a match *)
matches_with_env |> List.iter (fun (env, matched_statements) ->
Expand All @@ -322,9 +337,7 @@ let check2 ~hook rules equivs file lang ast =
(*e: function [[Semgrep_generic.check2]] *)

(*s: function [[Semgrep_generic.check]] *)
let check ~hook rules equivs file lang =
Common.profile_code "Sgrep_generic.check" (
fun () -> check2 ~hook rules equivs file lang
)
let check ~hook a b c d e =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I prefer having the labeled arguments here 🤷

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, it's just that those Common.profile_code calls are just hacks because there's no super easy way to profile code. In theory I should just run ocamlprof and get nice stats, but I like the focused profile that Common.profile_code allows. So I want to minimize the amount of modifications I have to make to the program to support this non-functional property (profiling), which is why I do that. A better way would probably be to use the recent OCaml attributes to do that, and have something like [@@ profile] let check a b c d = ... Maybe @mjambon knows a good ppx rewriter that supports that.

Common.profile_code "Semgrep.check" (fun () -> check2 ~hook a b c d e)
(*e: function [[Semgrep_generic.check]] *)
(*e: semgrep/matching/Semgrep_generic.ml *)
2 changes: 1 addition & 1 deletion semgrep-core/matching/Semgrep_generic.mli
Expand Up @@ -20,7 +20,7 @@ type ('a, 'b) matcher = 'a -> 'b ->
(* used by tainting *)

(*s: signature [[Semgrep_generic.match_e_e]] *)
val match_e_e: (AST_generic.expr, AST_generic.expr) matcher
val match_e_e: string -> (AST_generic.expr, AST_generic.expr) matcher
(*e: signature [[Semgrep_generic.match_e_e]] *)

(*s: signature [[Semgrep_generic.match_any_any]] *)
Expand Down