New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize deep statement matching #852
Changes from all commits
b356625
a808f87
9fa5589
3d28f8f
868cc94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1259,13 +1259,74 @@ and m_other_attribute_operator = m_other_xxx | |
|
||
(*s: function [[Generic_vs_generic.m_stmts_deep]] *) | ||
and m_stmts_deep (xsa: A.stmt list) (xsb: A.stmt list) = | ||
if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa) | ||
then | ||
m_list__m_stmt xsa xsb >!> (fun () -> | ||
let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in | ||
m_list__m_stmt xsa xsb' | ||
) | ||
else m_list__m_stmt xsa xsb | ||
(* opti: this was the old code: | ||
* if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa) | ||
* then | ||
* m_list__m_stmt xsa xsb >!> (fun () -> | ||
* let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in | ||
* m_list__m_stmt xsa xsb' | ||
* ) | ||
* else m_list__m_stmt xsa xsb | ||
* | ||
* but this was really slow on huge files because with a pattern like | ||
* 'foo(); ...; bar();' we would call flatten_substmts_of_stmts | ||
* on each sequences in the program, even though foo(); was not | ||
* matched first. | ||
* Better to first match the first element, and if it matches and | ||
* we have a '...' that was not matched on the current sequence, | ||
* then we try with flatten_substmts_of_stmts. | ||
* | ||
* The code below is mostly a copy paste of m_list__m_stmt. We could | ||
* factorize, but I prefer to control and limit the number of places | ||
* where we call m_stmts_deep. Once we call m_list__m_stmt, we | ||
* are in a simpler world where the list of stmts will not grow. | ||
*) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 for explaining the context and the intent |
||
match xsa, xsb with | ||
| [], [] -> | ||
return () | ||
(*s: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *) | ||
(* less-is-ok: | ||
* it's ok to have statements after in the concrete code as long as we | ||
* matched all the statements in the pattern (there is an implicit | ||
* '...' at the end, in addition to implicit '...' at the beginning | ||
* handled by kstmts calling the pattern for each subsequences). | ||
* TODO: sgrep_generic though then display the whole sequence as a match | ||
* instead of just the relevant part. | ||
*) | ||
| [], _::_ -> | ||
return () | ||
(*e: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *) | ||
Comment on lines
+1285
to
+1298
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the motivation to separate these two cases for documentation? (vs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's just more precise. there is already a case above for [], [], so [], _ below would be more general that it needs to be. |
||
|
||
(* dots: '...', can also match no statement *) | ||
| [A.ExprStmt (A.Ellipsis _i)], [] -> | ||
return () | ||
|
||
| (A.ExprStmt (A.Ellipsis i))::xsa, xb::xsb -> | ||
(* let's first try the without going deep *) | ||
( | ||
(* can match nothing *) | ||
(m_list__m_stmt xsa (xb::xsb)) >||> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how does one get documentation on
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW: http://symbolhound.com/?q=%3E%7C%7C%3E+ocaml http://symbolhound.com/?q=%3E%21%3E+ocaml both return no results. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. following There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's defined in Matching_generic.ml, which is 'open'ed at the beginning of the file. |
||
(* can match more *) | ||
(env_add_matched_stmt xb >>= (fun () -> | ||
(m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb) | ||
)) | ||
) >!> (fun () -> | ||
if !Flag.go_deeper_stmt | ||
then | ||
let xsb' = SubAST_generic.flatten_substmts_of_stmts (xb::xsb) in | ||
m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb' | ||
else fail () | ||
) | ||
|
||
(* the general case *) | ||
| xa::aas, xb::bbs -> | ||
m_stmt xa xb >>= (fun () -> | ||
env_add_matched_stmt xb >>= (fun () -> | ||
m_stmts_deep aas bbs | ||
)) | ||
| _::_, _ -> | ||
fail () | ||
|
||
(*e: function [[Generic_vs_generic.m_stmts_deep]] *) | ||
|
||
and _m_stmts (xsa: A.stmt list) (xsb: A.stmt list) = | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,19 +58,27 @@ type ('a, 'b) matcher = 'a -> 'b -> | |
(*****************************************************************************) | ||
|
||
(*s: function [[Semgrep_generic.match_e_e]] *) | ||
let match_e_e pattern e = | ||
let match_e_e2 pattern e = | ||
let env = Matching_generic.empty_environment () in | ||
GG.m_expr pattern e env | ||
(*e: function [[Semgrep_generic.match_e_e]] *) | ||
let match_e_e ruleid a b = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about naming the wrapper I know I was rather confused by the |
||
Common.profile_code "Semgrep.match_e_e" (fun () -> | ||
Common.profile_code ("rule:" ^ ruleid) (fun () -> | ||
match_e_e2 a b)) | ||
|
||
(*s: function [[Semgrep_generic.match_st_st]] *) | ||
let match_st_st pattern e = | ||
let match_st_st2 pattern e = | ||
let env = Matching_generic.empty_environment () in | ||
GG.m_stmt pattern e env | ||
(*e: function [[Semgrep_generic.match_st_st]] *) | ||
let match_st_st ruleid a b = | ||
Common.profile_code "Semgrep.match_st_st" (fun () -> | ||
Common.profile_code ("rule:" ^ ruleid) (fun () -> | ||
match_st_st2 a b)) | ||
|
||
(*s: function [[Semgrep_generic.match_sts_sts]] *) | ||
let match_sts_sts pattern e = | ||
let match_sts_sts2 pattern e = | ||
let env = Matching_generic.empty_environment () in | ||
(* When matching statements, we need not only to report whether | ||
* there is match, but also the actual statements that were matched. | ||
|
@@ -106,6 +114,10 @@ let match_sts_sts pattern e = | |
| _ -> raise Impossible | ||
) | ||
(*e: function [[Semgrep_generic.match_sts_sts]] *) | ||
let match_sts_sts ruleid a b = | ||
Common.profile_code "Semgrep.match_sts_sts" (fun () -> | ||
Common.profile_code ("rule:" ^ ruleid) (fun () -> | ||
match_sts_sts2 a b)) | ||
|
||
(*s: function [[Semgrep_generic.match_any_any]] *) | ||
(* for unit testing *) | ||
|
@@ -119,11 +131,11 @@ let match_any_any pattern e = | |
(*****************************************************************************) | ||
|
||
(*s: function [[Semgrep_generic.match_e_e_for_equivalences]] *) | ||
let match_e_e_for_equivalences a b = | ||
let match_e_e_for_equivalences ruleid a b = | ||
Common.save_excursion Flag.equivalence_mode true (fun () -> | ||
Common.save_excursion Flag.go_deeper_expr false (fun () -> | ||
Common.save_excursion Flag.go_deeper_stmt false (fun () -> | ||
match_e_e a b | ||
match_e_e ruleid a b | ||
))) | ||
(*e: function [[Semgrep_generic.match_e_e_for_equivalences]] *) | ||
|
||
|
@@ -159,7 +171,7 @@ let subst_e (bindings: MV.metavars_binding) e = | |
(*****************************************************************************) | ||
|
||
(*s: function [[Semgrep_generic.apply_equivalences]] *) | ||
let apply_equivalences equivs any = | ||
let apply_equivalences2 equivs any = | ||
let expr_rules = ref [] in | ||
let stmt_rules = ref [] in | ||
|
||
|
@@ -191,7 +203,8 @@ let apply_equivalences equivs any = | |
| [] -> x' | ||
| (l, r)::xs -> | ||
(* look for a match on original x, not x' *) | ||
let matches_with_env = match_e_e_for_equivalences l x in | ||
let matches_with_env = match_e_e_for_equivalences "<equivalence>" | ||
l x in | ||
(match matches_with_env with | ||
(* todo: should generate a Disj for each possibilities? *) | ||
| env::_xs -> | ||
|
@@ -215,7 +228,9 @@ let apply_equivalences equivs any = | |
} in | ||
visitor.M.vany any | ||
(*e: function [[Semgrep_generic.apply_equivalences]] *) | ||
|
||
let apply_equivalences a b = | ||
Common.profile_code "Semgrep.apply_equivalences" (fun () -> | ||
apply_equivalences2 a b) | ||
|
||
(*****************************************************************************) | ||
(* Main entry point *) | ||
|
@@ -257,7 +272,7 @@ let check2 ~hook rules equivs file lang ast = | |
* against an expression recursively | ||
*) | ||
!expr_rules |> List.iter (fun (pattern, rule) -> | ||
let matches_with_env = match_e_e pattern x in | ||
let matches_with_env = match_e_e rule.R.id pattern x in | ||
if matches_with_env <> [] | ||
then (* Found a match *) | ||
matches_with_env |> List.iter (fun env -> | ||
|
@@ -275,7 +290,7 @@ let check2 ~hook rules equivs file lang ast = | |
(* mostly copy paste of expr code but with the _st functions *) | ||
V.kstmt = (fun (k, _) x -> | ||
!stmt_rules |> List.iter (fun (pattern, rule) -> | ||
let matches_with_env = match_st_st pattern x in | ||
let matches_with_env = match_st_st rule.R.id pattern x in | ||
if matches_with_env <> [] | ||
then (* Found a match *) | ||
matches_with_env |> List.iter (fun env -> | ||
|
@@ -295,7 +310,7 @@ let check2 ~hook rules equivs file lang ast = | |
* the heavy stuff (e.g., handling '...' between statements) rarely. | ||
*) | ||
!stmts_rules |> List.iter (fun (pattern, rule) -> | ||
let matches_with_env = match_sts_sts pattern x in | ||
let matches_with_env = match_sts_sts rule.R.id pattern x in | ||
if matches_with_env <> [] | ||
then (* Found a match *) | ||
matches_with_env |> List.iter (fun (env, matched_statements) -> | ||
|
@@ -322,9 +337,7 @@ let check2 ~hook rules equivs file lang ast = | |
(*e: function [[Semgrep_generic.check2]] *) | ||
|
||
(*s: function [[Semgrep_generic.check]] *) | ||
let check ~hook rules equivs file lang = | ||
Common.profile_code "Sgrep_generic.check" ( | ||
fun () -> check2 ~hook rules equivs file lang | ||
) | ||
let check ~hook a b c d e = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW, I prefer having the labeled arguments here 🤷 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True, it's just that those Common.profile_code are just hacks because there's no super easy way to profile code. In theory I should just run ocamlprof and get nice stats, but I like the focused profile that allows Common.profile_code. Then I want to mimimize the amount of modifications I have to do to the program to support this non-functional property (profiling), so I do that. A better way probably would be to use the recent OCaml attribute to do that, have something like [@@ profile] let check a b c d = ... Maybe @mjambon knows a good ppx rewriter that support that. |
||
Common.profile_code "Semgrep.check" (fun () -> check2 ~hook a b c d e) | ||
(*e: function [[Semgrep_generic.check]] *) | ||
(*e: semgrep/matching/Semgrep_generic.ml *) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rcoh maybe this was the issue. Maybe you were running the ocaml programs with profiling information but
because of -j the job was actually done in another process ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that occurred to me after I read that multi threading in OCaml is actually multiprocessing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, OCaml has concurrent threads (Xavier Leroy the author of OCaml actually added the first POSIX C thead library for Linux a long time ago, and he did it because he wanted threads in OCaml), but it does not have yet multi-core threads. There is work ongoing to suppor that.
Note that neither Python/PHP/Ruby/... have multi-core threads either.