semgrep · aryx · May 29, 2020 · May 28, 2020 · May 28, 2020 · May 28, 2020
diff --git a/docs/development.md b/docs/development.md
@@ -97,6 +97,60 @@ Set the OCAMLRUNPARAM environment variable to 'b' for backtrace. You will get be
 export OCAMLRUNPARAM=b
 ```
 
+## Profiling code
+
+You can pass the -profile command-line argument to semgrep-core to get
+a short profile of the code, for example:
+``` bash
+cd semgrep-core
+./_build/default/bin/Main.exe -profile -e foo tests/python
+---------------------
+profiling result
+---------------------
+Main total                               :      1.975 sec          1 count
+Parse_python.parse                       :      0.828 sec          1 count
+...
+```
+
+Alternatively, set the environment variable SEMGREP_CORE_PROFILE (to any value, e.g. 1) to get the same information:
+
+``` bash
+cd semgrep-core
+export SEMGREP_CORE_PROFILE=1
+./_build/default/bin/Main.exe -e foo tests/python
+---------------------
+profiling result
+---------------------
+Main total                               :      1.975 sec          1 count
+Parse_python.parse                       :      0.828 sec          1 count
+...
+```
+
+This is especially useful when you don't call semgrep-core directly, but
+instead go through the Python wrapper, semgrep.
+
+You can also use the SEMGREP_CORE_DEBUG environment variable to add debugging
+information, for example:
+```bash
+export SEMGREP_CORE_DEBUG=1
+export SEMGREP_CORE_PROFILE=1
+pipenv run semgrep -f ../semgrep-core/tests/PERF/ajin.yaml ../semgrep-core/tests/PERF/three.js
+Debug mode On
+Executed as: semgrep-core -lang javascript -rules_file /tmp/tmpy5pzp3p_ -j 8 ../semgrep-core/tests/PERF/three.js
+Profile mode On
+disabling -j when in profiling mode
+PARSING: ../semgrep-core/tests/PERF/three.js
+saving rules file for debugging in: /tmp/semgrep_core_rule-97ae74.yaml
+---------------------
+profiling result
+---------------------
+Main total                               :      1.975 sec          1 count
+Parse_js.parse                           :      0.828 sec          1 count
+Semgrep.check                            :      0.791 sec          1 count
+Semgrep.match_sts_sts                    :      0.559 sec     185064 count
+...
+```
+
 ## Testing
 
 ### semgrep-core

diff --git a/semgrep-core/bin/Main.ml b/semgrep-core/bin/Main.ml
@@ -48,12 +48,20 @@ module J = Json_type
 (* Flags *)
 (*****************************************************************************)
 
+(* Set these environment variables (to any value) to enable debugging/profiling
+ * instead of passing -debug or -profile. This is useful when you don't call
+ * semgrep-core directly but instead go through the semgrep Python wrapper.
+ *)
+let env_debug = "SEMGREP_CORE_DEBUG"
+let env_profile = "SEMGREP_CORE_PROFILE"
+
 (*s: constant [[Main_semgrep_core.verbose]] *)
 let verbose = ref false
 (*e: constant [[Main_semgrep_core.verbose]] *)
 (*s: constant [[Main_semgrep_core.debug]] *)
 let debug = ref false
 (*e: constant [[Main_semgrep_core.debug]] *)
+let profile = ref false
 (*s: constant [[Main_semgrep_core.error_recovery]] *)
 (* try to continue processing files, even if one has a parse error with -e/f.
  * note that -rules_file does its own error recovery.
@@ -420,6 +428,7 @@ let iter_generic_ast_of_files_and_get_matches_and_exn_to_errors f files =
            | [] ->
               failwith (spf "can not extract generic AST from %s" file)
          in
+         if !debug then pr2 (spf "PARSING: %s" file);
          let ast = parse_generic lang file in
 
          (* calling the hook *)
@@ -540,6 +549,16 @@ let sgrep_with_rules rules_file xs =
   print_matches_and_errors files matches errs
 (*e: function [[Main_semgrep_core.sgrep_with_rules]] *)
 
+(* When called from semgrep-python, error messages in semgrep-core and
+ * certain profiling statistics may refer to rule ids that are generated
+ * by semgrep-python, making it hard to know what the problem is.
+ * At least we can save this generated rules file to help debugging.
+ *)
+let save_rules_file_in_tmp () =
+  let tmp = Filename.temp_file "semgrep_core_rule-" ".yaml" in
+  pr2 (spf "saving rules file for debugging in: %s" tmp);
+  Common.write_file ~file:tmp (Common.read_file !rules_file)
+
 (*****************************************************************************)
 (* Semgrep -tainting_rules_file *)
 (*****************************************************************************)
@@ -786,7 +805,15 @@ let options () =
   (*s: [[Main_semgrep_core.options]] concatenated flags *)
   Flag_parsing_cpp.cmdline_flags_macrofile () @
   (*x: [[Main_semgrep_core.options]] concatenated flags *)
-  Common2.cmdline_flags_devel () @
+  (* inlining of: Common2.cmdline_flags_devel () @ *)
+  [ "-debugger",         Arg.Set Common.debugger,
+    " option to set if launched inside ocamldebug";
+    "-profile",          Arg.Unit (fun () ->
+        Common.profile := Common.ProfAll;
+        profile := true;
+    ),
+    " output profiling information";
+  ] @
   (*x: [[Main_semgrep_core.options]] concatenated flags *)
   Meta_parse_info.cmdline_flags_precision () @
   (*x: [[Main_semgrep_core.options]] concatenated flags *)
@@ -843,8 +870,25 @@ let main () =
     spf "Usage: %s [options] <pattern> <files_or_dirs> \nOptions:"
       (Filename.basename Sys.argv.(0))
   in
+  (* env_debug/env_profile, when set (to any value), act as -debug/-profile *)
+  let argv =
+   (Array.to_list Sys.argv) @
+   (if Sys.getenv_opt env_debug <> None then ["-debug"] else[])@
+   (if Sys.getenv_opt env_profile <> None then ["-profile"] else[])
+  in
+
   (* does side effect on many global flags *)
-  let args = Common.parse_options (options()) usage_msg Sys.argv in
+  let args = Common.parse_options (options()) usage_msg (Array.of_list argv) in
+
+  if !debug then begin
+    pr2 "Debug mode On";
+    pr2 (spf "Executed as: %s" (Sys.argv|>Array.to_list|> String.concat " "));
+  end;
+  if !profile then begin
+    pr2 "Profile mode On";
+    pr2 "disabling -j when in profiling mode";
+    ncores := 1;
+  end;
 
   (* must be done after Arg.parse, because Common.profile is set by it *)
   Common.profile_code "Main total" (fun () ->
@@ -868,13 +912,11 @@ let main () =
         (match () with
         (*s: [[Main_semgrep_core.main()]] main entry match cases *)
         | _ when !rules_file <> "" ->
-           (try  sgrep_with_rules !rules_file (x::xs)
+           (try
+               sgrep_with_rules !rules_file (x::xs);
+               if !profile then save_rules_file_in_tmp ();
             with exn -> begin
-             if Sys.getenv_opt "SEMGREP_CORE_DEBUG" <> None then begin
-               let tmp = Filename.temp_file "semgrep_core_rule-" ".yaml" in
-               pr2 (spf "saving rule file leading to the error in: %s" tmp);
-               Common.write_file ~file:tmp (Common.read_file !rules_file);
-             end;
+             if !debug then save_rules_file_in_tmp ();
              pr (format_output_exception exn);
              exit 2
              end

diff --git a/semgrep-core/matching/Generic_vs_generic.ml b/semgrep-core/matching/Generic_vs_generic.ml
@@ -1259,13 +1259,74 @@ and m_other_attribute_operator = m_other_xxx
 
 (*s: function [[Generic_vs_generic.m_stmts_deep]] *)
 and m_stmts_deep (xsa: A.stmt list) (xsb: A.stmt list) =
-  if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa)
-  then
-    m_list__m_stmt xsa xsb >!> (fun () ->
-      let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in
-      m_list__m_stmt xsa xsb'
-    )
-  else m_list__m_stmt xsa xsb
+  (* opti: this was the old code:
+   *   if !Flag.go_deeper_stmt && (has_ellipsis_stmts xsa)
+   *   then
+   *   m_list__m_stmt xsa xsb >!> (fun () ->
+   *     let xsb' = SubAST_generic.flatten_substmts_of_stmts xsb in
+   *     m_list__m_stmt xsa xsb'
+   *   )
+   *   else m_list__m_stmt xsa xsb
+   *
+   * but this was really slow on huge files because with a pattern like
+   * 'foo(); ...; bar();' we would call flatten_substmts_of_stmts
+   * on every statement sequence in the program, even when foo(); had
+   * not matched first.
+   * Better to first match the first element, and if it matches and
+   * we have a '...' that was not matched on the current sequence,
+   * then we try with flatten_substmts_of_stmts.
+   *
+   * The code below is mostly a copy paste of m_list__m_stmt. We could
+   * factorize, but I prefer to control and limit the number of places
+   * where we call m_stmts_deep. Once we call m_list__m_stmt, we
+   * are in a simpler world where the list of stmts will not grow.
+   *)
+  match xsa, xsb with
+  | [], [] ->
+      return ()
+  (*s: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *)
+  (* less-is-ok:
+   * it's ok to have statements after in the concrete code as long as we
+   * matched all the statements in the pattern (there is an implicit
+   * '...' at the end, in addition to implicit '...' at the beginning
+   * handled by kstmts calling the pattern for each subsequences).
+   * TODO: sgrep_generic though then display the whole sequence as a match
+   * instead of just the relevant part.
+   *)
+  | [], _::_ ->
+      return ()
+  (*e: [[Generic_vs_generic.m_list__m_stmt()]] empty list vs list case *)
+
+  (* dots: '...', can also match no statement *)
+  | [A.ExprStmt (A.Ellipsis _i)], [] ->
+      return ()
+
+  | (A.ExprStmt (A.Ellipsis i))::xsa, xb::xsb ->
+    (* let's first try without going deep *)
+     (
+      (* can match nothing *)
+      (m_list__m_stmt xsa (xb::xsb)) >||>
+      (* can match more *)
+      (env_add_matched_stmt xb >>= (fun () ->
+       (m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb)
+      ))
+     ) >!> (fun () ->
+        if !Flag.go_deeper_stmt
+        then
+          let xsb' = SubAST_generic.flatten_substmts_of_stmts (xb::xsb) in
+          m_list__m_stmt ((A.ExprStmt (A.Ellipsis i))::xsa) xsb'
+        else fail ()
+     )
+
+  (* the general case *)
+  | xa::aas, xb::bbs ->
+      m_stmt xa xb >>= (fun () ->
+        env_add_matched_stmt xb >>= (fun () ->
+        m_stmts_deep aas bbs
+      ))
+  | _::_, _ ->
+      fail ()
+
 (*e: function [[Generic_vs_generic.m_stmts_deep]] *)
 
 and _m_stmts (xsa: A.stmt list) (xsb: A.stmt list) =

diff --git a/semgrep-core/matching/Semgrep_generic.ml b/semgrep-core/matching/Semgrep_generic.ml
@@ -58,19 +58,27 @@ type ('a, 'b) matcher = 'a -> 'b ->
 (*****************************************************************************)
 
 (*s: function [[Semgrep_generic.match_e_e]] *)
-let match_e_e pattern e =
+let match_e_e2 pattern e =
   let env = Matching_generic.empty_environment () in
   GG.m_expr pattern e env
 (*e: function [[Semgrep_generic.match_e_e]] *)
+let match_e_e ruleid a b =
+ Common.profile_code "Semgrep.match_e_e" (fun () ->
+    Common.profile_code ("rule:" ^ ruleid) (fun () ->
+      match_e_e2 a b))
 
 (*s: function [[Semgrep_generic.match_st_st]] *)
-let match_st_st pattern e =
+let match_st_st2 pattern e =
   let env = Matching_generic.empty_environment () in
   GG.m_stmt pattern e env
 (*e: function [[Semgrep_generic.match_st_st]] *)
+let match_st_st ruleid a b =
+  Common.profile_code "Semgrep.match_st_st" (fun () ->
+    Common.profile_code ("rule:" ^ ruleid) (fun () ->
+      match_st_st2 a b))
 
 (*s: function [[Semgrep_generic.match_sts_sts]] *)
-let match_sts_sts pattern e =
+let match_sts_sts2 pattern e =
   let env = Matching_generic.empty_environment () in
   (* When matching statements, we need not only to report whether
    * there is match, but also the actual statements that were matched.
@@ -106,6 +114,10 @@ let match_sts_sts pattern e =
     | _ -> raise Impossible
   )
 (*e: function [[Semgrep_generic.match_sts_sts]] *)
+let match_sts_sts ruleid a b =
+  Common.profile_code "Semgrep.match_sts_sts" (fun () ->
+    Common.profile_code ("rule:" ^ ruleid) (fun () ->
+      match_sts_sts2 a b))
 
 (*s: function [[Semgrep_generic.match_any_any]] *)
 (* for unit testing *)
@@ -119,11 +131,11 @@ let match_any_any pattern e =
 (*****************************************************************************)
 
 (*s: function [[Semgrep_generic.match_e_e_for_equivalences]] *)
-let match_e_e_for_equivalences a b =
+let match_e_e_for_equivalences ruleid a b =
   Common.save_excursion Flag.equivalence_mode true (fun () ->
   Common.save_excursion Flag.go_deeper_expr false (fun () ->
   Common.save_excursion Flag.go_deeper_stmt false (fun () ->
-    match_e_e a b
+    match_e_e ruleid a b
   )))
 (*e: function [[Semgrep_generic.match_e_e_for_equivalences]] *)
 
@@ -159,7 +171,7 @@ let subst_e (bindings: MV.metavars_binding) e =
 (*****************************************************************************)
 
 (*s: function [[Semgrep_generic.apply_equivalences]] *)
-let apply_equivalences equivs any =
+let apply_equivalences2 equivs any =
   let expr_rules = ref [] in
   let stmt_rules = ref [] in
 
@@ -191,7 +203,8 @@ let apply_equivalences equivs any =
          | [] -> x'
          | (l, r)::xs ->
            (* look for a match on original x, not x' *)
-           let matches_with_env = match_e_e_for_equivalences l x in
+           let matches_with_env = match_e_e_for_equivalences "<equivalence>"
+                    l x in
            (match matches_with_env with
            (* todo: should generate a Disj for each possibilities? *)
            | env::_xs ->
@@ -215,7 +228,9 @@ let apply_equivalences equivs any =
    } in
   visitor.M.vany any
 (*e: function [[Semgrep_generic.apply_equivalences]] *)
-
+let apply_equivalences a b =
+  Common.profile_code "Semgrep.apply_equivalences" (fun () ->
+      apply_equivalences2 a b)
 
 (*****************************************************************************)
 (* Main entry point *)
@@ -257,7 +272,7 @@ let check2 ~hook rules equivs file lang ast =
        * against an expression recursively
        *)
       !expr_rules |> List.iter (fun (pattern, rule) ->
-         let matches_with_env = match_e_e pattern x in
+         let matches_with_env = match_e_e rule.R.id pattern x in
          if matches_with_env <> []
          then (* Found a match *)
            matches_with_env |> List.iter (fun env ->
@@ -275,7 +290,7 @@ let check2 ~hook rules equivs file lang ast =
     (* mostly copy paste of expr code but with the _st functions *)
     V.kstmt = (fun (k, _) x ->
       !stmt_rules |> List.iter (fun (pattern, rule) ->
-         let matches_with_env = match_st_st pattern x in
+         let matches_with_env = match_st_st rule.R.id pattern x in
          if matches_with_env <> []
          then (* Found a match *)
            matches_with_env |> List.iter (fun env ->
@@ -295,7 +310,7 @@ let check2 ~hook rules equivs file lang ast =
        * the heavy stuff (e.g., handling '...' between statements) rarely.
        *)
       !stmts_rules |> List.iter (fun (pattern, rule) ->
-         let matches_with_env = match_sts_sts pattern x in
+         let matches_with_env = match_sts_sts rule.R.id pattern x in
          if matches_with_env <> []
          then (* Found a match *)
            matches_with_env |> List.iter (fun (env, matched_statements) ->
@@ -322,9 +337,7 @@ let check2 ~hook rules equivs file lang ast =
 (*e: function [[Semgrep_generic.check2]] *)
 
 (*s: function [[Semgrep_generic.check]] *)
-let check ~hook rules equivs file lang =
-  Common.profile_code "Sgrep_generic.check" (
-    fun () -> check2 ~hook rules equivs file lang
-  )
+let check ~hook a b c d e =
+  Common.profile_code "Semgrep.check" (fun () -> check2 ~hook a b c d e)
 (*e: function [[Semgrep_generic.check]] *)
 (*e: semgrep/matching/Semgrep_generic.ml *)
diff --git a/semgrep-core/matching/Semgrep_generic.mli b/semgrep-core/matching/Semgrep_generic.mli
@@ -20,7 +20,7 @@ type ('a, 'b) matcher = 'a -> 'b ->
 (* used by tainting *)
 
 (*s: signature [[Semgrep_generic.match_e_e]] *)
-val match_e_e: (AST_generic.expr, AST_generic.expr) matcher
+val match_e_e: string -> (AST_generic.expr, AST_generic.expr) matcher
 (*e: signature [[Semgrep_generic.match_e_e]] *)
 
 (*s: signature [[Semgrep_generic.match_any_any]] *)