semgrep · minusworld · Mar 26, 2021 · Mar 25, 2021 · Mar 25, 2021 · Mar 26, 2021
diff --git a/semgrep/semgrep/cli.py b/semgrep/semgrep/cli.py
@@ -324,6 +324,11 @@ def cli() -> None:
             "containing a 'nosem' comment at the end."
         ),
     )
+    output.add_argument(
+        "--experimental",
+        action="store_true",
+        help="Pass rules directly to Semgrep core. This will use the logic evaluation available in Semgrep core.",
+    )
 
     output.add_argument(
         MAX_LINES_FLAG_NAME,
@@ -476,4 +481,5 @@ def cli() -> None:
                 skip_unknown_extensions=args.skip_unknown_extensions,
                 severity=args.severity,
                 report_time=args.json_time,
+                experimental=args.experimental,
             )
diff --git a/semgrep/semgrep/core_runner.py b/semgrep/semgrep/core_runner.py
@@ -535,10 +535,97 @@ def _run_rules(
             match_time_matrix,
         )
 
+    def _run_rules_direct_to_semgrep_core(
+        self,
+        rules: List[Rule],
+        target_manager: TargetManager,
+        profiler: ProfileManager,
+    ) -> Tuple[
+        Dict[Rule, List[RuleMatch]],
+        Dict[Rule, List[Any]],
+        List[SemgrepError],
+        Set[Path],
+        Dict[Any, Any],
+    ]:
+        """
+        Run every rule through semgrep-core directly, once per rule language.
+
+        The raw rule and the target list are written to temporary files whose
+        paths are passed to semgrep-core via -config and -target_file.
+        `profiler` is unused here; invoke_semgrep passes it to either runner.
+
+        Returns (matches by rule, {}, errors, target_manager's targets, {}).
+        """
+        from collections import defaultdict
+
+        outputs: Dict[Rule, List[RuleMatch]] = defaultdict(list)
+        errors: List[SemgrepError] = []
+        rule_language_pairs = [
+            (rule, language) for rule in rules for language in rule.languages
+        ]
+        for rule, language in rule_language_pairs:
+            with tempfile.NamedTemporaryFile(
+                "w", suffix=".yaml"
+            ) as rule_file, tempfile.NamedTemporaryFile("w") as target_file:
+                targets = self.get_files_for_language(language, rule, target_manager)
+                target_file.write("\n".join(map(str, targets)))
+                # Flush so semgrep-core sees the content when it opens the path.
+                target_file.flush()
+                yaml = YAML()
+                yaml.dump({"rules": [rule._raw]}, rule_file)
+                rule_file.flush()
+
+                cmd = [
+                    SEMGREP_PATH,
+                    "-lang",
+                    language,
+                    "-fast",
+                    "-json",
+                    "-config",
+                    rule_file.name,
+                    "-j",
+                    str(self._jobs),
+                    "-target_file",
+                    target_file.name,
+                    "-timeout",
+                    str(self._timeout),
+                    "-max_memory",
+                    str(self._max_memory),
+                ]
+                r = sub_run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                output_json = self._parse_core_output(r.stdout, r.stderr, r.returncode)
+                if r.returncode != 0:
+                    if "error" in output_json:
+                        self._raise_semgrep_error_from_json(output_json, [], rule)
+                    else:
+                        raise SemgrepError(
+                            f"unexpected json output while invoking semgrep-core with rule '{rule.id}':\n{PLEASE_FILE_ISSUE_TEXT}"
+                        )
+
+            outputs[rule].extend(
+                RuleMatch.from_pattern_match(
+                    rule.id,
+                    PatternMatch(pattern_match),
+                    message=rule.message,
+                    metadata=rule.metadata,
+                    severity=rule.severity,
+                    fix=rule.fix,
+                    fix_regex=rule.fix_regex,
+                )
+                for pattern_match in output_json["matches"]
+            )
+            errors.extend(
+                CoreException.from_json(e, language, rule.id).into_semgrep_error()
+                for e in output_json["errors"]
+            )
+
+        return outputs, {}, errors, set(Path(p) for p in target_manager.targets), {}
+
     def invoke_semgrep(
         self,
         target_manager: TargetManager,
         rules: List[Rule],
+        experimental: bool = False,
     ) -> Tuple[
         Dict[Rule, List[RuleMatch]],
         Dict[Rule, List[Dict[str, Any]]],
@@ -553,13 +640,16 @@ def invoke_semgrep(
         start = datetime.now()
         profiler = ProfileManager()
 
+        runner_fxn = (
+            self._run_rules_direct_to_semgrep_core if experimental else self._run_rules
+        )
         (
             findings_by_rule,
             debug_steps_by_rule,
             errors,
             all_targets,
             match_time_matrix,
-        ) = self._run_rules(rules, target_manager, profiler)
+        ) = runner_fxn(rules, target_manager, profiler)
 
         logger.debug(
             f"semgrep ran in {datetime.now() - start} on {len(all_targets)} files"

diff --git a/semgrep/semgrep/semgrep_main.py b/semgrep/semgrep/semgrep_main.py
@@ -183,6 +183,7 @@ def main(
     skip_unknown_extensions: bool = False,
     severity: Optional[List[str]] = None,
     report_time: bool = False,
+    experimental: bool = False,
 ) -> None:
     if include is None:
         include = []
@@ -263,7 +264,7 @@ def main(
         timeout_threshold=timeout_threshold,
         report_time=report_time,
     ).invoke_semgrep(
-        target_manager, filtered_rules
+        target_manager, filtered_rules, experimental
     )
 
     output_handler.handle_semgrep_errors(semgrep_errors)

diff --git a/semgrep/semgrep/test.py b/semgrep/semgrep/test.py
@@ -272,6 +272,7 @@ def generate_file_pairs(
     unsafe: bool,
     json_output: bool,
     save_test_output_tar: bool = True,
+    experimental: bool = False,
 ) -> None:
     configs = list(config.rglob("*"))
     targets = list(target.rglob("*"))
@@ -303,6 +304,7 @@ def generate_file_pairs(
         no_rewrite_rule_ids=True,
         strict=strict,
         dangerously_allow_arbitrary_code_execution_from_rules=unsafe,
+        experimental=experimental,
     )
     with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
         results = pool.starmap(invoke_semgrep_fn, config_with_tests)
@@ -435,4 +437,5 @@ def test_main(args: argparse.Namespace) -> None:
         args.dangerously_allow_arbitrary_code_execution_from_rules,
         args.json,
         args.save_test_output_tar,
+        experimental=args.experimental,
     )