Add more pre-commit hooks (#4943)
* Add more pre-commit hooks
* Update pre-commit hooks
* Fix pre-commit errors
underyx committed Apr 5, 2022
1 parent 1408398 commit 8b7d9e3
Showing 45 changed files with 169 additions and 214 deletions.
51 changes: 42 additions & 9 deletions .pre-commit-config.yaml
@@ -1,21 +1,37 @@
exclude: "^semgrep/tests/e2e/(targets|snapshots)|semgrep-core/tests"
exclude: "^semgrep/tests/e2e/(targets|snapshots)|^semgrep-core/tests|^semgrep/semgrep/external|\\binvalid\\b"

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
hooks:
- id: check-yaml
args: [--allow-multiple-documents]
exclude: ^semgrep\/tests\/.+$|^perf\/bench\/gitlab-rules\/.+$$
- id: end-of-file-fixer
- id: trailing-whitespace
args: [--markdown-linebreak-ext=md]
- id: check-case-conflict
- id: check-added-large-files
- id: check-ast
- id: check-builtin-literals
- id: check-case-conflict
- id: check-docstring-first
- id: check-executables-have-shebangs
- id: check-json
- id: check-merge-conflict
- id: check-shebang-scripts-are-executable
- id: check-symlinks
- id: check-toml
- id: check-vcs-permalinks
- id: check-xml
- id: check-yaml
args: [--allow-multiple-documents]
- id: debug-statements
- id: destroyed-symlinks
- id: detect-private-key
- id: end-of-file-fixer
- id: fix-byte-order-marker
- id: fix-encoding-pragma
args: [--remove]
- id: mixed-line-ending
args: [--fix=lf]
- id: no-commit-to-branch
- id: requirements-txt-fixer
- id: trailing-whitespace
args: [--markdown-linebreak-ext=md]

- repo: https://github.com/psf/black
rev: 22.3.0
@@ -27,7 +43,13 @@ repos:
rev: v3.0.1
hooks:
- id: reorder-python-imports
args: [--py37-plus]
args: ["--application-directories=.:semgrep", --py37-plus]

- repo: https://github.com/asottile/pyupgrade
rev: v2.31.1
hooks:
- id: pyupgrade
args: ["--py37-plus"]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v0.942"
@@ -66,6 +88,17 @@ repos:
additional_dependencies: ["flake8-bugbear==22.1.11"]
args: ["--select=B,E9,F4,F63,F7,F82"]

- repo: https://github.com/myint/autoflake
rev: v1.4
hooks:
- id: autoflake
args:
- --in-place
- --remove-unused-variables
- --remove-duplicate-keys
- --remove-all-unused-imports
- --ignore-init-module-imports

- repo: https://github.com/returntocorp/semgrep
rev: "0.86.5-tmp-fix-pre-commit"
hooks:
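Most of the file-by-file changes below are mechanical rewrites produced by the newly added hooks, chiefly pyupgrade and autoflake. A minimal sketch of the rewrites pyupgrade --py37-plus applies (an assumption based on pyupgrade's documented behavior; the pairs mirror hunks later in this commit):

# Patterns pyupgrade --py37-plus rewrites (sketch, not exhaustive):
#   set(x for x in items)         -> {x for x in items}
#   "failed ({0})".format(name)   -> f"failed ({name})"
#   class Foo(object):            -> class Foo:
#   super(Foo, self).method()     -> super().method()
#   open(path, "r")               -> open(path)
name = "example"
assert "failed ({0})".format(name) == f"failed ({name})"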
3 changes: 1 addition & 2 deletions .vscode/tasks.json
@@ -1,6 +1,5 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"_comment": "See https://go.microsoft.com/fwlink/?LinkId=733558 for the documentation about the tasks.json format",
"version": "2.0.0",
"tasks": [
{
4 changes: 2 additions & 2 deletions perf/compare-bench-findings
@@ -15,8 +15,8 @@ FINDINGS_SNAPSHOT_PATH = "snapshots/benchmark_findings.json"

def findings_differ(expected: Dict[str, Any], findings: Dict[str, Any]) -> bool:
name = findings["name"]
baseline = set(json.dumps(result) for result in expected["findings"]["results"])
latest = set(json.dumps(result) for result in findings["findings"]["results"])
baseline = {json.dumps(result) for result in expected["findings"]["results"]}
latest = {json.dumps(result) for result in findings["findings"]["results"]}

def output_diff(diff: set) -> None:
for d in sorted(diff):
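The set() call wrapping a generator becomes a set comprehension; both build the same set, the comprehension just skips the extra name lookup and call. A minimal sketch with made-up results:

import json

# Duplicate findings collapse identically under both spellings.
results = [{"check": "a"}, {"check": "b"}, {"check": "a"}]
assert {json.dumps(r) for r in results} == set(json.dumps(r) for r in results)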
10 changes: 5 additions & 5 deletions perf/config.py
@@ -28,13 +28,13 @@


@define
class Repository(object):
class Repository:
url: str = field(default="")
commit_hash: str = field(default="HEAD")


@define
class RuleConfig(object):
class RuleConfig:
config_str: str = field(default="")

def _fetch_rule_config_from_url(self, rule_config_url: str) -> Optional[str]:
@@ -175,7 +175,7 @@ def resolve_to_cache(self, cache_path: Path) -> None:


@define
class BenchmarkRunSetupData(object):
class BenchmarkRunSetupData:
"""
Stores data about an individual benchmark run
"""
@@ -187,7 +187,7 @@ class BenchmarkRunSetupData(object):


@define
class SemgrepBenchmarkConfig(object):
class SemgrepBenchmarkConfig:
"""
Stores data needed to start a benchmarking run.
@@ -215,7 +215,7 @@ def parse_config(
cls: Type["SemgrepBenchmarkConfig"], config_file: Path
) -> "SemgrepBenchmarkConfig":
logger.debug(f"Using config at {config_file.absolute()}")
with open(config_file, "r") as fin:
with open(config_file) as fin:
config = yaml.load(fin)

return SemgrepBenchmarkConfig(
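Two Python 3 idioms drive the perf/config.py hunks: an explicit (object) base is redundant because every class already inherits from object, and open() defaults to mode "r". A quick sketch of both:

import inspect

class WithBase(object):   # pre-rewrite spelling
    pass

class WithoutBase:        # post-rewrite spelling, identical semantics
    pass

assert WithBase.__mro__[-1] is object and WithoutBase.__mro__[-1] is object

# open(path) == open(path, "r"): "r" is the documented default mode.
assert inspect.signature(open).parameters["mode"].default == "r"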
2 changes: 1 addition & 1 deletion perf/r2c-rules/r2c-security-audit.yml
@@ -3704,7 +3704,7 @@ rules:
Scripting')"
references:
- https://www.developsec.com/2017/11/09/xss-in-a-script-tag/
- https://github.com/bkimminich/juice-shop/blob/master/routes/videoHandler.js#L64
- https://github.com/juice-shop/juice-shop/blob/1ceb8751e986dacd3214a618c37e7411be6bc11a/routes/videoHandler.ts#L68
severity: WARNING
languages:
- javascript
4 changes: 2 additions & 2 deletions perf/run-benchmarks
@@ -321,7 +321,7 @@ def prepare_rule_cache_for_this_run(
logger.info(
f"Rule cache for run {setup_data.run_name} created at {rule_cache_for_this_run}"
)
rule_config_paths = list()
rule_config_paths = []
for rule_config in setup_data.rule_configs:
logger.info(f"Checking for rule config '{rule_config}' in cache")
normalized_rule_config = rule_config.normalize_rule_config_name()
@@ -350,7 +350,7 @@ def prepare_benchmark_run(
- downloads rule configs from an endpoint if necessary
- generates a 'prep' file which clones a repo from a URL and checks out the commit
"""
corpuses: List[Corpus] = list()
corpuses: List[Corpus] = []
for setup_data in benchmark_config.benchmark_setup_data:
logger.info(f"Setting up benchmark run for run '{setup_data.run_name}'")
rule_cache_dir, _ = prepare_rule_cache_for_this_run(setup_data, clean)
9 changes: 4 additions & 5 deletions scripts/generate_cheatsheet.py 100644 → 100755
@@ -169,9 +169,9 @@
"deep": ["expr_operator"],
}

NUM_ALPHA_FEATURES = sum([len(val) for val in ALPHA_FEATURES.values()])
NUM_BETA_FEATURES = sum([len(val) for val in BETA_FEATURES.values()])
NUM_GA_FEATURES = sum([len(val) for val in GA_FEATURES.values()])
NUM_ALPHA_FEATURES = sum(len(val) for val in ALPHA_FEATURES.values())
NUM_BETA_FEATURES = sum(len(val) for val in BETA_FEATURES.values())
NUM_GA_FEATURES = sum(len(val) for val in GA_FEATURES.values())


def find_path(
@@ -228,8 +228,7 @@ def run_semgrep_on_example(
print(">>> " + " ".join(cmd))
output = subprocess.run( # nosemgrep: python.lang.security.audit.dangerous-subprocess-use.dangerous-subprocess-use
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
capture_output=True,
)
if output.returncode == 0:
print(output.stderr.decode("utf-8"))
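Two more cleanups here: sum() consumes a generator directly, so the intermediate list from the listcomp was pure overhead, and capture_output=True (available since Python 3.7) is shorthand for passing both stdout=subprocess.PIPE and stderr=subprocess.PIPE. A small runnable sketch:

import subprocess
import sys

features = {"alpha": ["a", "b"], "beta": ["c"]}
assert sum(len(v) for v in features.values()) == 3  # no throwaway list

# capture_output=True pipes both streams in one keyword argument.
proc = subprocess.run([sys.executable, "-c", "print('ok')"], capture_output=True)
assert proc.returncode == 0 and proc.stdout.strip() == b"ok"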
Empty file modified scripts/merge-rules.py 100644 → 100755
2 changes: 1 addition & 1 deletion semgrep-core/perf/input/l300.py
@@ -44,7 +44,7 @@ def pytest_runtest_setup(item):
if "incremental" in item.keywords:
previousfailed = getattr(item.parent, "_previousfailed", None)
if previousfailed is not None:
pytest.xfail("previous test failed ({0})".format(previousfailed.name))
pytest.xfail(f"previous test failed ({previousfailed.name})")


def pytest_runtest_makereport(item, call):
4 changes: 2 additions & 2 deletions semgrep-core/scripts/run-coverage.py
@@ -9,7 +9,7 @@

def report_summary_stat() -> str:
stat = os.popen("bisect-ppx-report summary").read()
patt = re.compile("Coverage:\s+\d+/\d+\s+\((\d+\.\d*)%\)")
patt = re.compile(r"Coverage:\s+\d+/\d+\s+\((\d+\.\d*)%\)")
# mobj = patt.match("Coverage: 4/4 (4.4%)")
mobj = patt.match(stat)
if mobj is not None:
@@ -19,7 +19,7 @@ def report_summary_stat() -> str:

def report_summary_for_file_stat(file: str) -> str:
stat = os.popen("bisect-ppx-report summary --per-file").readlines()
patt = re.compile(f"\s*(\d+.\d*)\s+%\s+\d+/\d+\s+{file}")
patt = re.compile(rf"\s*(\d+.\d*)\s+%\s+\d+/\d+\s+{file}")
for line in stat:
mobj = patt.match(line)
if mobj is not None:
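Without the r prefix, escapes like \s and \d are invalid string escapes that Python merely tolerates with a DeprecationWarning; a raw string hands them to the regex engine verbatim. The commented example from the script itself doubles as a check:

import re

patt = re.compile(r"Coverage:\s+\d+/\d+\s+\((\d+\.\d*)%\)")
mobj = patt.match("Coverage: 4/4 (4.4%)")
assert mobj is not None and mobj.group(1) == "4.4"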
4 changes: 1 addition & 3 deletions semgrep/dependencyparser/find_lockfiles.py
@@ -23,9 +23,7 @@ def find_lockfiles(
if entry.is_dir() and (
seen_paths is None or not (resolved_path in seen_paths)
):
new_paths = set([resolved_path]).union(
seen_paths if seen_paths else set([])
)
new_paths = {resolved_path}.union(seen_paths if seen_paths else set())
yield from find_lockfiles(full_path, frozenset(new_paths))
if entry.is_file() and entry.name.lower() in TARGET_LOCKFILE_FILENAMES:
yield full_path
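Same idea as the set comprehensions above: {x} is a set display, while set([x]) builds a throwaway list first; union is unaffected. A sketch:

resolved = "/repo/sub"
seen = frozenset({"/repo"})
assert {resolved}.union(seen) == set([resolved]).union(seen) == {"/repo", "/repo/sub"}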
4 changes: 2 additions & 2 deletions semgrep/dependencyparser/package_restrictions.py
@@ -7,12 +7,12 @@
from typing import Tuple

import packaging.version
from packaging.specifiers import SpecifierSet

from dependencyparser.find_lockfiles import find_lockfiles
from dependencyparser.models import LockfileDependency
from dependencyparser.models import PackageManagers
from dependencyparser.parse_lockfile import parse_lockfile_str
from packaging.specifiers import SpecifierSet

from semgrep.error import SemgrepError


11 changes: 4 additions & 7 deletions semgrep/semgrep/commands/ci.py
@@ -92,8 +92,7 @@ def fix_head_if_github_action(metadata: GitMeta) -> Iterator[None]:
encoding="utf-8",
check=True,
timeout=GIT_SH_TIMEOUT,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
capture_output=True,
)
logger.debug(f"git rev-parse stdout: {rev_parse.stdout}")
logger.debug(f"git rev-parse stderr: {rev_parse.stderr}")
@@ -105,8 +104,7 @@ def fix_head_if_github_action(metadata: GitMeta) -> Iterator[None]:
["git", "checkout", metadata.head_ref],
encoding="utf-8",
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
capture_output=True,
timeout=GIT_SH_TIMEOUT,
)
logger.debug(f"git checkout stdout: {checkout.stdout}")
@@ -119,8 +117,7 @@ def fix_head_if_github_action(metadata: GitMeta) -> Iterator[None]:
subprocess.run(
["git", "checkout", stashed_rev],
encoding="utf-8",
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
capture_output=True,
check=True,
timeout=GIT_SH_TIMEOUT,
)
@@ -393,7 +390,7 @@ def ci(
match for match in matches if not match.is_ignored
]

num_cai_findings = sum(len(v) for v in cai_matches_by_rule.values())
sum(len(v) for v in cai_matches_by_rule.values())
num_nonblocking_findings = sum(len(v) for v in nonblocking_matches_by_rule.values())
num_blocking_findings = sum(len(v) for v in blocking_matches_by_rule.values())

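Note the num_cai_findings hunk: the unused binding is deleted but the call is kept as a bare expression, so any side effects of evaluating it are preserved. A sketch with hypothetical data mirroring the hunk above:

# Before: num_cai_findings = sum(len(v) for v in cai_matches_by_rule.values())
# After: the dead name is gone, the expression still runs.
cai_matches_by_rule = {"rule-a": [1, 2], "rule-b": [3]}
sum(len(v) for v in cai_matches_by_rule.values())  # evaluated, result discarded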
8 changes: 4 additions & 4 deletions semgrep/semgrep/config_resolver.py
@@ -234,7 +234,7 @@ def _make_config_request(self) -> str:
"text/yaml",
"text/vnd.yaml",
]
if content_type and any((ct in content_type for ct in yaml_types)):
if content_type and any(ct in content_type for ct in yaml_types):
return r.content.decode("utf-8", errors="replace")
else:
raise SemgrepError(
@@ -485,7 +485,7 @@ def parse_config_string(
try:
data = parse_yaml_preserve_spans(contents, filename)
return {config_id: data}
except EmptyYamlException as se:
except EmptyYamlException:
raise SemgrepError(
f"Empty configuration file {filename}",
code=UNPARSEABLE_YAML_EXIT_CODE,
@@ -641,7 +641,7 @@ def get_latest_version(ruleset_name: str) -> Version:
for version_string in versions_json:
try:
versions_parsed.append(Version(version_string))
except ValueError as e:
except ValueError:
logger.info(
f"Could not parse {version_string} in versions of {ruleset_name} pack as valid semver. Ignoring that version string."
)
@@ -811,7 +811,7 @@ def list_current_public_rulesets() -> List[JsonObject]:
headers = {"User-Agent": SEMGREP_USER_AGENT}
try:
r = requests.get(api_full_url, headers=headers, timeout=20)
except Exception as e:
except Exception:
raise SemgrepError(f"Failed to download list of public rulesets")

if not r.ok:
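Dropping the as e / as se bindings when the caught exception is never used is the cleanup flake8's unused-variable checks (F841) flag; the handlers behave identically. A minimal sketch:

def get_version(version_string):
    try:
        return int(version_string)
    except ValueError:  # "as e" removed: the exception object was unused
        return None

assert get_version("3") == 3 and get_version("v3") is None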
2 changes: 1 addition & 1 deletion semgrep/semgrep/core_output.py
@@ -130,7 +130,7 @@ def parse(cls, raw_json: JsonObject) -> "CoreError":
path = Path(location["path"])
start = core.Position.from_json(location["start"])
end = core.Position.from_json(location["end"])
_extra = raw_json.get("extra", {})
raw_json.get("extra", {})
message = CoreErrorMessage(raw_json.get("message", "<no error message>"))
level_str = raw_json["severity"]
if level_str.upper() == "WARNING":
3 changes: 1 addition & 2 deletions semgrep/semgrep/core_runner.py
@@ -264,7 +264,7 @@ class Task:
class Plan(List[Task]):
@property
def rule_count(self) -> int:
return len(set(rule for task in self for rule in task.rule_ids))
return len({rule for task in self for rule in task.rule_ids})

@property
def file_count(self) -> int:
@@ -616,7 +616,6 @@ def _run_rules_direct_to_semgrep_core(
stderr: Optional[int] = subprocess.PIPE
if is_debug():
cmd += ["--debug"]
stderr = None

if dump_command_for_core:
print(" ".join(cmd))
8 changes: 4 additions & 4 deletions semgrep/semgrep/default_group.py
@@ -39,7 +39,7 @@ def __init__()

def __init__(self, *args: Any, **kwargs: Any) -> None:
default_command = kwargs.pop("default_command", None)
super(DefaultGroup, self).__init__(*args, **kwargs)
super().__init__(*args, **kwargs)
self.default_command_name = None
if default_command is not None:
self.default_command_name = default_command
@@ -59,7 +59,7 @@ def parse_args(self, ctx: click.Context, args: List[str]) -> List[str]:
"""
if not args and self.default_command_name is not None:
args.insert(0, self.default_command_name)
return super(DefaultGroup, self).parse_args(ctx, args)
return super().parse_args(ctx, args)

def get_command(
self, ctx: click.Context, command_name: str
@@ -76,7 +76,7 @@ def get_command(
ctx._default_command_overwrite_args0 = command_name # type: ignore
command_name = self.default_command_name

return super(DefaultGroup, self).get_command(ctx, command_name)
return super().get_command(ctx, command_name)

def resolve_command(
self, ctx: click.Context, args: List[str]
@@ -92,7 +92,7 @@ def resolve_command(
If args[0] is actually a command name then _default_command_overwrite_args0
will not be set so this function is equivalent to existing behavior
"""
cmd_name, cmd, args = super(DefaultGroup, self).resolve_command(ctx, args)
cmd_name, cmd, args = super().resolve_command(ctx, args)
if hasattr(ctx, "_default_command_overwrite_args0"):
args.insert(0, ctx._default_command_overwrite_args0) # type: ignore
return cmd_name, cmd, args
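Zero-argument super(), available since Python 3, resolves the class and instance from the enclosing frame, so each call here is equivalent to the old explicit super(DefaultGroup, self) spelling. A sketch:

class Base:
    def label(self):
        return "base"

class Child(Base):
    def label(self):
        # Same resolution as super(Child, self).label() in Python 3.
        return super().label() + "/child"

assert Child().label() == "base/child"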
3 changes: 1 addition & 2 deletions semgrep/semgrep/dependency_aware_rule.py
@@ -5,12 +5,11 @@
from typing import List
from typing import Tuple

import semgrep.output_from_core as core
from dependencyparser.models import PackageManagers
from dependencyparser.package_restrictions import dependencies_range_match_any
from dependencyparser.package_restrictions import find_and_parse_lockfiles
from dependencyparser.package_restrictions import ProjectDependsOnEntry

import semgrep.output_from_core as core
from semgrep.error import SemgrepError
from semgrep.rule import Rule
from semgrep.rule_match import RuleMatch
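The import shuffle in this file presumably follows from the new --application-directories=.:semgrep flag above: reorder-python-imports treats packages resolvable under the listed roots (here, dependencyparser and semgrep) as first-party, which changes how these imports group and sort.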
