semgrep · mjambon · Mar 21, 2022 · Mar 18, 2022 · Mar 18, 2022 · Mar 18, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,8 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
 
 ### Fixed
 
+- Entropy analysis: strings made of repeated characters such as
+  `'xxxxxxxxxxxxxx'` are no longer reported has having high entropy (#4833)
 - Symlinks found in directories are skipped from being scanned again.
   This is a fix for a regression introduced in 0.85.0.
 

diff --git a/semgrep-core/src/core/ast/AST_generic.ml b/semgrep-core/src/core/ast/AST_generic.ml
@@ -536,7 +536,21 @@ and literal =
   | Int of int option wrap
   | Float of float option wrap
   | Char of string wrap
-  | String of string wrap (* TODO? bracket, ', ", or even """ *)
+  (* String literals:
+     The token includes the quotes (if any) but the string value excludes them.
+     The value is the escaped string content. For example,
+     The escaped content of the Python string literal '\\(\\)' is \\(\\).
+     The unescaped content would be \(\).
+     TODO: use bracket instead of wrap, ', ", or even """
+     TODO: expose the unescaped contents if known, so that we could analyze
+     string contents correctly.
+     An incremental change could be:
+
+     | String of (string * string option) bracket
+                  ^^^^^^   ^^^^^^
+                  escaped  unescaped
+  *)
+  | String of string wrap
   | Regexp of string wrap bracket (* // *) * string wrap option (* modifiers *)
   | Atom of tok (* ':' in Ruby, ''' in Scala *) * string wrap
   | Unit of tok

diff --git a/semgrep-core/src/engine/Entropy.ml b/semgrep-core/src/engine/Entropy.ml
@@ -128,7 +128,25 @@ let entropy_from_trigrams s =
       e := !e +. get_substring_entropy sub);
   !e
 
-let entropy s = entropy_from_trigrams s
+(*
+   This test is useful for long chains of the same character such
+   as "xxxxxxxxxxxxxxxxxxxxxxx".
+   Shorter strings have low entropy anyway.
+
+   This works only with ascii characters but that's probably good enough.
+*)
+let is_repeated_char s =
+  if s = "" then false
+  else
+    let c0 = s.[0] in
+    try
+      String.iter (fun c -> if c <> c0 then raise Exit) s;
+      true
+    with Exit -> false
+
+let entropy s =
+  if is_repeated_char s then (* somewhat arbitrary low entropy *) 1.
+  else entropy_from_trigrams s
 
 let entropy_data s =
   let ent = entropy s in

diff --git a/semgrep-core/src/engine/Eval_generic.ml b/semgrep-core/src/engine/Eval_generic.ml
@@ -311,7 +311,7 @@ let text_of_binding mvar mval =
       (* Note that `text` may be produced by constant folding, in which
        * case we will not have range info. *)
       Some text
-  | ___else___ -> (
+  | _ -> (
       let any = MV.mvalue_to_any mval in
       match Visitor_AST.range_of_any_opt any with
       | None ->

diff --git a/semgrep-core/src/engine/Match_search_rules.ml b/semgrep-core/src/engine/Match_search_rules.ml
@@ -615,7 +615,7 @@ let matches_of_xpatterns config file_and_more xpatterns =
    Find the metavariable value, convert it back to a string, and
    run it through an analyzer that returns true if there's a "match".
 *)
-let analyze_metavar env bindings mvar analyzer =
+let analyze_metavar env (bindings : MV.bindings) mvar analyzer =
   match List.assoc_opt mvar bindings with
   | None ->
       error env
@@ -624,10 +624,24 @@ let analyze_metavar env bindings mvar analyzer =
             check your rule"
            mvar);
       false
-  | Some mval -> (
-      match Eval_generic.text_of_binding mvar mval with
-      | None -> false
-      | Some data -> analyzer data)
+  | Some mvalue -> analyzer mvalue
+
+(*
+   Analyze the contents of a string literal bound to a metavariable.
+   The analyzer operates of the strings contents after best-effort
+   unescaping.
+   Return false if the bound value isn't a string literal.
+*)
+let analyze_string_metavar env bindings mvar (analyzer : string -> bool) =
+  analyze_metavar env bindings mvar (function
+    (* We don't use Eval_generic.text_of_binding on string literals because
+       it returns the quoted string but we want it unquoted. *)
+    | E { G.e = G.L (G.String (escaped, _tok)); _ } ->
+        escaped |> String_literal.approximate_unescape |> analyzer
+    | other_mval -> (
+        match Eval_generic.text_of_binding mvar other_mval with
+        | Some s -> analyzer s
+        | None -> false))
 
 let rec filter_ranges env xs cond =
   xs
@@ -676,7 +690,7 @@ let rec filter_ranges env xs cond =
              Eval_generic.eval_bool env e
          | R.CondAnalysis (mvar, CondEntropy) ->
              let bindings = r.mvars in
-             analyze_metavar env bindings mvar Entropy.has_high_score
+             analyze_string_metavar env bindings mvar Entropy.has_high_score
          | R.CondAnalysis (mvar, CondReDoS) ->
              let bindings = r.mvars in
              let analyze re_str =
@@ -699,7 +713,7 @@ let rec filter_ranges env xs cond =
                      mvar re_str;
                    false
              in
-             analyze_metavar env bindings mvar analyze)
+             analyze_string_metavar env bindings mvar analyze)
 
 and satisfies_metavar_pattern_condition env r mvar opt_xlang formula =
   let bindings = r.mvars in

diff --git a/semgrep-core/src/engine/ReDoS.ml b/semgrep-core/src/engine/ReDoS.ml
@@ -175,39 +175,6 @@ let find_vulnerable_nodes =
              (matches_deep matches_in_two_nonempty_branches)))
        matches_not_everywhere)
 
-(*
-   Assume:
-   \\ -> \
-   \' -> '
-   \" -> "
-*)
-let unescape =
-  let rex = SPcre.regexp "\\\\[\\\\'\"]" in
-  fun s ->
-    Pcre.substitute ~rex
-      ~subst:(fun s ->
-        assert (String.length s = 2);
-        String.sub s 1 1)
-      s
-
-(*
-   HACK!
-   Remove the quotes from string literals because string literals are
-   what we're getting. The generic AST should offer a view into
-   string content rather than the original quoted and escaped literals.
-*)
-let unquote s =
-  let len = String.length s in
-  if len < 2 then s
-  else
-    let first = s.[0] in
-    let last = s.[len - 1] in
-    match (first, last) with
-    | '\'', '\''
-    | '"', '"' ->
-        String.sub s 1 (len - 2) |> unescape
-    | _ -> s
-
 (* Take the requested substring if possible, otherwise return the original
    string. *)
 let safe_sub s pos sublen =
@@ -227,7 +194,6 @@ let recover_source re_str node =
   safe_sub re_str first_tok_pos len
 
 let find_vulnerable_subpatterns ?(dialect = Dialect.PCRE) re_str =
-  let re_str = (* TODO: take an already unquoted string *) unquote re_str in
   let conf = Dialect.conf dialect in
   match parse_regexp conf re_str with
   | None -> Error ()

diff --git a/semgrep-core/src/engine/ReDoS.mli b/semgrep-core/src/engine/ReDoS.mli
@@ -21,6 +21,3 @@ val find_vulnerable_subpatterns :
 *)
 val regexp_may_be_vulnerable :
   ?dialect:Pfff_lang_regexp.Dialect.t -> string -> bool
-
-(* Exposed only for testing purposes *)
-val unquote : string -> string
diff --git a/semgrep-core/src/engine/String_literal.ml b/semgrep-core/src/engine/String_literal.ml
@@ -0,0 +1,16 @@
+(* Evaluate the contents of string literals *)
+
+(*
+   Assume:
+   \\ -> \
+   \' -> '
+   \" -> "
+*)
+let approximate_unescape =
+  let rex = SPcre.regexp "\\\\[\\\\'\"]" in
+  fun s ->
+    Pcre.substitute ~rex
+      ~subst:(fun s ->
+        assert (String.length s = 2);
+        String.sub s 1 1)
+      s
diff --git a/semgrep-core/src/engine/String_literal.mli b/semgrep-core/src/engine/String_literal.mli
@@ -0,0 +1,24 @@
+(*
+   This module is a hack for evaluating string literals.
+*)
+
+(*
+   Evaluate unquoted, escaped string literal contents.
+
+   abc\\\'\"def  ->  abc\'"def
+
+   Assumes:
+
+   \\ -> \
+   \' -> '
+   \" -> "
+
+   Other escape sequences are left untouched.
+
+   HACK!
+
+   Different languages use different escaping rules for their string
+   literals. This is incorrect but can be useful until each language
+   parser exposes unescaped strings.
+*)
+val approximate_unescape : string -> string
diff --git a/semgrep-core/src/engine/Unit_ReDoS.ml b/semgrep-core/src/engine/Unit_ReDoS.ml
@@ -4,36 +4,33 @@
 
 open Printf
 
-let quoted_strings =
+let escaped_strings =
   [
     (* input, expected output *)
     ({||}, {||});
     ({|a|}, {|a|});
     ({|'|}, {|'|});
     ({|"|}, {|"|});
+    ({|\'|}, {|'|});
+    ({|\"|}, {|"|});
+    ({|\\|}, {|\|});
     ({|ab|}, {|ab|});
-    ({|\\\|}, {|\\\|});
-    ({|"\\\'|}, {|"\\\'|});
-    ({|''|}, {||});
-    ({|""|}, {||});
-    ({|'ab'|}, {|ab|});
-    ({|'a"b'|}, {|a"b|});
-    ({|"a'b"|}, {|a'b|});
-    ({|'a'b'|}, {|a'b|});
-    ({|"a"b"|}, {|a"b|});
-    ({|'a\b'|}, {|a\b|});
-    ({|"a\b"|}, {|a\b|});
-    ({|'a\\b'|}, {|a\b|});
-    ({|"a\\b"|}, {|a\b|});
-    ({|'a\\\b'|}, {|a\\b|});
-    ({|"a\\\b"|}, {|a\\b|});
+    ({|\\\|}, {|\\|});
+    ({|"\\\'|}, {|"\'|});
+    ({|''|}, {|''|});
+    ({|""|}, {|""|});
+    ({|'ab'|}, {|'ab'|});
+    ({|'a"b'|}, {|'a"b'|});
+    ({|"a'b"|}, {|"a'b"|});
   ]
 
-let test_unquote () =
-  quoted_strings
+let test_unescape () =
+  escaped_strings
   |> List.iter (fun (input, expected_output) ->
-         let output = ReDoS.unquote input in
-         Alcotest.(check string) ("unquote " ^ input) expected_output output)
+         let output = String_literal.approximate_unescape input in
+         Alcotest.(check string)
+           ("String_literal.approximate_unescape " ^ input)
+           expected_output output)
 
 type result = Succeeds | Blows_up
 
@@ -186,7 +183,7 @@ let test_vulnerability_prediction () =
 
 let tests =
   [
-    ("unquote", test_unquote);
+    ("unescape", test_unescape);
     ("pcre pattern explosion", test_pcre_pattern_explosions);
     ("vulnerability prediction", test_vulnerability_prediction);
   ]
diff --git a/semgrep-core/src/engine/Unit_entropy.ml b/semgrep-core/src/engine/Unit_entropy.ml
@@ -52,6 +52,8 @@ let low_score_strings =
     "password123";
     "someReallyLongComplicatedObjectiveCMethodName";
     "{$variable.someProperty}";
+    "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
     (* grey zone *)
     "ringerringerringerringerringerringerringerringerringerringerringer";
     "Many hands make light work.";

diff --git a/semgrep-core/tests/OTHER/rules/entropy_python.py b/semgrep-core/tests/OTHER/rules/entropy_python.py
@@ -0,0 +1,5 @@
+# ruleid: entropy-python
+secret = 'feoi7n978faseduy987ybf67t4ni867fased6tbi6fa34nf97'
+
+secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+secret = 'secret'
diff --git a/semgrep-core/tests/OTHER/rules/entropy_python.yaml b/semgrep-core/tests/OTHER/rules/entropy_python.yaml
@@ -0,0 +1,11 @@
+rules:
+  - id: entropy-python
+    patterns:
+      - pattern: $A = $B
+      - metavariable-analysis:
+          analyzer: entropy
+          metavariable: $B
+    message: found high-entropy string
+    languages:
+      - python
+    severity: ERROR
diff --git a/semgrep-core/tests/OTHER/rules/redos_python.py b/semgrep-core/tests/OTHER/rules/redos_python.py
@@ -0,0 +1,18 @@
+import re
+
+# ruleid: redos-python
+re.match('(a+)+$', ...)
+
+re.match('hello+', ...)
+
+def test_patterns(input_str):
+    # ruleid: redos-python
+    vulnerable = re.match('((honk )+)+$', input_str)
+    ok         = re.match('((honk )+)++$', input_str)
+
+def validate_email(email_str):
+    # ruleid: redos-python
+    re.match(
+        '^([a-zA-Z0-9])(([\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$',
+        email_str
+    )
diff --git a/semgrep-core/tests/OTHER/rules/redos_python.yaml b/semgrep-core/tests/OTHER/rules/redos_python.yaml
@@ -0,0 +1,18 @@
+rules:
+  - id: redos-python
+    patterns:
+      - pattern: re.match($PAT, ...)
+      - metavariable-analysis:
+          analyzer: redos
+          metavariable: $PAT
+    message: |
+      Found a regex potentially vulnerable to regex denial-of-service
+      attacks (ReDoS). Consider the use of possessive quantifiers to
+      disable backtracking where it isn't necessary e.g. instead of '[a-z]*',
+      write '[a-z]*+'.
+    metadata:
+      references:
+        - https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+    languages:
+      - python
+    severity: ERROR