Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Identify strings made of repeated characters as having low entropy #4833

Merged
merged 8 commits into from Mar 21, 2022
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Expand Up @@ -23,6 +23,8 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html

### Fixed

- Entropy analysis: strings made of repeated characters such as
`'xxxxxxxxxxxxxx'` are no longer reported has having high entropy (#4833)
- Symlinks found in directories are skipped from being scanned again.
This is a fix for a regression introduced in 0.85.0.

Expand Down
16 changes: 15 additions & 1 deletion semgrep-core/src/core/ast/AST_generic.ml
Expand Up @@ -536,7 +536,21 @@ and literal =
| Int of int option wrap
| Float of float option wrap
| Char of string wrap
| String of string wrap (* TODO? bracket, ', ", or even """ *)
(* String literals:
The token includes the quotes (if any) but the string value excludes them.
The value is the escaped string content. For example,
The escaped content of the Python string literal '\\(\\)' is \\(\\).
The unescaped content would be \(\).
TODO: use bracket instead of wrap, ', ", or even """
TODO: expose the unescaped contents if known, so that we could analyze
string contents correctly.
An incremental change could be:

| String of (string * string option) bracket
^^^^^^ ^^^^^^
escaped unescaped
*)
| String of string wrap
| Regexp of string wrap bracket (* // *) * string wrap option (* modifiers *)
| Atom of tok (* ':' in Ruby, ''' in Scala *) * string wrap
| Unit of tok
Expand Down
20 changes: 19 additions & 1 deletion semgrep-core/src/engine/Entropy.ml
Expand Up @@ -128,7 +128,25 @@ let entropy_from_trigrams s =
e := !e +. get_substring_entropy sub);
!e

let entropy s = entropy_from_trigrams s
(*
This test is useful for long chains of the same character such
as "xxxxxxxxxxxxxxxxxxxxxxx".
Shorter strings have low entropy anyway.

This works only with ascii characters but that's probably good enough.
*)
let is_repeated_char s =
if s = "" then false
else
let c0 = s.[0] in
try
String.iter (fun c -> if c <> c0 then raise Exit) s;
true
with Exit -> false

let entropy s =
if is_repeated_char s then (* somewhat arbitrary low entropy *) 1.
else entropy_from_trigrams s

let entropy_data s =
let ent = entropy s in
Expand Down
2 changes: 1 addition & 1 deletion semgrep-core/src/engine/Eval_generic.ml
Expand Up @@ -311,7 +311,7 @@ let text_of_binding mvar mval =
(* Note that `text` may be produced by constant folding, in which
* case we will not have range info. *)
Some text
| ___else___ -> (
| _ -> (
let any = MV.mvalue_to_any mval in
match Visitor_AST.range_of_any_opt any with
| None ->
Expand Down
28 changes: 21 additions & 7 deletions semgrep-core/src/engine/Match_search_rules.ml
Expand Up @@ -615,7 +615,7 @@ let matches_of_xpatterns config file_and_more xpatterns =
Find the metavariable value, convert it back to a string, and
run it through an analyzer that returns true if there's a "match".
*)
let analyze_metavar env bindings mvar analyzer =
let analyze_metavar env (bindings : MV.bindings) mvar analyzer =
match List.assoc_opt mvar bindings with
| None ->
error env
Expand All @@ -624,10 +624,24 @@ let analyze_metavar env bindings mvar analyzer =
check your rule"
mvar);
false
| Some mval -> (
match Eval_generic.text_of_binding mvar mval with
| None -> false
| Some data -> analyzer data)
| Some mvalue -> analyzer mvalue

(*
Analyze the contents of a string literal bound to a metavariable.
The analyzer operates of the strings contents after best-effort
unescaping.
Return false if the bound value isn't a string literal.
*)
let analyze_string_metavar env bindings mvar (analyzer : string -> bool) =
analyze_metavar env bindings mvar (function
(* We don't use Eval_generic.text_of_binding on string literals because
it returns the quoted string but we want it unquoted. *)
| E { G.e = G.L (G.String (escaped, _tok)); _ } ->
escaped |> String_literal.approximate_unescape |> analyzer
| other_mval -> (
match Eval_generic.text_of_binding mvar other_mval with
| Some s -> analyzer s
| None -> false))

let rec filter_ranges env xs cond =
xs
Expand Down Expand Up @@ -676,7 +690,7 @@ let rec filter_ranges env xs cond =
Eval_generic.eval_bool env e
| R.CondAnalysis (mvar, CondEntropy) ->
let bindings = r.mvars in
analyze_metavar env bindings mvar Entropy.has_high_score
analyze_string_metavar env bindings mvar Entropy.has_high_score
| R.CondAnalysis (mvar, CondReDoS) ->
let bindings = r.mvars in
let analyze re_str =
Expand All @@ -699,7 +713,7 @@ let rec filter_ranges env xs cond =
mvar re_str;
false
in
analyze_metavar env bindings mvar analyze)
analyze_string_metavar env bindings mvar analyze)

and satisfies_metavar_pattern_condition env r mvar opt_xlang formula =
let bindings = r.mvars in
Expand Down
34 changes: 0 additions & 34 deletions semgrep-core/src/engine/ReDoS.ml
Expand Up @@ -175,39 +175,6 @@ let find_vulnerable_nodes =
(matches_deep matches_in_two_nonempty_branches)))
matches_not_everywhere)

(*
Assume:
\\ -> \
\' -> '
\" -> "
*)
let unescape =
let rex = SPcre.regexp "\\\\[\\\\'\"]" in
fun s ->
Pcre.substitute ~rex
~subst:(fun s ->
assert (String.length s = 2);
String.sub s 1 1)
s

(*
HACK!
Remove the quotes from string literals because string literals are
what we're getting. The generic AST should offer a view into
string content rather than the original quoted and escaped literals.
*)
let unquote s =
let len = String.length s in
if len < 2 then s
else
let first = s.[0] in
let last = s.[len - 1] in
match (first, last) with
| '\'', '\''
| '"', '"' ->
String.sub s 1 (len - 2) |> unescape
| _ -> s

(* Take the requested substring if possible, otherwise return the original
string. *)
let safe_sub s pos sublen =
Expand All @@ -227,7 +194,6 @@ let recover_source re_str node =
safe_sub re_str first_tok_pos len

let find_vulnerable_subpatterns ?(dialect = Dialect.PCRE) re_str =
let re_str = (* TODO: take an already unquoted string *) unquote re_str in
let conf = Dialect.conf dialect in
match parse_regexp conf re_str with
| None -> Error ()
Expand Down
3 changes: 0 additions & 3 deletions semgrep-core/src/engine/ReDoS.mli
Expand Up @@ -21,6 +21,3 @@ val find_vulnerable_subpatterns :
*)
val regexp_may_be_vulnerable :
?dialect:Pfff_lang_regexp.Dialect.t -> string -> bool

(* Exposed only for testing purposes *)
val unquote : string -> string
16 changes: 16 additions & 0 deletions semgrep-core/src/engine/String_literal.ml
@@ -0,0 +1,16 @@
(* Evaluate the contents of string literals *)

(*
Assume:
\\ -> \
\' -> '
\" -> "
*)
let approximate_unescape =
let rex = SPcre.regexp "\\\\[\\\\'\"]" in
fun s ->
Pcre.substitute ~rex
~subst:(fun s ->
assert (String.length s = 2);
String.sub s 1 1)
s
24 changes: 24 additions & 0 deletions semgrep-core/src/engine/String_literal.mli
@@ -0,0 +1,24 @@
(*
This module is a hack for evaluating string literals.
*)

(*
Evaluate unquoted, escaped string literal contents.

abc\\\'\"def -> abc\'"def

Assumes:

\\ -> \
\' -> '
\" -> "

Other escape sequences are left untouched.

HACK!

Different languages use different escaping rules for their string
literals. This is incorrect but can be useful until each language
parser exposes unescaped strings.
*)
val approximate_unescape : string -> string
39 changes: 18 additions & 21 deletions semgrep-core/src/engine/Unit_ReDoS.ml
Expand Up @@ -4,36 +4,33 @@

open Printf

let quoted_strings =
let escaped_strings =
[
(* input, expected output *)
({||}, {||});
({|a|}, {|a|});
({|'|}, {|'|});
({|"|}, {|"|});
({|\'|}, {|'|});
({|\"|}, {|"|});
({|\\|}, {|\|});
({|ab|}, {|ab|});
({|\\\|}, {|\\\|});
({|"\\\'|}, {|"\\\'|});
({|''|}, {||});
({|""|}, {||});
({|'ab'|}, {|ab|});
({|'a"b'|}, {|a"b|});
({|"a'b"|}, {|a'b|});
({|'a'b'|}, {|a'b|});
({|"a"b"|}, {|a"b|});
({|'a\b'|}, {|a\b|});
({|"a\b"|}, {|a\b|});
({|'a\\b'|}, {|a\b|});
({|"a\\b"|}, {|a\b|});
({|'a\\\b'|}, {|a\\b|});
({|"a\\\b"|}, {|a\\b|});
({|\\\|}, {|\\|});
({|"\\\'|}, {|"\'|});
({|''|}, {|''|});
({|""|}, {|""|});
({|'ab'|}, {|'ab'|});
({|'a"b'|}, {|'a"b'|});
({|"a'b"|}, {|"a'b"|});
]

let test_unquote () =
quoted_strings
let test_unescape () =
escaped_strings
|> List.iter (fun (input, expected_output) ->
let output = ReDoS.unquote input in
Alcotest.(check string) ("unquote " ^ input) expected_output output)
let output = String_literal.approximate_unescape input in
Alcotest.(check string)
("String_literal.approximate_unescape " ^ input)
expected_output output)

type result = Succeeds | Blows_up

Expand Down Expand Up @@ -186,7 +183,7 @@ let test_vulnerability_prediction () =

let tests =
[
("unquote", test_unquote);
("unescape", test_unescape);
("pcre pattern explosion", test_pcre_pattern_explosions);
("vulnerability prediction", test_vulnerability_prediction);
]
2 changes: 2 additions & 0 deletions semgrep-core/src/engine/Unit_entropy.ml
Expand Up @@ -52,6 +52,8 @@ let low_score_strings =
"password123";
"someReallyLongComplicatedObjectiveCMethodName";
"{$variable.someProperty}";
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
(* grey zone *)
"ringerringerringerringerringerringerringerringerringerringerringer";
"Many hands make light work.";
Expand Down
5 changes: 5 additions & 0 deletions semgrep-core/tests/OTHER/rules/entropy_python.py
@@ -0,0 +1,5 @@
# ruleid: entropy-python
secret = 'feoi7n978faseduy987ybf67t4ni867fased6tbi6fa34nf97'

secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
secret = 'secret'
11 changes: 11 additions & 0 deletions semgrep-core/tests/OTHER/rules/entropy_python.yaml
@@ -0,0 +1,11 @@
rules:
- id: entropy-python
patterns:
- pattern: $A = $B
mjambon marked this conversation as resolved.
Show resolved Hide resolved
- metavariable-analysis:
analyzer: entropy
metavariable: $B
message: found high-entropy string
languages:
- python
severity: ERROR
18 changes: 18 additions & 0 deletions semgrep-core/tests/OTHER/rules/redos_python.py
@@ -0,0 +1,18 @@
import re

# ruleid: redos-python
re.match('(a+)+$', ...)

re.match('hello+', ...)

def test_patterns(input_str):
# ruleid: redos-python
vulnerable = re.match('((honk )+)+$', input_str)
ok = re.match('((honk )+)++$', input_str)

def validate_email(email_str):
# ruleid: redos-python
re.match(
'^([a-zA-Z0-9])(([\-.]|[_]+)?([a-zA-Z0-9]+))*(@){1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]{1}[a-z]{2,3}))$',
email_str
)
18 changes: 18 additions & 0 deletions semgrep-core/tests/OTHER/rules/redos_python.yaml
@@ -0,0 +1,18 @@
rules:
- id: redos-python
patterns:
- pattern: re.match($PAT, ...)
mjambon marked this conversation as resolved.
Show resolved Hide resolved
mjambon marked this conversation as resolved.
Show resolved Hide resolved
- metavariable-analysis:
analyzer: redos
metavariable: $PAT
message: |
Found a regex potentially vulnerable to regex denial-of-service
attacks (ReDoS). Consider the use of possessive quantifiers to
disable backtracking where it isn't necessary e.g. instead of '[a-z]*',
write '[a-z]*+'.
metadata:
references:
- https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
languages:
- python
severity: ERROR