Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tainting: Add new rule options #6327

Merged
merged 4 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 16 additions & 0 deletions changelog.d/pa-1541.added
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
taint-mode: Added two new rule `options` that help minimizing false positives.

First one is `taint_assume_safe_indexes`, which makes Semgrep assume that an
array-access expression is safe even if the index expression is tainted. Otherwise
Semgrep assumes that e.g. `a[i]` is tainted if `i` is tainted, even if `a` is not.
Enabling this option is recommended for high-signal rules, whereas disabling it
may be preferred for audit rules. Currently, it is disabled by default for pure
backwards compatibility reasons, but this may change in the near future after some
evaluation.

The other one is `taint_assume_safe_functions`, which makes Semgrep assume that
function calls do *NOT* propagate taint from their arguments to their output.
Otherwise, Semgrep always assumes that functions may propagate taint. This is
intended to replace _not conflicting_ sanitizers (added in v0.69.0) in the future.
This option is still experimental and needs to be complemented by other changes
to be made in future releases.
4 changes: 4 additions & 0 deletions interfaces/Config_semgrep.atd
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ type t = {
~constant_propagation <ocaml default="true"> : bool;
(* symbolic_propagation requires constant_propagation to have effect *)
~symbolic_propagation <ocaml default="false"> : bool;

(* metavariables common to a source and sink will be unified *)
~taint_unify_mvars <ocaml default="false"> : bool;
~taint_assume_safe_functions <ocaml default="false"> : bool;
~taint_assume_safe_indexes <ocaml default="false"> : bool;

(* 'ac' stands for associative-commutative matching *)
~ac_matching <ocaml default="true"> : bool;
(* pretend && and || are commutative *)
Expand Down
2 changes: 1 addition & 1 deletion semgrep-core/src/engine/Match_env.ml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ type id_to_match_results = (pattern_id, Pattern_match.t) Hashtbl.t
* they are not exposed to the user anymore.
*)
type xconfig = {
config : Config_semgrep.t;
config : Config_semgrep.t; (* corresponds to rule `options` key *)
equivs : Equivalence.equivalences;
(* Fields coming from Runner_config.t used by the engine.
* We could just include the whole Runner_config.t, but it's
Expand Down
3 changes: 3 additions & 0 deletions semgrep-core/src/engine/Match_rules.ml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ let check ~match_hook ~timeout ~timeout_threshold (xconf : Match_env.xconfig)
let res_rules, skipped_rules =
rules
|> Common.partition_either (fun r ->
let xconf =
Match_env.adjust_xconfig_with_rule_options xconf r.R.options
in
let relevant_rule =
if xconf.filter_irrelevant_rules then (
match Analyze_rule.regexp_prefilter_of_rule r with
Expand Down
1 change: 0 additions & 1 deletion semgrep-core/src/engine/Match_search_mode.ml
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,6 @@ and matches_of_formula xconf rule xtarget formula opt_context :
(*****************************************************************************)

let check_rule ({ R.mode = `Search formula; _ } as r) hook xconf xtarget =
let xconf = Match_env.adjust_xconfig_with_rule_options xconf r.R.options in
let rule_id = fst r.id in
let res, final_ranges = matches_of_formula xconf r xtarget formula None in
let errors = res.errors |> Report.ErrorSet.map (error_with_rule_id rule_id) in
Expand Down
8 changes: 4 additions & 4 deletions semgrep-core/src/engine/Match_tainting_mode.ml
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ let pm_of_finding finding =
| T.ArgToReturn _ ->
None

let check_fundef lang taint_config opt_ent fdef =
let check_fundef lang options taint_config opt_ent fdef =
let name =
let* ent = opt_ent in
let* name = AST_to_IL.name_of_entity ent in
Expand Down Expand Up @@ -551,7 +551,7 @@ let check_fundef lang taint_config opt_ent fdef =
in
let _, xs = AST_to_IL.function_definition lang fdef in
let flow = CFG_build.cfg_of_stmts xs in
Dataflow_tainting.fixpoint ~in_env ?name taint_config flow |> ignore
Dataflow_tainting.fixpoint ~in_env ?name options taint_config flow |> ignore

let check_rule (rule : R.taint_rule) match_hook (xconf : Match_env.xconfig)
(xtarget : Xtarget.t) =
Expand Down Expand Up @@ -579,7 +579,7 @@ let check_rule (rule : R.taint_rule) match_hook (xconf : Match_env.xconfig)
in

(* Check each function definition. *)
Visit_function_defs.visit (check_fundef lang taint_config) ast;
Visit_function_defs.visit (check_fundef lang xconf.config taint_config) ast;

(* Check the top-level statements.
* In scripting languages it is not unusual to write code outside
Expand All @@ -589,7 +589,7 @@ let check_rule (rule : R.taint_rule) match_hook (xconf : Match_env.xconfig)
Common.with_time (fun () ->
let xs = AST_to_IL.stmt lang (G.stmt1 ast) in
let flow = CFG_build.cfg_of_stmts xs in
Dataflow_tainting.fixpoint taint_config flow |> ignore)
Dataflow_tainting.fixpoint xconf.config taint_config flow |> ignore)
in
let matches =
!matches
Expand Down
7 changes: 4 additions & 3 deletions semgrep-core/src/engine/Test_dataflow_tainting.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ let pr2_ranges file rwms =
in
Common.pr2 (code_text ^ " @l." ^ line_str))

let test_tainting lang file config def =
let test_tainting lang file options config def =
let xs = AST_to_IL.stmt lang (H.funcbody_to_stmt def.fbody) in
let flow = CFG_build.cfg_of_stmts xs in

Common.pr2 "\nDataflow";
Common.pr2 "--------";
let mapping = Dataflow_tainting.fixpoint config flow in
let mapping = Dataflow_tainting.fixpoint options config flow in
let taint_to_str taint =
let show_taint t =
match t.Taint.orig with
Expand Down Expand Up @@ -73,6 +73,7 @@ let test_dfg_tainting rules_file file =
pr2 "========";
let handle_findings _ _ _ = () in
let xconf = Match_env.default_xconfig in
let xconf = Match_env.adjust_xconfig_with_rule_options xconf rule.options in
let config, debug_taint, _exps =
Match_tainting_mode.taint_config_of_rule xconf file (ast, []) rule
handle_findings
Expand All @@ -92,7 +93,7 @@ let test_dfg_tainting rules_file file =
V.default_visitor with
V.kfunction_definition =
(fun (k, _v) def ->
test_tainting lang file config def;
test_tainting lang file xconf.config config def;
(* go into nested functions *)
k def);
}
Expand Down
44 changes: 34 additions & 10 deletions semgrep-core/src/tainting/Dataflow_tainting.ml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ type mapping = Lval_env.t D.mapping

(* HACK: Tracks tainted functions intrafile. *)
type fun_env = (var, Taints.t) Hashtbl.t
type env = { config : config; fun_name : var option; lval_env : Lval_env.t }

type env = {
options : Config_semgrep.t; (* rule options *)
config : config;
fun_name : var option;
lval_env : Lval_env.t;
}

(*****************************************************************************)
(* Hooks *)
Expand Down Expand Up @@ -468,7 +474,14 @@ let rec check_tainted_expr env exp : Taints.t * Lval_env.t =
| Mem e -> check env e
in
let check_offset env = function
| Index e -> check env e
| Index e ->
let taints, lval_env = check env e in
let taints =
if not env.options.taint_assume_safe_indexes then taints
else (* Taint from the index should be ignored. *)
Taints.empty
in
(taints, lval_env)
| Dot fld -> check_tainted_tok env (snd fld.ident)
in
let check_subexpr exp =
Expand Down Expand Up @@ -607,9 +620,14 @@ let check_tainted_instr env instr : Taints.t * Lval_env.t =
match check_function_signature env e args_taints with
| Some call_taints -> call_taints
| None ->
(* Default is to assume that the function will propagate
* the taint of its arguments. *)
List.fold_left Taints.union e_taints all_taints
if env.options.taint_assume_safe_functions then
(* If we asume that function calls are "safe", then taints from
* the arguments are not present in the function's output. *)
e_taints
else
(* Otherwise assume that the function will propagate
* the taint of its arguments. *)
List.fold_left Taints.union e_taints all_taints
in
(call_taints, lval_env)
| CallSpecial (_, _, args) ->
Expand Down Expand Up @@ -674,19 +692,20 @@ let input_env ~enter_env ~(flow : F.cfg) mapping ni =
| penv1 :: penvs -> List.fold_left Lval_env.union penv1 penvs)

let (transfer :
Config_semgrep.t ->
config ->
Lval_env.t ->
string option ->
flow:F.cfg ->
Lval_env.t D.transfn) =
fun config enter_env opt_name ~flow
fun options config enter_env opt_name ~flow
(* the transfer function to update the mapping at node index ni *)
mapping ni ->
(* DataflowX.display_mapping flow mapping show_tainted; *)
let in' : Lval_env.t = input_env ~enter_env ~flow mapping ni in
let node = flow.graph#nodes#assoc ni in
let out' : Lval_env.t =
let env = { config; fun_name = opt_name; lval_env = in' } in
let env = { options; config; fun_name = opt_name; lval_env = in' } in
match node.F.n with
| NInstr x -> (
let taints, lval_env' = check_tainted_instr env x in
Expand Down Expand Up @@ -753,8 +772,13 @@ let (transfer :
(*****************************************************************************)

let (fixpoint :
?in_env:Lval_env.t -> ?name:Var_env.var -> config -> F.cfg -> mapping) =
fun ?in_env ?name:opt_name config flow ->
?in_env:Lval_env.t ->
?name:Var_env.var ->
Config_semgrep.t ->
config ->
F.cfg ->
mapping) =
fun ?in_env ?name:opt_name options config flow ->
let init_mapping = DataflowX.new_node_array flow Lval_env.empty_inout in
let enter_env =
match in_env with
Expand All @@ -764,6 +788,6 @@ let (fixpoint :
(* THINK: Why I cannot just update mapping here ? if I do, the mapping gets overwritten later on! *)
(* DataflowX.display_mapping flow init_mapping show_tainted; *)
DataflowX.fixpoint ~eq_env:Lval_env.equal ~init:init_mapping
~trans:(transfer config enter_env opt_name ~flow)
~trans:(transfer options config enter_env opt_name ~flow)
(* tainting is a forward analysis! *)
~forward:true ~flow
7 changes: 6 additions & 1 deletion semgrep-core/src/tainting/Dataflow_tainting.mli
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,12 @@ val hook_function_taint_signature :
(** Deep Semgrep *)

val fixpoint :
?in_env:Taint_lval_env.t -> ?name:var -> config -> IL.cfg -> mapping
?in_env:Taint_lval_env.t ->
?name:var ->
Config_semgrep.t ->
config ->
IL.cfg ->
mapping
(** Main entry point, [fixpoint config cfg] returns a mapping (effectively a set)
* containing all the tainted variables in [cfg]. Besides, if it infers any taint
* 'findings', it will invoke [config.handle_findings] which can perform any
Expand Down
23 changes: 23 additions & 0 deletions semgrep-core/tests/rules/taint_assume_safe_funcs.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php

function not_tainted($data) {
// do stuff
return '2';
}

//Problem here is that `tainted('a')` is in itself considered a sink because
//currently taint-mode considers anything that falls within a sink-range to be
//a sink... and the sink range here is whatever it matches `sink(...)`!
//todook:tainted
sink(not_tainted(tainted('a')));

$ok = not_tainted(tainted('a'));
//OK:
sink($ok);

//ruleid:tainted
sink(tainted('b'));

$bad = tainted('b');
//ruleid:tainted
sink($bad);
14 changes: 14 additions & 0 deletions semgrep-core/tests/rules/taint_assume_safe_funcs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rules:
- id: tainted
languages:
- php
message: Match
mode: taint
options:
taint_assume_safe_functions: true
pattern-sinks:
- pattern: sink(...)
pattern-sources:
- pattern: tainted(...)
severity: WARNING

5 changes: 5 additions & 0 deletions semgrep-core/tests/rules/taint_assume_safe_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
a = safe()
i = tainted()
x = a[i]
#ok:tainted
sink(x)
14 changes: 14 additions & 0 deletions semgrep-core/tests/rules/taint_assume_safe_indexes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rules:
- id: tainted
languages:
- python
message: Match
mode: taint
options:
taint_assume_safe_indexes: true
pattern-sinks:
- pattern: sink(...)
pattern-sources:
- pattern: tainted(...)
severity: WARNING

Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ function not_tainted($data) {
return '2';
}

//OK:
//OK:tainted
sink(not_tainted(tainted('a')));

$ok = not_tainted(tainted('a'));
//OK:
//OK:tainted
sink($ok);

//ERROR:
//ruleid:tainted
sink(tainted('b'));

$bad = tainted('b');
//ERROR:
//ruleid:tainted
sink($bad);
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@ function not_tainted($data) {
return '2';
}

//OK:
//OK:tainted
sink(not_tainted(tainted('a')));

$ok = not_tainted(tainted('a'));
//OK:
//OK:tainted
sink($ok);

// $F(...) sanitizes *everything*
//OK:
//OK:tainted
sink(tainted('b'));

$bad = tainted('b');
// $F(...) sanitizes *everyting*
//OK:
//OK:tainted
sink($bad);