From c93a25b372dac0e51dd04ca1d2de34bb81225bb4 Mon Sep 17 00:00:00 2001 From: pad Date: Mon, 30 May 2022 15:21:23 +0200 Subject: [PATCH] Introduce Type_generic.ml This refactors m_compatible_type to go in 2 steps: one step to extract the type, and another for the match This will help #5350 test plan: make test --- semgrep-core/src/core/ast/Type_generic.ml | 68 +++++++++++++++++ .../synthesizing/Unit_synthesizer.ml | 2 +- .../src/matching/Apply_equivalences.ml | 7 +- .../src/matching/Apply_equivalences.mli | 2 +- .../src/matching/Generic_vs_generic.ml | 73 +++++++++---------- semgrep-core/src/matching/Match_patterns.ml | 14 ++-- semgrep-core/src/matching/Matching_generic.ml | 12 +-- .../src/matching/Matching_generic.mli | 6 +- semgrep-core/src/matching/Unit_matcher.ml | 2 +- 9 files changed, 121 insertions(+), 65 deletions(-) create mode 100644 semgrep-core/src/core/ast/Type_generic.ml diff --git a/semgrep-core/src/core/ast/Type_generic.ml b/semgrep-core/src/core/ast/Type_generic.ml new file mode 100644 index 000000000000..1b9af0f8f96e --- /dev/null +++ b/semgrep-core/src/core/ast/Type_generic.ml @@ -0,0 +1,68 @@ +(* Yoann Padioleau + * + * Copyright (C) 2022 r2c + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation, with the + * special exception on linking described in file license.txt. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file + * license.txt for more details. + *) +module G = AST_generic + +(*****************************************************************************) +(* Prelude *) +(*****************************************************************************) +(* Common representation for (builtin) types across languages. + * + * In certain languages, a boolean declaration is using the type name + * "boolean", in another language "Bool", in yet another "bool". + * This is tedious to handle all those specifities during matching, so + * the goal of this file is to factorize those specifities here. + * + * alt: + * - store this directly in AST_generic.ml and do the work in the + * xxx_to_generic.ml + * + * related: + * - Type.ml in deep-semgrep + *) + +(*****************************************************************************) +(* Types *) +(*****************************************************************************) + +(* + * coupling: this is mostly used for typed metavariables like '($X: int)' to + * match literals like 'f(1)', so most of the types below should have + * a corresponding construct in AST_generic.literal + *) +type builtin_type = TInt | TNumber | TString | TFloat [@@deriving show] + +(*****************************************************************************) +(* Helpers *) +(*****************************************************************************) + +(* less: should sanity check things by looking at [lang] *) +let builtin_type_of_ident _langTODO str = + match str with + | "int" -> Some TInt + | "float" -> Some TFloat + | "str" + | "string" + | "String" -> + Some TString + | _ -> None + +let builtin_type_of_type lang t = + match t.G.t with + (* for Python literal checking *) + | G.TyExpr { e = G.N (G.Id ((str, _t), _idinfo)); _ } -> + builtin_type_of_ident lang str + (* for Java/Go/... literals *) + | G.TyN (Id ((str, _t), _idinfo)) -> builtin_type_of_ident lang str + | _ -> None diff --git a/semgrep-core/src/experiments/synthesizing/Unit_synthesizer.ml b/semgrep-core/src/experiments/synthesizing/Unit_synthesizer.ml index e83ebbf217c0..3ce81276d452 100644 --- a/semgrep-core/src/experiments/synthesizing/Unit_synthesizer.ml +++ b/semgrep-core/src/experiments/synthesizing/Unit_synthesizer.ml @@ -217,7 +217,7 @@ let tests = in let matches_with_env = let env = - Matching_generic.empty_environment None None + Matching_generic.empty_environment None lang Config_semgrep.default_config in Match_patterns.match_any_any pattern code env diff --git a/semgrep-core/src/matching/Apply_equivalences.ml b/semgrep-core/src/matching/Apply_equivalences.ml index 1dc190e91e9d..b9bfa87dd933 100644 --- a/semgrep-core/src/matching/Apply_equivalences.ml +++ b/semgrep-core/src/matching/Apply_equivalences.ml @@ -26,7 +26,7 @@ module Env = Metavariable_capture (* Matchers for code equivalence mode *) (*****************************************************************************) -let match_e_e_for_equivalences _ruleid a b = +let match_e_e_for_equivalences _ruleid lang a b = Common.save_excursion Flag.equivalence_mode true (fun () -> let config = { @@ -36,7 +36,6 @@ let match_e_e_for_equivalences _ruleid a b = } in let cache = None in - let lang = None in let env = Matching_generic.empty_environment cache lang config in Generic_vs_generic.m_expr a b env) @@ -71,7 +70,7 @@ let subst_e (env : Env.t) e = in visitor.M.vexpr e -let apply equivs any = +let apply equivs lang any = let expr_rules = ref [] in let stmt_rules = ref [] in @@ -106,7 +105,7 @@ let apply equivs any = | (l, r) :: xs -> ( (* look for a match on original x, not x' *) let matches_with_env = - match_e_e_for_equivalences "" l x + match_e_e_for_equivalences "" lang l x in match matches_with_env with (* todo: should generate a Disj for each possibilities? *) diff --git a/semgrep-core/src/matching/Apply_equivalences.mli b/semgrep-core/src/matching/Apply_equivalences.mli index 7ab07abba616..0b1ce37b0d7a 100644 --- a/semgrep-core/src/matching/Apply_equivalences.mli +++ b/semgrep-core/src/matching/Apply_equivalences.mli @@ -1 +1 @@ -val apply : Equivalence.t list -> Pattern.t -> Pattern.t +val apply : Equivalence.t list -> Lang.t -> Pattern.t -> Pattern.t diff --git a/semgrep-core/src/matching/Generic_vs_generic.ml b/semgrep-core/src/matching/Generic_vs_generic.ml index 0d4c16958371..9483993e3681 100644 --- a/semgrep-core/src/matching/Generic_vs_generic.ml +++ b/semgrep-core/src/matching/Generic_vs_generic.ml @@ -31,6 +31,7 @@ module MV = Metavariable module Flag = Flag_semgrep module Config = Config_semgrep_t module H = AST_generic_helpers +module T = Type_generic (* optimisations *) module CK = Caching.Cache_key @@ -724,7 +725,7 @@ and m_expr a b = envf (str, tok) (MV.E b) (* metavar: typed! *) | G.TypedMetavar ((str, tok), _, t), _b when MV.is_metavar_name str -> - m_compatible_type (str, tok) t b + with_lang (fun lang -> m_compatible_type lang (str, tok) t b) (* dots: should be patterned-match before in arguments, or statements, * but this is useful for keyword parameters, as in f(..., foo=..., ...) *) @@ -743,7 +744,7 @@ and m_expr a b = (with_lang (fun lang -> match Constant_propagation.constant_propagation_and_evaluate_literal - ?lang b + ~lang b with | Some b1 -> m_literal_svalue a1 b1 | None -> fail ())) @@ -1185,43 +1186,41 @@ and m_container_ordered_elements a b = * which would require to transform the code in the generic_vs_generic * style as typechecking could also bind metavariables in the process *) -and m_compatible_type typed_mvar t e = - match (t.G.t, e.G.e) with - (* for Python literal checking *) - | G.TyExpr { e = G.N (G.Id (("int", _tok), _idinfo)); _ }, B.L (B.Int _) -> - envf typed_mvar (MV.E e) - | G.TyExpr { e = G.N (G.Id (("float", _tok), _idinfo)); _ }, B.L (B.Float _) - -> - envf typed_mvar (MV.E e) - | G.TyExpr { e = G.N (G.Id (("str", _tok), _idinfo)); _ }, B.L (B.String _) -> - envf typed_mvar (MV.E e) - (* for C specific literals *) - | G.TyPointer (_, { t = TyN (G.Id (("char", _), _)); _ }), B.L (B.String _) -> - envf typed_mvar (MV.E e) - | G.TyPointer (_, _), B.L (B.Null _) -> envf typed_mvar (MV.E e) - (* for Java and Go literals *) - | G.TyN (Id (("int", _), _)), B.L (B.Int _) -> envf typed_mvar (MV.E e) - | G.TyN (Id (("float", _), _)), B.L (B.Float _) -> envf typed_mvar (MV.E e) - | G.TyN (Id ((("string" | "String"), _), _)), B.L (B.String _) -> - envf typed_mvar (MV.E e) - (* for C strings to match metavariable pointer types *) - | ( G.TyPointer (t1, { t = G.TyN (G.Id ((_, tok), id_info)); _ }), - B.L (B.String _) ) -> - m_type_ t - (G.TyPointer (t1, G.TyN (G.Id (("char", tok), id_info)) |> G.t) |> G.t) - >>= fun () -> envf typed_mvar (MV.E e) - (* for matching ids *) - (* this is covered by the basic type propagation done in Naming_AST.ml *) - | _ta, B.N (B.Id (idb, ({ B.id_type = tb; _ } as id_infob))) -> - (* NOTE: Name values must be represented with MV.Id! *) - m_type_option_with_hook idb (Some t) !tb >>= fun () -> - envf typed_mvar (MV.Id (idb, Some id_infob)) - | _ta, _eb -> ( - match type_of_expr e with - | Some (idb, tb) -> - m_type_option_with_hook idb (Some t) tb >>= fun () -> +and m_compatible_type lang typed_mvar t e = + match (Type_generic.builtin_type_of_type lang t, e.G.e) with + | Some builtin, B.L lit -> ( + match (builtin, lit) with + | T.TInt, B.Int _ + | T.TFloat, B.Float _ + | T.TString, B.String _ -> envf typed_mvar (MV.E e) | _ -> fail ()) + | _ -> ( + match (t.G.t, e.G.e) with + (* for C specific literals *) + | ( G.TyPointer (_, { t = TyN (G.Id (("char", _), _)); _ }), + B.L (B.String _) ) -> + envf typed_mvar (MV.E e) + | G.TyPointer (_, _), B.L (B.Null _) -> envf typed_mvar (MV.E e) + (* for C strings to match metavariable pointer types *) + | ( G.TyPointer (t1, { t = G.TyN (G.Id ((_, tok), id_info)); _ }), + B.L (B.String _) ) -> + m_type_ t + (G.TyPointer (t1, G.TyN (G.Id (("char", tok), id_info)) |> G.t) + |> G.t) + >>= fun () -> envf typed_mvar (MV.E e) + (* for matching ids *) + (* this is covered by the basic type propagation done in Naming_AST.ml *) + | _ta, B.N (B.Id (idb, ({ B.id_type = tb; _ } as id_infob))) -> + (* NOTE: Name values must be represented with MV.Id! *) + m_type_option_with_hook idb (Some t) !tb >>= fun () -> + envf typed_mvar (MV.Id (idb, Some id_infob)) + | _ta, _eb -> ( + match type_of_expr e with + | Some (idb, tb) -> + m_type_option_with_hook idb (Some t) tb >>= fun () -> + envf typed_mvar (MV.E e) + | _ -> fail ())) (* returns a type option and an ident that can be used to query LSP *) and type_of_expr e : (G.ident * G.type_ option) option = diff --git a/semgrep-core/src/matching/Match_patterns.ml b/semgrep-core/src/matching/Match_patterns.ml index aeb8130e21a3..a07a29f4d881 100644 --- a/semgrep-core/src/matching/Match_patterns.ml +++ b/semgrep-core/src/matching/Match_patterns.ml @@ -142,7 +142,7 @@ let match_rules_and_recurse lang config (file, hook, matches) rules matcher k any x = rules |> List.iter (fun (pattern, rule, cache) -> - let env = MG.empty_environment cache (Some lang) config in + let env = MG.empty_environment cache lang config in let matches_with_env = matcher rule pattern x env in if matches_with_env <> [] then (* Found a match *) @@ -233,7 +233,7 @@ let check2 ~hook range_filter (config, equivs) rules (file, lang, ast) = |> List.iter (fun rule -> (* less: normalize the pattern? *) let any = rule.MR.pattern in - let any = Apply_equivalences.apply equivs any in + let any = Apply_equivalences.apply equivs lang any in let cache = if !Flag.with_opt_cache then Some (Caching.Cache.create ()) else None @@ -278,9 +278,7 @@ let check2 ~hook range_filter (config, equivs) rules (file, lang, ast) = (show_expr_kind x.e); () | Some range_loc when range_filter range_loc -> - let env = - MG.empty_environment cache (Some lang) config - in + let env = MG.empty_environment cache lang config in let matches_with_env = match_e_e rule pattern x env in if matches_with_env <> [] then (* Found a match *) @@ -316,7 +314,7 @@ let check2 ~hook range_filter (config, equivs) rules (file, lang, ast) = let visit_stmt () = !stmt_rules |> List.iter (fun (pattern, _pattern_strs, rule, cache) -> - let env = MG.empty_environment cache (Some lang) config in + let env = MG.empty_environment cache lang config in let matches_with_env = match_st_st rule pattern x env in if matches_with_env <> [] then (* Found a match *) @@ -377,9 +375,7 @@ let check2 ~hook range_filter (config, equivs) rules (file, lang, ast) = !stmts_rules |> List.iter (fun (pattern, _pattern_strs, rule, cache) -> Common.profile_code "Semgrep_generic.kstmts" (fun () -> - let env = - MG.empty_environment cache (Some lang) config - in + let env = MG.empty_environment cache lang config in let matches_with_env = match_sts_sts rule pattern x env in diff --git a/semgrep-core/src/matching/Matching_generic.ml b/semgrep-core/src/matching/Matching_generic.ml index de2e369dd97a..83cb3547c113 100644 --- a/semgrep-core/src/matching/Matching_generic.ml +++ b/semgrep-core/src/matching/Matching_generic.ml @@ -79,7 +79,7 @@ type tin = { stmts_match_span : Stmts_match_span.t; cache : tout Caching.Cache.t option; (* TODO: this does not have to be in tout; maybe split tin in 2? *) - lang : Lang.t option; + lang : Lang.t; config : Config_semgrep.t; } @@ -342,14 +342,8 @@ let (envf : MV.mvar G.wrap -> MV.mvalue -> tin -> tout) = (lazy (spf "envf: success, %s (%s)" mvar (MV.str_of_mval any))); return new_binding -let empty_environment opt_cache opt_lang config = - { - mv = Env.empty; - stmts_match_span = Empty; - cache = opt_cache; - lang = opt_lang; - config; - } +let empty_environment opt_cache lang config = + { mv = Env.empty; stmts_match_span = Empty; cache = opt_cache; lang; config } (*****************************************************************************) (* Helpers *) diff --git a/semgrep-core/src/matching/Matching_generic.mli b/semgrep-core/src/matching/Matching_generic.mli index 906f4b41fbc7..99898d58bbd9 100644 --- a/semgrep-core/src/matching/Matching_generic.mli +++ b/semgrep-core/src/matching/Matching_generic.mli @@ -6,7 +6,7 @@ type tin = { stmts_match_span : Stmts_match_span.t; cache : tout Caching.Cache.t option; (* TODO: this does not have to be in tout; maybe split tin in 2? *) - lang : Lang.t option; + lang : Lang.t; config : Config_semgrep.t; } @@ -47,7 +47,7 @@ val or_list : 'a matcher -> 'a -> 'a list -> tin -> tout val ( let* ) : (tin -> tout) -> (unit -> tin -> tout) -> tin -> tout val empty_environment : - tout Caching.Cache.t option -> Lang.t option -> Config_semgrep.t -> tin + tout Caching.Cache.t option -> Lang.t -> Config_semgrep.t -> tin val add_mv_capture : Metavariable.mvar -> Metavariable.mvalue -> tin -> tin @@ -65,7 +65,7 @@ val if_config : tin -> tout -val with_lang : (Lang.t option -> tin -> 'a) -> tin -> 'a +val with_lang : (Lang.t -> tin -> 'a) -> tin -> 'a val check_and_add_metavar_binding : Metavariable.mvar * Metavariable.mvalue -> tin -> tin option diff --git a/semgrep-core/src/matching/Unit_matcher.ml b/semgrep-core/src/matching/Unit_matcher.ml index 99b9d136c192..2e2ae8b9cfda 100644 --- a/semgrep-core/src/matching/Unit_matcher.ml +++ b/semgrep-core/src/matching/Unit_matcher.ml @@ -119,7 +119,7 @@ let tests ~any_gen_of_string = let pattern = any_gen_of_string spattern in let code = any_gen_of_string scode in let cache = None in - let lang = None in + let lang = Lang.Python in let config = Config_semgrep.default_config in let env = Matching_generic.empty_environment cache lang config