/
AST_generic.ml
1971 lines (1810 loc) · 76.7 KB
/
AST_generic.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(* Yoann Padioleau
*
* Copyright (C) 2019-2022 r2c
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation, with the
* special exception on linking described in file license.txt.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
* license.txt for more details.
*)
(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(* A generic AST, to factorize similar "analysis" in different programming
* languages (e.g., naming, semantic code highlighting, semgrep!).
*
* Right now this generic AST is mostly the factorized union of:
* - Python, Ruby, Lua
* - Javascript, Typescript
* - PHP, Hack
* - Java, CSharp, Kotlin
* - C, C++
* - Go
* - JSON, YAML, HCL
* - OCaml, Scala, Rust
* - TODO Bash, SQL, Docker
*
* rational: In the end, programming languages have a lot in Common.
* Even though some interesting analysis are probably better done on a
* per-language basis, many analysis are simple and require just an
* AST and a visitor. One could duplicate those analysis for each language
* or design an AST (this file) generic enough to factorize all those
* analysis (e.g., unused entity). Note that we want to remain
* as precise as possible and not lose too much information while going
* from the specific language AST to the generic AST. We don't want
* to be too generic as in ast_fuzzy.ml, where we have a very general
* tree of nodes, but all the structure of the original AST is lost.
*
* The generic AST tries to be as close as possible to the original code but
* not too close. When a programming language feature is really sugar or
* an alternative way to do a thing, we usually unsugar. Here are the
* simplifications done:
* - we do not keep the comma tokens in arguments. More generally we
* just keep the tokens to get the range right (see the discussions on
* invariants below) and get rid of the other (e.g., we remove
* parens around conditions in if). We keep the parens for 'Call'
* because we want to get the right range for those so we need the
* rightmost tokens in the AST.
* - multiple var declarations in one declaration (e.g., int a,b; in C)
* are expanded in multiple 'variable_definition'. Note that
* tuple assignments (e.g., a,b=1,2) are not expanded in multiple assigns
* because this is not always possible (e.g., a,b=foo()) and people may
* want to explicitely match tuples assignments (we do some magic in
* Generic_vs_generic though to let 'a=1' matches also 'a,b=1,2').
* - multiple ways to define a function are converted all to a
* 'function_definition' (e.g., Javascript arrows are converted in that)
* update: but we now have a more precise function_body type
* - we are more general and impose less restrictions on where certain
* constructs can appear to simplify things.
* * there is no special lhs/lvalue type (see IL.ml for that) and so
* 'Assign' takes a general 'expr' on its lhs.
* * there is no special toplevel/item vs stmt. Certain programming
* languages impose restrictions on where a function or directive can
* appear (e.g., just at the toplevel), but we allow those constructs
* at the stmt level.
* * the Splat and HashSplat operator can usually appear just in arguments
* or inside arrays or in struct definitions (a field) but we are more
* general and put it at the 'expr' level.
* * certain attributes are valid only for certain constructs but instead
* we use one attribute type (no class_attribute vs func_attribute etc.)
*
* Note that this generic AST has become gradually more and more a
* generic CST, to fix issues in autofix in Semgrep.
* TODO? it may be time to rename this file CST_generic.ml
*
* todo:
* - improve things for Kotlin/Scala/Rust/C++/Java
* - see ast_fuzzy.ml todos for ideas to use AST_generic for sgrep?
*
* related work:
* - ast_fuzzy.ml (in pfff)
* - github semantic (seems dead)
* https://github.com/github/semantic
* - UAST of babelfish
* https://doc.bblf.sh/uast/uast-specification-v2.html
* - Coverity common program representation?
* - Semmle internal common representation?
* - Sonarcube generic language
* https://github.com/SonarSource/slang
* - Facebook Infer SIL (for C++, Java, Objective-C)
* - Dawson Engler and Fraser Brown micro-checkers for multiple languages
* - Comby common representation by Rijnard,
* see "Lightweight Multi-language syntax transformation", but does not
* really operate on an AST
* - https://tabnine.com/ which supports multiple languages, but probably
* again does not operate on an AST
* - srcML https://www.srcml.org/doc/srcMLGrammar.html
* but just for C/C++/C#/Java and seems pretty heavy
*
* design choices to have a generic data structure:
* - add some 'a, 'b, 'c around expr/stmt/...
* - data-type a la carte like in github-semantic but IMHO too high-level
* with astronaut-style architecture (too abstract, too advanced features).
* - CURRENT SOLUTION: the OtherXxx strategy used in this file (simple)
* update: actually switch to OtherXxx of todo_kind, even simpler
* - functorize and add some type hole (type tstmt; type texpr; ...),
* todo? not a bad idea if later we want to add type information on each
* expression nodes
*
* history:
* - started with crossproduct of Javascript, Python, PHP, Java, and C
* (and a bit of OCaml) after wanting to port checked_return from Js to
* Python and got the idea to factorize things
*
* INVARIANTS:
* - all the other_xxx types should contain only simple constructors (enums)
* without any parameter. I rely on that to simplify the code
* of the generic mapper and matcher.
* update: prefer todo_kind to those other_xxx types now.
* Same for keyword_attributes.
* - each expression or statement must have at least one token in it
* so that semgrep can track a location (e.g., 'Return of expr option'
* is not enough because with no expr, there is no location information
* for this return, so it must be 'Return of tok * expr option' instead)
* - each expression or statement should ideally have enough tokens in it
* to get its range, so at least the leftmost and rightmost token in
* all constructs, so the Return above should even be
* 'Return of tok * expr option * tok' for the ending semicolon
* (alt: have the range info in each expr/stmt/pattern but big refactoring)
* - to correctly compute a CFG (Control Flow Graph), the stmt type
* should list all constructs that contains other statements and
* try to avoid to use the very generic 'OtherXxx of any'
* - to correctly compute a DFG (Data Flow Graph), and to correctly resolve
* names (see Naming_AST.ml), each constructs that introduce a new
* variable should have a relevant comment 'newvar:'
* - to correctly resolve names, each construct that introduces a new scope
* should have a relevant comment 'newscope:'
* - todo? each language should add the VarDefs that defines the locals
* used in a function (instead of having the first Assign play the role
* of a VarDef, as done in Python for example).
*)
(* Provide hash_* and hash_fold_* for the core ocaml types *)
open Ppx_hash_lib.Std.Hash.Builtin
(* ppx_hash refuses to hash mutable fields but we do it anyway. *)
let hash_fold_ref hash_fold_x acc x = hash_fold_x acc !x
(*****************************************************************************)
(* Token (leaf) *)
(*****************************************************************************)
(* Contains among other things the position of the token through
* the Parse_info.token_location embedded inside it, as well as the
* transformation field that makes possible spatch on the code.
*)
type tok = Parse_info.t [@@deriving show]
(* sgrep: we do not care about position when comparing for equality 2 ASTs.
* related: Lib_AST.abstract_position_info_any and then use OCaml generic '='.
*)
let equal_tok _t1 _t2 = true
let hash_tok _t = 0
let hash_fold_tok acc _t = acc
(* a shortcut to annotate some information with position information *)
type 'a wrap = 'a * tok [@@deriving show, eq, hash]
(* Use for round(), square[], curly{}, and angle<> brackets.
* note: in theory we should not care about those tokens in an AST,
* but they are useful to report correct ranges in sgrep when we match
* something that can just be those brackets (e.g., an empty container).
*)
type 'a bracket = tok * 'a * tok [@@deriving show, eq, hash]
(* semicolon, a FakeTok in languages that do not require them (e.g., Python).
* alt: tok option.
* See the sc value also at the end of this file to build an sc.
*)
type sc = tok [@@deriving show, eq, hash]
(* an AST element not yet handled.
* history: I started by having some precise OtherXxx of other_xxx
* constructors and types to record what was not handled
* (e.g., OE_Delete, OE_Define, etc.), but it was quickly getting tedious
* each time to add new constructs. In fact, in the language-specific
* ASTs I started to use also some Todo constructs, so I switched to a
* more general todo_kind in the generic AST too. Anyway, we were
* not doing anything with the precise information. If something
* is important and require some semantic equivalence in semgrep, then
* we should support the construct directly, not via an other_xxx.
*)
type todo_kind = string wrap [@@deriving show, eq, hash]
(*****************************************************************************)
(* Names *)
(*****************************************************************************)
type ident = string wrap [@@deriving show, eq, hash]
(* Usually separated by a '.', but can be used also with '::' separators.
* less: we often need to get the last elt or adjust the qualifier part,
* so maybe we should define it as = ident list * ident
*)
type dotted_ident = ident list (* at least 1 element *)
[@@deriving show, eq, hash]
(* module_name can also be used for a package name or a namespace.
* TODO? prefix with M and add MQualifiedName for C++?
* less: can even be dynamic in C with #include of expr
*)
type module_name =
| DottedName of dotted_ident (* ex: Python *)
(* in FileName the '/' is similar to the '.' in DottedName *)
| FileName of string wrap (* ex: Js import, C #include, Go import *)
[@@deriving show { with_path = false }, eq, hash]
(* A single unique id: sid (uid would be a better name, but it usually
* means "user id" for people).
*
* This single id simplifies further analysis which need less to care about
* maintaining scoping information, for example to deal with variable
* shadowing, or functions using the same parameter names
* (even though you still need to handle specially recursive functions), etc.
*
* See Naming_AST.ml for more information.
*
* Most generic ASTs have a fake value (sid_TODO = -1) at first.
* You need to call Naming_AST.resolve (or one of the lang-specific
* Resolve_xxx.resolve) on the generic AST to set it correctly.
*)
(* a single unique gensym'ed number. See gensym() below *)
type sid = int
and resolved_name = resolved_name_kind * sid
and resolved_name_kind =
(* Global is useful in codemap/efuns to highlight differently and warn
* about the use of globals inside functions.
* old: Global was merged with ImportedEntity before but simpler to split, as
* anyway I was putting often an empty list for dotted_ident with a
* todo note in the code.
*)
| Global
(* Those could be merged, but again this is useful in codemap/efuns *)
| Local
| Param
(* For closures; can refer to a Local or Param.
* With sid this is potentially less useful for scoping-related issues,
* but this can be useful in codemap to again highlight specially
* enclosed vars.
* todo: this is currently used also for fields, but we should use another
* constructor.
* Note that it's tempting to add a depth parameter to EnclosedVar, but
* that would prevent semgrep to work because whatever the depth you are,
* if you reference the same entity, this entity must have the same
* resolved_name (sid and resolved_name_kind).
*)
| EnclosedVar (* less: add depth? *)
(* sgrep: those cases allow to match entities/modules even if they were
* aliased when imported.
* both dotted_ident must at least contain one element *)
| ImportedEntity of dotted_ident (* can also use 0 for gensym *)
| ImportedModule of module_name
(* used in Go, where you can pass types as arguments and where we
* need to resolve those cases
*)
| TypeName
(* used for C *)
| Macro
| EnumConstant
[@@deriving show { with_path = false }, eq, hash]
(* Start of big mutually recursive types because of the use of 'any'
* in OtherXxx *)
(* old: Id below used to be called Name and was generalizing also IdQualified
* but some analysis are easier when they just need to
* handle a simple Id, hence the split. For example, there was some bugs
* in sgrep because sometimes an identifier was an ident (in function header)
* and sometimes a name (when called). For naming, we also need to do
* things differently for Id vs IdQualified and would need many times to
* inspect the name.name_qualifier to know if we have an Id or IdQualified.
* We do the same split for Fid vs FName for fields.
*
* newvar: Id is sometimes abused to also introduce a newvar (as in Python)
* but ultimately those cases should be rewritten to first introduce
* a VarDef.
*
* DEBT? Sometimes some DotAccess should really be transformed in IdQualified
* with a better qualifier because the obj is actually the name of a package
* or module, but you may need advanced semantic information and global
* analysis to disambiguate. In the meantime, you can use
* AST_generic_helpers.name_of_dot_access to convert a DotAccess of idents
* into an IdQualified name.
*)
type name = Id of ident * id_info | IdQualified of qualified_info
(* A qualified (via type arguments or module/namespace/package) id.
* The type should be enough to represent Java/Rust/C++ generics.
* less: it is still not enough to represent OCaml functors applications.
*
* invariant: you can't have name_top = None, name_middle = QNone, and
* name_last = (id * None) at the same time. If that's the case, then we
* build an Id, not an Idqualified
*)
and qualified_info = {
name_last : ident * type_arguments option;
name_middle : qualifier option;
(* ::, Ruby, C++, also '`' abuse for PolyVariant in OCaml *)
name_top : tok option;
name_info : id_info;
}
and qualifier =
(* Java/C++/Rust *)
| QDots of (ident * type_arguments option) list
(* Ruby/Lua *)
| QExpr of expr * tok
(*****************************************************************************)
(* Naming/typing *)
(*****************************************************************************)
and id_info = {
id_resolved : resolved_name option ref;
(* variable tagger (naming) *)
(* sgrep: in OCaml we also use that to store the type of
* a typed entity, which can be interpreted as a TypedMetavar in semgrep.
* alt: have an explicity type_ field in entity.
*)
id_type : type_ option ref;
(* type checker (typing) *)
(* sgrep: this is for sgrep constant propagation hack.
* todo? associate only with Id?
* note that we do not use the constness for equality (hence the adhoc
* @equal below) because the constness analysis is now controlflow-sensitive
* meaning the same variable might have different id_constness value
* depending where it is used.
*)
id_constness : constness option ref; [@equal fun _a _b -> true]
(* THINK: Drop option? *)
(* id_hidden=true must be set for any artificial identifier that never
appears in source code but is introduced in the AST after parsing.
Don't use this for syntax desugaring or transpilation because the
resulting function name might exist in some source code. Consider the
following normalization:
!foo -> foo.contents
^^^^^^^^
should not be marked as hidden because it could appear
in target source code.
However, an artificial identifier like "!sh_quoted_expand!" should
be marked as hidden in bash.
This allows not breaking the -fast/-filter_irrelevant_rules optimization
that skips a target file if some identifier in the pattern AST doesn't
exist in the source of the target.
*)
id_hidden : bool;
}
(*****************************************************************************)
(* Expression *)
(*****************************************************************************)
(* todo? we could store more semantic information at each expr node,
* e.g., type information, constant evaluation.
*)
and expr = {
e : expr_kind;
e_id : int;
(* used to quickly get the range of an expression *)
mutable e_range :
(Parse_info.token_location * Parse_info.token_location) option;
[@equal fun _a _b -> true] [@hash.ignore]
}
and expr_kind =
(* basic (atomic) values *)
| L of literal
(* composite values
* TODO? element list bracket, so can encode KeyVal, Designator,
* IndexArray for C++.
* ArrayInitDesignator [x] = ... use ArrayAccess in container?
*)
| Container of container_operator * expr list bracket
| Comprehension of container_operator * comprehension bracket
(* And-type (field.vinit should be a Some) *)
| Record of field list bracket
(* Or-type (could be used instead of Container, Cons, Nil, etc.).
* (ab)used also for polymorphic variants where qualifier is QTop with
* the '`' token.
*
* Note that in OCaml constructors do not always need brackets.
* For example, you can have 'let x = Foo 1'. However, you often
* need the parenthesis, for example when you have multiple arguments
* (e.g., 'let x = Foo (1,2)').
* In that case, we could make '(1,2)' a single Tuple argument, and
* that is mostly what is done in ast_ml.ml, but this is a bit ugly.
* Morever, even with a single argument, you need extra parenthesis
* if the argument is a complex expression (e.g., 'let x = Foo(1+2)').
* And if those parenthesis are not in the AST, then matching range
* or autofix on such construct may fail.
* Finally, in some languages like Scala or Rust, constructors require
* the parenthesis.
* Thus, it is simpler to add the bracket here, because this is mostly
* how user think.
*)
| Constructor of name * expr list bracket
(* see also Call(IdSpecial (New,_), [ArgType _;...] for other values *)
| N of name
| IdSpecial of special wrap (*e: [[AST_generic.expr]] other identifier cases *)
(* operators and function application *)
| Call of expr * arguments
(* TODO? Separate regular Calls from OpCalls where no need bracket and Arg *)
(* (XHP, JSX, TSX), could be transpiled also (done in IL.ml?) *)
| Xml of xml
(* IntepolatedString of expr list is simulated with a
* Call(IdSpecial (Concat ...)) *)
(* The left part should be an lvalue (Id, DotAccess, ArrayAccess, Deref)
* but it can also be a pattern (Container, even Record), but
* you should really use LetPattern for that.
* Assign can also be abused to declare new variables, but you should use
* variable_definition for that.
* less: should be in stmt, but most languages allow this at expr level :(
* todo: see IL.ml where we normalize this AST with expr/instr/stmt
* update: should even be in a separate simple_stmt, as in Go
*)
| Assign of
expr * tok (* '=', '<-' in OCaml. ':=' Go is AssignOp (Eq) *) * expr
(* less: could desugar in Assign, should be only binary_operator *)
| AssignOp of expr * operator wrap * expr
(* newvar:! newscope:? in OCaml yes but we miss the 'in' part here *)
| LetPattern of pattern * expr (*e: [[AST_generic.expr]] other assign cases *)
(* can be used for Record, Class, or Module access depending on expr.
* In the last case it should be rewritten as a (N IdQualified) with a
* qualifier though.
* TODO? have a dot_operator to differentiate ., .?, and :: in Kotlin?
*)
| DotAccess of expr * tok (* ., ::, ->, # *) * field_name
(* in Js ArrayAccess is also abused to perform DotAccess (..., FDynamic) *)
| ArrayAccess of expr * expr bracket
(* could also use ArrayAccess with a Tuple rhs, or use a special *)
| SliceAccess of
expr
* (expr option (* lower *) * expr option (* upper *) * expr option)
(* step *)
bracket
(* very special value. 'fbody' is usually an FExpr. *)
| Lambda of function_definition
(* also a special value. Usually an argument of a New
* (used in Java, Javascript, etc.) *)
| AnonClass of class_definition
(* a.k.a ternary expression. Note that even in languages like OCaml
* where 'if's are expressions, we still prefer to use the stmt 'If'
* because it allows an optional else part. We need to sometimes
* wrap those stmts inside an OE_StmtExpr though.
* TODO: add toks? TODO? in C++ the second expr can be an option
*)
| Conditional of expr * expr * expr
| Yield of tok * expr option * bool (* 'from' for Python *)
| Await of tok * expr
(* Send/Recv of Go are currently in OtherExpr *)
| Cast of type_ * tok (* ':' or leftmost '(' or 'as' *) * expr
(* less: should be in statement *)
| Seq of expr list (* at least 2 elements *)
(* less: could be in Special, but pretty important so I've lifted them here*)
| Ref of tok (* &, address of *) * expr
| DeRef of tok (* '*' in C, '!' or '<-' in OCaml, ^ in Reason *) * expr
(* In some rare cases, we need to keep the parenthesis around an expression
* otherwise in autofix semgrep could produce incorrect code. For example,
* in Go a cast int(3.0) requires the parenthesis.
* alt: change cast to take a expr bracket, but this is used only for Go.
* Note that this data structure is really becoming more a CST than an AST.
*)
| ParenExpr of expr bracket
(* sgrep: ... in expressions, args, stmts, items, and fields
* (and unfortunately also in types in Python) *)
| Ellipsis of tok (* '...' *)
| DeepEllipsis of expr bracket (* <... ...> *)
| DisjExpr of expr * expr
| TypedMetavar of ident * tok (* : *) * type_
(* for ellipsis in method chaining *)
| DotAccessEllipsis of expr * tok (* '...' *)
(* Dual of ExprStmt. See stmt_to_expr() below and its comment.
* OCaml/Ruby/Scala/... have just expressions, not separate statements.
*)
| StmtExpr of stmt
(* e.g., TypeId in C++, MethodRef/ClassLiteral in Java, Send/Receive in Go,
* Checked/Unchecked in C#, Repr in Python, RecordWith in OCaml/C#,
* Subshell in Ruby, Delete/Unset in JS/Hack/Solidity/C++,
* Unpack/ArrayAppend in PHP (the AST for $x[] = 1 used to be
* handled as an AssignOp with special Append).
* Define/Arguments/NewTarget/YieldStar/Exports/Module/Require/UseStrict JS,
* UnitLiteral/HexString/UnicodeString/TupleHole/StructExpr in Solidity
* TODO? lift up to program attribute/directive UseStrict, Require in Import?
* TODO? of replace 'any list' by 'expr list'? any way there's still
* StmtExpr above to wrap stmt if it's not an expr but a stmt
*)
| OtherExpr of todo_kind * any list
and literal =
| Bool of bool wrap
(* the numbers are an option because OCaml numbers
* may not be able to represent all numbers. For example, OCaml integers
* are limited to 63 bits, but C integers can use 64 bits.
*)
| Int of int option wrap
| Float of float option wrap
| Char of string wrap
| String of string wrap (* TODO? bracket, ', ", or even """ *)
| Regexp of string wrap bracket (* // *) * string wrap option (* modifiers *)
| Atom of tok (* ':' in Ruby, ''' in Scala *) * string wrap
| Unit of tok
(* a.k.a Void *)
| Null of tok
| Undefined of tok (* JS *)
| Imag of string wrap
(* Go, Python *)
| Ratio of string wrap
(* The type of an unknown constant. *)
and const_type = Cbool | Cint | Cstr | Cany
(* set by the constant propagation algorithm and used in semgrep *)
and constness = Lit of literal | Cst of const_type | NotCst
and container_operator =
| Array (* todo? designator? use ArrayAccess for designator? *)
| List
| Set
(* a.k.a Hash or Map (combine with Tuple to get Key/value pair) *)
(* TODO? merge with Record *)
| Dict
(* Tuples usually contain at least 2 elements, except for Python where
* you can actually have 1-uple, e.g., '(1,)'.
*)
| Tuple
(* For Python/HCL (and Haskell later). The 'expr' can be a 'Tuple' to
* represent a Key/Value pair (like in Container). See keyval() below.
* newscope: for_or_if_comp introduce new local variables whose scope
* is just the first expr.
*)
and comprehension = expr * for_or_if_comp list
(* at least one element *)
and for_or_if_comp =
(* newvar: *)
| CompFor of tok (*'for'*) * pattern * tok (* 'in' *) * expr
| CompIf of tok (*'if'*) * expr
and field_name =
(* In the case of a field, it may be hard to resolve the id_info inside name.
* For example, a method id can refer to many method definitions.
* But for certain things, like a private field, we can resolve it
* (right now we use an EnclosedVar for those fields).
*
* The IdQualified inside name is Useful for OCaml field access.
*)
| FN of name
(* for PHP/JS fields (even though JS use ArrayAccess for that), or Ruby
* or C++ ArrowStarAccess ->*
*)
| FDynamic of expr
(* It's useful to keep track in the AST of all those special identifiers.
* They need to be handled in a special way by certain analysis and just
* using Name for them would be error-prone.
* Note though that by putting all of them together in a type, we lose
* typing information. For example, Eval takes only one argument and
* InstanceOf takes a type and an expr. This is a tradeoff to also not
* polluate too much expr with too many constructs.
* TODO: split in IdSpecial of special_id and CallSpecial of special_op
* and then also just CallOp. And also a separate InterpolatedString.
*)
and special =
(* special vars *)
| This
| Super (* called 'base' in C# *)
(* less: how different self/parent is from this/super? *)
| Self
| Parent
(* for Lua, todo: just remove it, create Dict without key *)
| NextArrayIndex
(* special calls *)
| Eval
| Typeof (* for C? and Go in switch x.(type) *)
(* todo? make a SpecialType, also use for ClassLiteral *)
| Instanceof
| Sizeof (* takes a ArgType *)
| Defined (* defined? in Ruby, other? *)
(* Note that certain languages do not have a 'new' keyword
* (e.g., Python, Scala 3), instead certain 'Call' are really 'New'.
* Note that 'new' by itself is not a valid expression
*)
| New (* usually associated with Call(New, [ArgType _;...]) *)
(* used for interpolated strings constructs
* TODO: move out of 'special' and make special construct InterpolatedConcat
* in 'expr' instead of abusing Call for that? that way can also
* avoid those InterpolatedElement stuff.
*)
| ConcatString of concat_string_kind
| EncodedString of string (* only for Python for now (e.g., b"foo") *)
(* TaggedString? for Javascript, for styled.div`bla{xx}`?
* We could have this TaggedString where the first arg of Call
* will be the tagging function, and the rest will be a Call ConcatString.
* However, it is simpler to just transform those special calls as
* regular calls even though they do not have parenthesis
* (not all calls have parenthesis anyway, as in OCaml or Ruby).
*)
(* Use this to separate interpolated elements in interpolated strings
* but this is a bit of a hack.
* TODO: We should probably add InterpolatedConcat as an expression
*)
| InterpolatedElement
(* "Inline" the content of a var containing a list (a.k.a Splat in Ruby).
* Used in a Container or Call argument context.
* The corresponding constructor in a parameter context is ParamRest.
*)
| Spread (* ...x in JS, *x in Python/Ruby *)
(* Similar to Spread, but for a var containing a hashtbl.
* The corresponding constructor in a parameter context is ParamHashSplat.
*)
| HashSplat (* **x in Python/Ruby
* (not to confused with Pow below which is a Binary op *)
| ForOf (* Javascript, for generators, used in ForEach *)
(* used for unary and binary operations
* TODO: move out of special too, in separate OpCall? (where can also
* have 1 or 2 argument (or maybe even 0 for op reference?)
*)
| Op of operator
(* less: should be lift up and transformed in Assign at stmt level *)
| IncrDecr of (incr_decr * prefix_postfix)
(* mostly binary operators.
* less: could be divided in really Arith vs Logical (bool) operators,
* but see is_boolean_operator() helper below.
* Note that Mod can be used for %style string formatting in Python.
* Note that Plus can also be used for string concatenations in Go/??.
* todo? use a Special operator intead for that? but need type info?
*)
and operator =
(* unary too *)
| Plus
(* unary too *)
| Minus
| Mult
| Div
| Mod
| Pow (* ** binary op; for unary see HashSplat above *)
| FloorDiv
| MatMult (* Python *)
| LSL
| LSR
| ASR (* L = logic, A = Arithmetic, SL = shift left *)
| BitOr
| BitXor
| BitAnd
(* unary too *)
| BitNot
| BitClear (* Go *)
(* And/Or are also shortcut operator.
* todo? rewrite in CondExpr? They have a special behavior.
*)
| And
| Or
(* PHP has a xor shortcut operator ... hmmm *)
| Xor
(* unary *)
| Not
| Eq (* '=' in OCaml, '==' in Go/... *)
| NotEq (* less: could be desugared to Not Eq *)
| PhysEq (* '==' in OCaml, '===' in JS/... *)
| NotPhysEq (* less: could be desugared to Not PhysEq *)
| Lt
| LtE
| Gt
| GtE (* less: could be desugared to Or (Eq Lt) *)
| Cmp (* <=>, PHP *)
| Concat (* '.' PHP, '..' Lua *)
| Append (* x[] = ... in PHP, just in AssignOp *)
| RegexpMatch (* =~, Ruby (and Perl) *)
| NotMatch (* !~ Ruby less: could be desugared to Not RegexpMatch *)
| Range (* .. or ..., Ruby, one arg can be nil for endless range *)
| RangeInclusive (* '..=' in Rust *)
| NotNullPostfix (* ! in Typescript, postfix operator *)
| Length (* '#' in Lua *)
(* See https://en.wikipedia.org/wiki/Elvis_operator.
* In PHP we currently generate a Conditional instead of a Binary Elvis.
* It looks like the Nullish operator is quite similar to the Elvis
* operator, so we may want to merge those operators at some point.
*)
| Elvis (* ?: in Kotlin, can compare possible null value *)
| Nullish (* ?? in Javascript *)
| In (* in: checks that value belongs to a collection *)
| NotIn (* !in *)
(* Is (and NotIs) checks whether a value has a certain type.
* The second argument in OpCall is always an ArgType!
* Can also be used as an unary operation in Kotlin in a 'when'
*)
| Is
| NotIs
(* Shell & and | *)
| Background
| Pipe
(* '++', '--' *)
and incr_decr = Incr | Decr
and prefix_postfix = Prefix | Postfix
and concat_string_kind =
(* many languages do not require a special syntax to use interpolated
* strings e.g. simply "this is {a}". Javascript uses backquotes.
*)
| InterpolatedConcat (* Javascript/PHP/Ruby/Perl *)
(* many languages have a binary Concat operator to concatenate strings,
* but some languages also allow the simple juxtaposition of multiple
* strings to be concatenated, e.g. "hello" "world" in Python.
*)
| SequenceConcat (* Python/C *)
(* Python requires the special f"" syntax to use interpolated strings,
* and some semgrep users may want to explicitely match only f-strings,
* which is why we record this information here.
* update: also use for interpolated Scala strings
* TODO: add of string ('f' or something else)
*)
| FString
(* Javascript uses a special syntax called tagged template literals, e.g.,
* foo`template string = ${id}`. We must use a different representation
* for foo(`template string = ${id}`) because some semgrep users want
* to find one and not the other.
* In both case it will be inside Call (Apply (ConcatString ... but then
* the kind will differ.
* example: foo`template ${id}`
*)
| TaggedTemplateLiteral
(* This is for JSX/TSX in javascript land (old: and XHP in PHP land).
* less: we could make it more generic by adding a 'expr so it could be
* reused in ast_js.ml, ast_php.ml
*)
and xml = {
xml_kind : xml_kind;
xml_attrs : xml_attribute list;
xml_body : xml_body list;
}
and xml_kind =
| XmlClassic of tok (*'<'*) * ident * tok (*'>'*) * tok (*'</foo>'*)
| XmlSingleton of tok (*'<'*) * ident * tok (* '/>', with xml_body = [] *)
(* React/JS specific *)
| XmlFragment of tok (* '<>' *) * (* '</>', with xml_attrs = [] *) tok
and xml_attribute =
| XmlAttr of ident * tok (* = *) * a_xml_attr_value
(* less: XmlAttrNoValue of ident. <foo a /> <=> <foo a=true /> *)
(* jsx: usually a Spread operation, e.g., <foo {...bar} /> *)
| XmlAttrExpr of expr bracket
(* sgrep: *)
| XmlEllipsis of tok
(* either a String or a bracketed expr, but right now we just use expr *)
and a_xml_attr_value = expr
and xml_body =
(* sgrep-ext: can contain "..." *)
| XmlText of string wrap
(* this can be None when people abuse {} to put comments in it *)
| XmlExpr of expr option bracket
| XmlXml of xml
(* brackets can be fake '()' for OCaml/Ruby *)
and arguments = argument list bracket
and argument =
(* regular argument *)
| Arg of expr (* can be Call (IdSpecial Spread, Id foo) *)
(* keyword argument *)
| ArgKwd of ident * expr
(* type argument for New, instanceof/sizeof/typeof, C macros *)
| ArgType of type_
(* e.g., ArgMacro for C/Rust, ArgQuestion for OCaml, ArgIds in Solidity *)
| OtherArg of todo_kind * any list
(*****************************************************************************)
(* Statement *)
(*****************************************************************************)
and stmt = {
s : stmt_kind;
[@equal AST_utils.equal_stmt_field_s equal_stmt_kind] [@hash.ignore]
(* this can be used to compare and hash more efficiently stmts,
* or in semgrep to quickly know if a stmt is a children of another stmt.
*)
s_id : AST_utils.Node_ID.t; [@equal AST_utils.equal_stmt_field_s_id]
(* todo? we could store a range: (tok * tok) to delimit the range of a stmt
* which would allow us to remove some of the extra 'tok' in stmt_kind.
* Indeed, the main use of those 'tok' is to accurately report a match range
* in semgrep.
*)
mutable s_use_cache : bool; [@equal fun _a _b -> true] [@hash.ignore]
(* whether this is a strategic point for match result caching.
This field is relevant for patterns only.
This applies to the caching optimization, in which the results of
matching lists of statements can be cached. A list of statements
is identified by its leading node. In the current implementation,
the fields 's_id', 's_use_caching', and 's_backrefs' are treated as
properties of a (non-empty) list of statements, rather than of individual
statements. A cleaner implementation would consist of a custom
list type in which each list has these properties, including the
empty list.
*)
mutable s_backrefs : AST_utils.String_set.t option;
[@equal fun _a _b -> true] [@hash.ignore]
(* set of metavariables referenced in the "rest of the pattern", as
determined by matching order.
This field is relevant for patterns only.
This is used to determine which of the bound
metavariables should be added to the cache key for this node.
This field is set on pattern ASTs only, in a pass right after parsing
and before matching.
*)
(* used in semgrep to skip some AST matching *)
mutable s_bf : Bloom_filter.t option;
[@equal fun _a _b -> true] [@hash.ignore]
(* used to quickly get the range of a statement *)
mutable s_range :
(Parse_info.token_location * Parse_info.token_location) option;
[@equal fun _a _b -> true] [@hash.ignore]
}
and stmt_kind =
(* See also IL.ml where Call/Assign/Seq are not in expr and where there are
* separate expr, instr, and stmt types *)
| ExprStmt of expr * sc (* fake tok in Python, but also in JS/Go with ASI *)
(* newscope: in C++/Java/Go *)
| Block of stmt list bracket (* can be fake {} in Python where use layout *)
(* EmptyStmt = Block [], or separate so can not be matched by $S? $
* see also emptystmt() at the end of this file.
*)
(* newscope: for vardef in condition in C++/Go/... *)
| If of tok (* 'if' or 'elif' *) * condition * stmt * stmt option
| While of tok * condition * stmt
| Return of tok * expr option * sc
| DoWhile of tok * stmt * expr
(* newscope: *)
| For of tok (* 'for', 'foreach'*) * for_header * stmt
(* The expr can be None for Go and Ruby.
* less: could be merged with ExprStmt (MatchPattern ...) *)
| Switch of
tok (* 'switch', also 'select' in Go, or 'case' in Bash *)
* condition option
* case_and_body list (* TODO brace *)
(* todo: merge Match with Switch?
* In Scala and C# the match is infix (after the expr)
*)
| Match of tok * expr * action list
| Continue of tok * label_ident * sc
| Break of tok * label_ident * sc
(* todo? remove stmt argument? more symetric to Goto *)
| Label of label * stmt
| Goto of tok * label * sc (* less: use label_ident for computed goto in C*)
(* TODO? move in expr! in C++ the expr can be an option *)
| Throw of tok (* 'raise' in OCaml, 'throw' in Java/PHP *) * expr * sc
| Try of tok * stmt * catch list * finally option
| WithUsingResource of
tok (* 'with' in Python, 'using' in C# *)
* stmt (* resource acquisition *)
* stmt (* newscope: block *)
(* old: was 'expr * expr option' for Python/Java, but better to generalize.
* alt: could move in expr and have Assert be an IdSpecial
*)
| Assert of tok * arguments * sc
(* TODO? move this out of stmt and have a stmt_or_def_or_dir in Block?
* or an item list where item is a stmt_or_def_or_dir (as well as field)
*)
| DefStmt of definition
| DirectiveStmt of directive
(* sgrep: *)
| DisjStmt of stmt * stmt
(* This is important to correctly compute a CFG. The any should not
* contain any stmt! *)
| OtherStmtWithStmt of other_stmt_with_stmt_operator * any list * stmt
(* any here should _not_ contain any statement! otherwise the CFG will be
* incorrect and some analysis (e.g., liveness) will be incorrect.
* TODO: other_stmt_operator wrap, so enforce at least one token instead
* of relying that the any list contains at least one token
*)
| OtherStmt of other_stmt_operator * any list
(* TODO: can also introduce var in some languages
* less: factorize with for_var_or_expr
*)
and condition =
| Cond of expr
(* e.g., CondWithDecl, CondDecl in C++ *)
| OtherCond of todo_kind * any list
(* newscope: *)
(* less: could merge even more with pattern
* list = PatDisj and Default = PatUnderscore,
* so case_and_body of Switch <=> action of MatchPattern
*)
and case_and_body =
| CasesAndBody of (case list * stmt)
(* sgrep: *)
| CaseEllipsis of (* ... *) tok
(* less: we use expr_to_pattern for many languages to build a Case so
* maybe we should instead have a CaseExpr and CasePattern?
*)
and case =
| Case of tok * pattern
(* less: could unsugar as Case (PatUnderscore _) *)
| Default of tok
(* For Go, expr can contain some Assign bindings.
* todo? could merge with regular Case? can 'case x := <-chan' be
* transformed in a pattern?
*)
| CaseEqualExpr of tok * expr
(* e.g., CaseRange for C++ *)
| OtherCase of todo_kind * any list
(* todo: merge with case at some point *)
(* newscope: newvar: *)
and action = pattern * expr
(* newvar: newscope: usually a PatVar *)
and catch = tok (* 'catch', 'except' in Python *) * catch_exn * stmt
(* alt: we could reuse parameter, which has a ParamPattern and ParamClassic *)
and catch_exn =
| CatchPattern of pattern
(* for Java/C++/PHP/etc.
* old: PatVar of type_ * (ident * id_info) option
* and was in pattern as PatVar, but better to move out of pattern.
* alt: we could abuse pattern and use PatTyped, but ugly.
*)
| CatchParam of parameter_classic
(* e.g., CatchEmpty/CatchParams in Solidity *)
| OtherCatch of todo_kind * any list
(* ptype should never be None *)
(* newscope: *)
and finally = tok (* 'finally' *) * stmt
and label = ident
and label_ident =
| LNone (* C/Python *)
| LId of label (* Java/Go/Kotlin *)
| LInt of int wrap (* PHP *)
(* PHP, woohoo, dynamic break! bailout for CFG, also in C gccext: goto *)
| LDynamic of expr
(* todo? put also user-defined iterators here? like MacroIteration in C++ *)
and for_header =
(* todo? copy Go and have 'of simple option * expr * simple option'? *)
| ForClassic of
for_var_or_expr list (* init *) * expr option (* cond *) * expr option (* next *)
(* newvar: *)
| ForEach of
pattern * tok (* 'in' Python, 'range' Go, 'as' PHP, '' Java *) * expr (* pattern 'in' expr *)
(* Lua. todo: merge with ForEach? *)
(* pattern 'in' expr *)
| ForIn of for_var_or_expr list (* init *) * expr list
(* sgrep: *)
| ForEllipsis of (* ... *) tok
and for_var_or_expr =
(* newvar: *)
| ForInitVar of entity * variable_definition
(* less: should rename ForInitAssign really *)
| ForInitExpr of expr
and other_stmt_with_stmt_operator =
(* Python/Javascript *)
(* TODO: used in C# with 'Using', make new stmt TryWithResource? do Java?*)
| OSWS_With (* newscope: newvar: in OtherStmtWithStmt with LetPattern *)
(* Ruby *)
| OSWS_BEGIN
| OSWS_END (* also in Awk, Perl? *)
| OSWS_Else_in_try
(* Rust *)
| OSWS_UnsafeBlock
| OSWS_AsyncBlock
| OSWS_ConstBlock