From 878e710fb690a2f53376df387f05bb644b3c823f Mon Sep 17 00:00:00 2001 From: Brandon Wu <49291449+brandonspark@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:56:22 -0700 Subject: [PATCH] fix(kotlin): classes with newlines (#476) ## What: This PR makes it so we can parse Kotlin classes that have a newline between the class identifier and constructor. ## Why: Parse rate. ## How: The problem is in cases like: ``` class Foo constructor Bar() { ... } ``` Here, we insert an automatic semicolon between `Foo` and `constructor`. This leaves us able to parse `class Foo` as a `class_declaration`, but `constructor Bar ...` is not allowed on its own. We simply allow `constructor Bar ...` to be a standalone statement, and resolve to stitch them together at parsing time. ### Testing Tested with `make test` and `test-lang kotlin` ### Security - [ ] Change has no security implications (otherwise, ping the security team) --------- Co-authored-by: Amarin Phaosawasdi --- .../src/semgrep-kotlin/grammar.js | 41 +++- .../semgrep-kotlin/test/corpus/semgrep.txt | 179 ++++++++++++++---- 2 files changed, 184 insertions(+), 36 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index 17df70a..fc30abb 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -6,7 +6,7 @@ */ // INVARIATN: Make sure that you are merging any commits into the `semgrep` -// branch of `tree-sitter-kotlin`! This is because our version of +// branch of `tree-sitter-kotlin`! This is because our version of // `tree-sitter-kotlin` is forked from the original repository, and we // want our branch to be kept separate. @@ -40,7 +40,7 @@ module.exports = grammar(standard_grammar, { typed_metavar: $ => seq( "(", $.simple_identifier, ":", $._type, ")" - ), + ), // Statement ellipsis: '...' not followed by ';' _expression: ($, previous) => { @@ -74,13 +74,48 @@ module.exports = grammar(standard_grammar, { ); }, + // We would like to be able to parse programs which have a newline between the + // class name and the constructor: + // class Foo + // constructor Bar() { ... } + + // The problem is that the Kotlin parser inserts a semicolon after "Foo", making + // it such that we get interrupted in the middle of the class_declaration. + // To make it so we can continue, we allow everything after the class identifier + // to be a standalone statement in its own right. This way, we can parse both parts + // individually, and stitch them together at parsing time. + + // We only need to amend statements here, because the consumers of _declaration are + // only class_member_declaration, top_level_object and _statement. + // The former has `secondary_constructor`, which already looks like what we want to + // add, and the second seems to be unused. + // So we just need to fix _statement. + + // A more proper fix would likely require changes to the external scanner to properly + // handle automatic semicolon insertion, which is the cause of this whole issue. + // We filed an issue to tree-sitter-kotlin to track this: + // https://github.com/fwcd/tree-sitter-kotlin/issues/75 + _statement: ($, previous) => choice( + ...previous.members, + $.partial_class_declaration, + ), + + partial_class_declaration: $ => prec.left(seq( + optional($.type_parameters), + seq(optional($.modifiers), "constructor"), + $._class_parameters, + optional(seq(":", $._delegation_specifiers)), + optional($.type_constraints), + optional($.class_body) + )), + class_parameter: ($, previous) => { return choice( previous, $.ellipsis ); }, - + deep_ellipsis: $ => seq( '<...', $._expression, '...>' ), diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt index f12c8c4..b4f6687 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt +++ b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt @@ -32,7 +32,6 @@ class $CLASS { (value_argument (integer_literal))))))))))) - ===================================== Typed Metavariables ===================================== @@ -54,7 +53,7 @@ val x = ($X : int) Ellipsis Metavariable ===================================== -$...X +$...X val x = 2 --- @@ -67,7 +66,7 @@ val x = 2 (integer_literal))) ===================================== -Expression Ellipsis +Expression Ellipsis ===================================== class Foo { @@ -89,7 +88,6 @@ class Foo { (statements (ellipsis))))))) - ===================================== Deep Ellipsis ===================================== @@ -132,9 +130,12 @@ class Foo { } --- + (source_file - (class_declaration (type_identifier) - (class_body (ellipsis)))) + (class_declaration + (type_identifier) + (class_body + (ellipsis)))) ===================================== Argument Ellipsis @@ -142,12 +143,18 @@ Argument Ellipsis foo(1, ..., 2) --- + (source_file - (call_expression (simple_identifier) - (call_suffix (value_arguments - (value_argument (integer_literal)) - (value_argument (ellipsis)) - (value_argument (integer_literal)))))) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments + (value_argument + (integer_literal)) + (value_argument + (ellipsis)) + (value_argument + (integer_literal)))))) ===================================== Parameter Ellipsis @@ -157,12 +164,16 @@ fun foo(..., bar: String, ...) {} --- (source_file - (function_declaration (simple_identifier) - (function_value_parameters + (function_declaration + (simple_identifier) + (function_value_parameters (ellipsis) - (parameter (simple_identifier) (user_type (type_identifier))) + (parameter + (simple_identifier) + (user_type + (type_identifier))) (ellipsis)) - (function_body))) + (function_body))) ===================================== Class Parameter Ellipsis @@ -171,11 +182,17 @@ class Foo(...) : Filter { } --- + (source_file - (class_declaration (type_identifier) - (primary_constructor (class_parameter (ellipsis))) - (delegation_specifier (user_type (type_identifier))) - (class_body))) + (class_declaration + (type_identifier) + (primary_constructor + (class_parameter + (ellipsis))) + (delegation_specifier + (user_type + (type_identifier))) + (class_body))) ===================================== Statement Ellipsis @@ -187,15 +204,22 @@ fun foo() { } --- + (source_file - (function_declaration (simple_identifier) + (function_declaration + (simple_identifier) (function_value_parameters) (function_body (statements - (call_expression (simple_identifier) (call_suffix (value_arguments))) - (ellipsis) - (call_expression (simple_identifier) (call_suffix (value_arguments)))) - ))) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments))) + (ellipsis) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments))))))) ===================================== Statement Ellipsis and metavarible @@ -205,11 +229,17 @@ $COOKIE.setValue("") ... --- + (source_file (call_expression - (navigation_expression (simple_identifier) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments (value_argument (string_literal))))) + (navigation_expression + (simple_identifier) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments + (value_argument + (string_literal))))) (ellipsis)) ===================================== @@ -218,14 +248,97 @@ Method chaining Ellipsis obj. ... .foo().bar() --- + (source_file (call_expression (navigation_expression (call_expression (navigation_expression - (navigation_expression (simple_identifier) - (navigation_suffix (ellipsis))) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments))) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments)))) + (navigation_expression + (simple_identifier) + (navigation_suffix + (ellipsis))) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments))) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments)))) + +===================================== +Class with a newline +===================================== + +class BlahClass +constructor(arg: Int) { + private val blah2 = "foo${arg}" +} + +--- + +(source_file + (class_declaration + (type_identifier)) + (partial_class_declaration + (class_parameter + (simple_identifier) + (user_type + (type_identifier))) + (class_body + (property_declaration + (modifiers + (visibility_modifier)) + (variable_declaration + (simple_identifier)) + (string_literal + (interpolated_expression + (simple_identifier))))))) + +===================================== +Class in a class +===================================== + +class Foo { + class BlahClass + constructor(arg: Int) { + private val blah2 = "foo${arg}" + } +} + +--- + +(source_file + (class_declaration + (type_identifier) + (class_body + (class_declaration + (type_identifier)) + (secondary_constructor + (function_value_parameters + (parameter + (simple_identifier) + (user_type + (type_identifier)))) + (statements + (property_declaration + (modifiers + (visibility_modifier)) + (variable_declaration + (simple_identifier)) + (string_literal + (interpolated_expression + (simple_identifier))))))))) + +===================================== +Class without body +===================================== + +class Foo + +--- + +(source_file + (class_declaration + (type_identifier)))