Spark3: Support for LATERAL VIEW clause (#2687)

* allow named argument to accept an iterable * updated to allow for LATERAL VIEW * black * use OneOf instead of list for exclude * use get_segment to define get_eventual_alias * remove unused imports * PR feedback * black * Update src/sqlfluff/dialects/dialect_spark3.py Co-authored-by: Barry Pollard <barry_pollard@hotmail.com> * remove AnyNumberOf AliasExpressionSegments * black * remove two invalid test cases and update LateralViewClauseSegment * refresh yml * updates to LateralViewClauseSegment * exclude spark3 from L026 by default * black Co-authored-by: Barry Pollard <barry_pollard@hotmail.com>
sqlfluff · Feb 18, 2022 · ef27896 · ef27896
1 parent 0307c76
commit ef27896
Show file tree

Hide file tree

Showing 5 changed files with 469 additions and 14 deletions.
diff --git a/src/sqlfluff/core/parser/grammar/anyof.py b/src/sqlfluff/core/parser/grammar/anyof.py
@@ -2,18 +2,18 @@
 
 from typing import List, Optional, Tuple
 
-from sqlfluff.core.parser.helpers import trim_non_code_segments
-from sqlfluff.core.parser.match_result import MatchResult
-from sqlfluff.core.parser.match_wrapper import match_wrapper
-from sqlfluff.core.parser.match_logging import parse_match_logging
 from sqlfluff.core.parser.context import ParseContext
-from sqlfluff.core.parser.segments import BaseSegment, allow_ephemeral
 from sqlfluff.core.parser.grammar.base import (
     BaseGrammar,
     MatchableType,
     cached_method_for_parse_context,
 )
 from sqlfluff.core.parser.grammar.sequence import Sequence, Bracketed
+from sqlfluff.core.parser.helpers import trim_non_code_segments
+from sqlfluff.core.parser.match_logging import parse_match_logging
+from sqlfluff.core.parser.match_result import MatchResult
+from sqlfluff.core.parser.match_wrapper import match_wrapper
+from sqlfluff.core.parser.segments import BaseSegment, allow_ephemeral
 
 
 class AnyNumberOf(BaseGrammar):

diff --git a/src/sqlfluff/dialects/dialect_spark3.py b/src/sqlfluff/dialects/dialect_spark3.py
@@ -181,7 +181,7 @@
         Sequence("CLUSTER", "BY"),
         Sequence("DISTRIBUTE", "BY"),
         Sequence("SORT", "BY"),
-        # TODO Add PIVOT, LATERAL VIEW, and DISTRIBUTE BY clauses
+        # TODO Add PIVOT, and DISTRIBUTE BY clauses
         "HAVING",
         "WINDOW",
         Ref("SetOperatorSegment"),
@@ -1281,7 +1281,7 @@ class UnorderedSelectStatementSegment(BaseSegment):
     parse_grammar = ansi_dialect.get_segment(
         "UnorderedSelectStatementSegment"
     ).parse_grammar.copy(
-        # TODO Insert: PIVOT and LATERAL VIEW clauses
+        # TODO Insert: PIVOT clause
         # Removing non-valid clauses that exist in ANSI dialect
         remove=[Ref("OverlapsClauseSegment", optional=True)]
     )
@@ -1515,6 +1515,36 @@ class SamplingExpressionSegment(BaseSegment):
     )
 
 
+@spark3_dialect.segment()
+class LateralViewClauseSegment(BaseSegment):
+    """A `LATERAL VIEW` clause, as used within a `FROM` clause.
+
+    https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-lateral-view.html
+    """
+
+    type = "lateral_view_clause"
+
+    match_grammar = Sequence(
+        Indent,
+        "LATERAL",
+        "VIEW",
+        Ref.keyword("OUTER", optional=True),
+        Ref("FunctionSegment"),
+        # NB: AliasExpressionSegment is not used here for the table
+        # or column aliases because `AS` is optional within it.
+        # Here the optional table alias is matched without `AS`,
+        # while the column aliases must be introduced by `AS`.
+        Ref("SingleIdentifierGrammar", optional=True),
+        Sequence(
+            "AS",
+            Delimited(
+                Ref("SingleIdentifierGrammar"),
+            ),
+        ),
+        Dedent,
+    )
+
+
 # Auxiliary Statements
 @spark3_dialect.segment()
 class AddExecutablePackage(BaseSegment):
@@ -1726,7 +1756,10 @@ class AliasExpressionSegment(BaseSegment):
             ),
             # just a table alias
             Ref("SingleIdentifierGrammar"),
-            exclude=Ref("JoinTypeKeywords"),
+            exclude=OneOf(
+                "LATERAL",
+                Ref("JoinTypeKeywords"),
+            ),
         ),
     )
 
@@ -1844,3 +1877,32 @@ class FileReferenceSegment(BaseSegment):
         # to match as a `TableReferenceSegment`
         Ref("QuotedIdentifierSegment"),
     )
+
+
+@spark3_dialect.segment(replace=True)
+class FromExpressionElementSegment(BaseSegment):
+    """A table expression.
+
+    Enhanced from ANSI to allow any number of `LATERAL VIEW` clauses.
+    """
+
+    type = "from_expression_element"
+    match_grammar = Sequence(
+        Ref("PreTableFunctionKeywordsGrammar", optional=True),
+        OptionallyBracketed(Ref("TableExpressionSegment")),
+        AnyNumberOf(Ref("LateralViewClauseSegment")),
+        OneOf(
+            Sequence(
+                Ref("AliasExpressionSegment"),
+                Ref("SamplingExpressionSegment"),
+            ),
+            Ref("SamplingExpressionSegment"),
+            Ref("AliasExpressionSegment"),
+            optional=True,
+        ),
+        Ref("PostTableExpressionGrammar", optional=True),
+    )
+
+    get_eventual_alias = ansi_dialect.get_segment(
+        "FromExpressionElementSegment"
+    ).get_eventual_alias
diff --git a/src/sqlfluff/rules/L026.py b/src/sqlfluff/rules/L026.py
@@ -3,19 +3,19 @@
 from typing import cast, List, Optional, Tuple
 
 from sqlfluff.core.dialects.base import Dialect
+from sqlfluff.core.dialects.common import AliasInfo
 from sqlfluff.core.rules.analysis.select_crawler import (
     Query as SelectCrawlerQuery,
     SelectCrawler,
 )
-from sqlfluff.core.dialects.common import AliasInfo
 from sqlfluff.core.rules.base import (
     BaseRule,
     LintResult,
     RuleContext,
     EvalResultType,
 )
-from sqlfluff.core.rules.functional import sp
 from sqlfluff.core.rules.doc_decorators import document_configuration
+from sqlfluff.core.rules.functional import sp
 from sqlfluff.core.rules.reference import object_ref_matches_table
 
 
@@ -31,9 +31,9 @@ class Rule_L026(BaseRule):
     """References cannot reference objects not present in ``FROM`` clause.
 
     .. note::
-       This rule is disabled by default for BigQuery due to its use of
-       structs which trigger false positives. It can be enabled with the
-       ``force_enable = True`` flag.
+       This rule is disabled by default for BigQuery, Hive, Redshift, and Spark3
+       due to the use of structs and lateral views which trigger false positives.
+       It can be enabled with the ``force_enable = True`` flag.
 
     **Anti-pattern**
 
@@ -64,7 +64,7 @@ def _eval(self, context: RuleContext) -> EvalResultType:
         self.force_enable: bool
 
         if (
-            context.dialect.name in ["bigquery", "hive", "redshift"]
+            context.dialect.name in ["bigquery", "hive", "redshift", "spark3"]
             and not self.force_enable
         ):
             return LintResult()

diff --git a/test/fixtures/dialects/spark3/select_from_lateral_view.sql b/test/fixtures/dialects/spark3/select_from_lateral_view.sql
@@ -0,0 +1,48 @@
+SELECT
+    id,
+    name,
+    age,
+    class,
+    address,
+    c_age,
+    d_age
+FROM person
+    LATERAL VIEW EXPLODE(ARRAY(30, 60)) tbl_name AS c_age
+    LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age;
+
+SELECT
+    c_age,
+    COUNT(*) AS record_count
+FROM person
+    LATERAL VIEW EXPLODE(ARRAY(30, 60)) AS c_age
+    LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age
+GROUP BY c_age;
+
+SELECT
+    id,
+    name,
+    age,
+    class,
+    address,
+    c_age,
+    d_age
+FROM person
+    LATERAL VIEW EXPLODE(ARRAY()) tbl_name AS c_age;
+
+SELECT
+    id,
+    name,
+    age,
+    class,
+    address,
+    c_age
+FROM person
+    LATERAL VIEW OUTER EXPLODE(ARRAY()) tbl_name AS c_age;
+
+SELECT
+    person.id,
+    exploded_people.name,
+    exploded_people.age,
+    exploded_people.state
+FROM person
+    LATERAL VIEW INLINE(array_of_structs) exploded_people AS name, age, state