Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SparkSQL: Update terminator grammar for HAVING, WHERE, GROUP BY #3526

Merged
merged 7 commits into from Jul 1, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
53 changes: 51 additions & 2 deletions src/sqlfluff/dialects/dialect_sparksql.py
Expand Up @@ -265,6 +265,25 @@
Ref("SingleQuotedIdentifierSegment"),
Ref("BackQuotedIdentifierSegment"),
),
WhereClauseTerminatorGrammar=OneOf(
"LIMIT",
Sequence(
OneOf(
"CLUSTER",
"DISTRIBUTE",
"GROUP",
"ORDER",
"SORT",
),
"BY",
),
Sequence("ORDER", "BY"),
Sequence("DISTRIBUTE", "BY"),
"HAVING",
"QUALIFY",
"WINDOW",
"OVERLAPS",
),
)

sparksql_dialect.add(
Expand Down Expand Up @@ -540,6 +559,20 @@
type="literal",
),
),
GroupByClauseTerminatorGrammar=OneOf(
Sequence(
OneOf(
"ORDER",
"DISTRIBUTE",
"CLUSTER",
"SORT",
),
"BY",
),
"LIMIT",
"HAVING",
"WINDOW",
),
R7L208 marked this conversation as resolved.
Show resolved Hide resolved
)

# Adding Hint related grammar before comment `block_comment` and
Expand Down Expand Up @@ -1424,7 +1457,7 @@ class GroupByClauseSegment(ansi.GroupByClauseSegment):

match_grammar = StartsWith(
Sequence("GROUP", "BY"),
terminator=OneOf("ORDER", "LIMIT", "HAVING", "WINDOW"),
terminator=Ref("GroupByClauseTerminatorGrammar"),
enforce_whitespace_preceding_terminator=True,
)

Expand All @@ -1442,7 +1475,7 @@ class GroupByClauseSegment(ansi.GroupByClauseSegment):
Ref("CubeRollupClauseSegment"),
Ref("GroupingSetsClauseSegment"),
),
terminator=OneOf("ORDER", "LIMIT", "HAVING", "WINDOW"),
terminator=Ref("GroupByClauseTerminatorGrammar"),
),
# TODO: New Rule
# Warn if CubeRollupClauseSegment and
Expand Down Expand Up @@ -2762,3 +2795,19 @@ class RestoreTableStatementSegment(BaseSegment):
Ref("VersionAsOfGrammar"),
),
)


class HavingClauseSegment(ansi.HavingClauseSegment):
    """A `HAVING` clause.

    Extends the ANSI ``HAVING`` clause so that the Spark-specific
    ``CLUSTER BY``, ``DISTRIBUTE BY`` and ``SORT BY`` clauses also
    terminate it, allowing statements such as
    ``... HAVING COUNT(age) > 1 CLUSTER BY age`` to parse.
    """

    type = "having_clause"
    # Copy the ANSI match grammar rather than mutating the grammar object
    # shared with the base dialect.
    match_grammar = ansi.HavingClauseSegment.match_grammar.copy()
    # Insert the extra Spark terminators into the copied terminator grammar.
    # NOTE(review): assumes `.copy(insert=...)` appends alternatives without
    # replacing the existing ones — confirm against the grammar API.
    match_grammar.terminator = match_grammar.terminator.copy(  # type: ignore
        insert=[
            Sequence(
                OneOf("CLUSTER", "DISTRIBUTE", "SORT"),
                "BY",
            ),
        ],
    )
    # Internal parsing of the clause itself is unchanged from ANSI; only the
    # termination behaviour differs.
    parse_grammar = ansi.HavingClauseSegment.parse_grammar
10 changes: 10 additions & 0 deletions test/fixtures/dialects/sparksql/issue_3484.sql
@@ -0,0 +1,10 @@
-- https://github.com/sqlfluff/sqlfluff/issues/3484
WITH cte AS (
SELECT *
FROM source
WHERE col1 = 0
DISTRIBUTE BY col1
),

SELECT *
FROM cte
58 changes: 58 additions & 0 deletions test/fixtures/dialects/sparksql/issue_3484.yml
@@ -0,0 +1,58 @@
# YML test files are auto-generated from SQL files and should not be edited by
# hand. To help enforce this, the "hash" field in the file must match a hash
# computed by SQLFluff when running the tests. Please run
# `python test/generate_parse_fixture_yml.py` to generate them after adding or
# altering SQL files.
_hash: 7f337f5742ac96fa2fb84a92a52d4c994f73d350be13d6e645d59cbe945af2c4
file:
statement:
with_compound_statement:
keyword: WITH
common_table_expression:
identifier: cte
keyword: AS
bracketed:
start_bracket: (
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
identifier: source
where_clause:
keyword: WHERE
expression:
column_reference:
identifier: col1
comparison_operator:
raw_comparison_operator: '='
literal: '0'
distribute_by_clause:
- keyword: DISTRIBUTE
- keyword: BY
- column_reference:
identifier: col1
end_bracket: )
comma: ','
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
identifier: cte
22 changes: 22 additions & 0 deletions test/fixtures/dialects/sparksql/select_cluster_by.sql
Expand Up @@ -30,3 +30,25 @@ SELECT
FROM person
CLUSTER BY
LEFT(SUBSTRING_INDEX(name, ' ', -1), 1);

SELECT
age,
name
FROM person
WHERE age <= 100
CLUSTER BY age;

SELECT
age,
name
FROM person
GROUP BY age
CLUSTER BY age;

SELECT
age,
name
FROM person
GROUP BY age
HAVING COUNT(age) > 1
CLUSTER BY age;
108 changes: 107 additions & 1 deletion test/fixtures/dialects/sparksql/select_cluster_by.yml
Expand Up @@ -3,7 +3,7 @@
# computed by SQLFluff when running the tests. Please run
# `python test/generate_parse_fixture_yml.py` to generate them after adding or
# altering SQL files.
_hash: ff198ba535c84b801c569c815830e92738a537853f207ad77facffe3412aa277
_hash: 62fd488eda564c95d2b693d96671c83f6ebcd6f2966999ea091f135b985f96d2
file:
- statement:
select_statement:
Expand Down Expand Up @@ -129,3 +129,109 @@ file:
literal: '1'
- end_bracket: )
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
identifier: age
- comma: ','
- select_clause_element:
column_reference:
identifier: name
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
identifier: person
where_clause:
keyword: WHERE
expression:
column_reference:
identifier: age
comparison_operator:
- raw_comparison_operator: <
- raw_comparison_operator: '='
literal: '100'
cluster_by_clause:
- keyword: CLUSTER
- keyword: BY
- column_reference:
identifier: age
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
identifier: age
- comma: ','
- select_clause_element:
column_reference:
identifier: name
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
identifier: person
groupby_clause:
- keyword: GROUP
- keyword: BY
- column_reference:
identifier: age
cluster_by_clause:
- keyword: CLUSTER
- keyword: BY
- column_reference:
identifier: age
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
identifier: age
- comma: ','
- select_clause_element:
column_reference:
identifier: name
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
identifier: person
groupby_clause:
- keyword: GROUP
- keyword: BY
- column_reference:
identifier: age
having_clause:
keyword: HAVING
expression:
function:
function_name:
function_name_identifier: COUNT
bracketed:
start_bracket: (
expression:
column_reference:
identifier: age
end_bracket: )
comparison_operator:
raw_comparison_operator: '>'
literal: '1'
cluster_by_clause:
- keyword: CLUSTER
- keyword: BY
- column_reference:
identifier: age
- statement_terminator: ;
22 changes: 22 additions & 0 deletions test/fixtures/dialects/sparksql/select_distribute_by.sql
Expand Up @@ -28,3 +28,25 @@ SELECT
FROM person
DISTRIBUTE BY
LEFT(SUBSTRING_INDEX(name, ' ', -1), 1);

SELECT
age,
name
FROM person
WHERE age <= 100
DISTRIBUTE BY age;

SELECT
age,
name
FROM person
GROUP BY age
DISTRIBUTE BY age;

SELECT
age,
name
FROM person
GROUP BY age
HAVING COUNT(age) > 1
DISTRIBUTE BY age;