Add Databricks as a distinct dialect (#4438)
Co-authored-by: Alan Cruickshank <alan@designingoverload.com>
WittierDinosaur and alanmcruickshank committed Feb 27, 2023
1 parent b600627 commit 009725c
Showing 15 changed files with 102 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/add-issue-labels.yaml
@@ -10,5 +10,5 @@ jobs:
- uses: Naturalclar/issue-action@v2.0.2
with:
title-or-body: "title"
parameters: '[{"keywords": ["ansi"], "labels": ["ansi"]}, {"keywords": ["athena"], "labels": ["athena"]}, {"keywords": ["bigquery"], "labels": ["bigquery"]}, {"keywords": ["clickhouse"], "labels": ["clickhouse"]}, {"keywords": ["db2"], "labels": ["db2"]}, {"keywords": ["duckdb"], "labels": ["duckdb"]}, {"keywords": ["exasol"], "labels": ["exasol"]}, {"keywords": ["hive"], "labels": ["hive"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["oracle"], "labels": ["oracle"]}, {"keywords": ["postgres"], "labels": ["postgres"]}, {"keywords": ["redshift"], "labels": ["redshift"]}, {"keywords": ["snowflake"], "labels": ["snowflake"]}, {"keywords": ["soql"], "labels": ["soql"]}, {"keywords": ["sparksql"], "labels": ["sparksql"]}, {"keywords": ["sqlite"], "labels": ["sqlite"]}, {"keywords": ["t-sql", "tsql"], "labels": ["t-sql"]}, {"keywords": ["teradata"], "labels": ["teradata"]}]'
parameters: '[{"keywords": ["ansi"], "labels": ["ansi"]}, {"keywords": ["athena"], "labels": ["athena"]}, {"keywords": ["bigquery"], "labels": ["bigquery"]}, {"keywords": ["clickhouse"], "labels": ["clickhouse"]}, {"keywords": ["databricks"], "labels": ["databricks"]}, {"keywords": ["db2"], "labels": ["db2"]}, {"keywords": ["duckdb"], "labels": ["duckdb"]}, {"keywords": ["exasol"], "labels": ["exasol"]}, {"keywords": ["hive"], "labels": ["hive"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["oracle"], "labels": ["oracle"]}, {"keywords": ["postgres"], "labels": ["postgres"]}, {"keywords": ["redshift"], "labels": ["redshift"]}, {"keywords": ["snowflake"], "labels": ["snowflake"]}, {"keywords": ["soql"], "labels": ["soql"]}, {"keywords": ["sparksql"], "labels": ["sparksql"]}, {"keywords": ["sqlite"], "labels": ["sqlite"]}, {"keywords": ["t-sql", "tsql"], "labels": ["t-sql"]}, {"keywords": ["teradata"], "labels": ["teradata"]}]'
github-token: "${{ secrets.GITHUB_TOKEN }}"
12 changes: 1 addition & 11 deletions docs/source/dialects.rst
@@ -84,19 +84,9 @@ The dialect for `ClickHouse`_.
Databricks
----------

-The dialect `Databricks`_ is an alias for the :ref:`sparksql_dialect_ref`.
-
-Since Databricks `builds on top of`_ Apache Spark, the Spark SQL dialect
-holds most of the definitions of common commands and structures.
-
-Specifics to Databricks, such as Delta Live Table syntax, are added to the
-Spark SQL dialect to simplify implementation and prevent code duplication
-for minor syntax updates. This follows SQLFluff's philosophy of not being
-strict in adhering to dialect specifications to permit slightly wider set
-of functions than actually available in a given dialect.
+The dialect for `Databricks`_.

.. _`Databricks`: https://databricks.com/
-.. _`builds on top of` : https://www.databricks.com/spark/comparing-databricks-to-apache-spark

.. _db2_dialect_ref:

1 change: 1 addition & 0 deletions plugins/sqlfluff-templater-dbt/setup.cfg
@@ -43,6 +43,7 @@ keywords =
formatter
bigquery
clickhouse
+databricks
db2
duckdb
exasol
1 change: 1 addition & 0 deletions setup.cfg
@@ -45,6 +45,7 @@ keywords =
athena
bigquery
clickhouse
+databricks
db2
duckdb
exasol
2 changes: 1 addition & 1 deletion src/sqlfluff/core/dialects/__init__.py
@@ -24,7 +24,7 @@
"athena": ("dialect_athena", "athena_dialect"),
"bigquery": ("dialect_bigquery", "bigquery_dialect"),
"clickhouse": ("dialect_clickhouse", "clickhouse_dialect"),
"databricks": ("dialect_sparksql", "sparksql_dialect"),
"databricks": ("dialect_databricks", "databricks_dialect"),
"db2": ("dialect_db2", "db2_dialect"),
"duckdb": ("dialect_duckdb", "duckdb_dialect"),
"exasol": ("dialect_exasol", "exasol_dialect"),
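An aside, not part of this diff: the mapping above feeds SQLFluff's dialect loader, so "databricks" now resolves to its own module instead of aliasing sparksql. A minimal sketch to confirm this from Python, using the dialect_selector helper defined in the same __init__.py:

    from sqlfluff.core.dialects import dialect_selector

    # The tuple above names the module and the dialect object within it,
    # so this now loads dialect_databricks.databricks_dialect.
    databricks = dialect_selector("databricks")
    print(databricks.name)  # expected: "databricks" (previously "sparksql")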
20 changes: 20 additions & 0 deletions src/sqlfluff/dialects/dialect_databricks.py
@@ -0,0 +1,20 @@
"""The Databricks Dialect.
Functionally, it is quite similar to SparkSQL,
however it's much less strict on keywords.
It also has some extensions.
"""

from sqlfluff.core.dialects import load_raw_dialect

from sqlfluff.dialects.dialect_databricks_keywords import RESERVED_KEYWORDS

sparksql_dialect = load_raw_dialect("sparksql")
databricks_dialect = sparksql_dialect.copy_as("databricks")

databricks_dialect.sets("unreserved_keywords").update(
sparksql_dialect.sets("reserved_keywords")
)
databricks_dialect.sets("unreserved_keywords").difference_update(RESERVED_KEYWORDS)
databricks_dialect.sets("reserved_keywords").clear()
databricks_dialect.sets("reserved_keywords").update(RESERVED_KEYWORDS)
20 changes: 20 additions & 0 deletions src/sqlfluff/dialects/dialect_databricks_keywords.py
@@ -0,0 +1,20 @@
"""A list of databricks reserved keywords."""

RESERVED_KEYWORDS = [
"ANTI",
"CROSS",
"EXCEPT",
"FULL",
"INNER",
"INTERSECT",
"JOIN",
"LATERAL",
"LEFT",
"MINUS",
"NATURAL",
"ON",
"RIGHT",
"SEMI",
"UNION",
"USING",
]
17 changes: 13 additions & 4 deletions src/sqlfluff/rules/L026.py
@@ -39,9 +39,11 @@ class Rule_L026(BaseRule):
"""References cannot reference objects not present in ``FROM`` clause.
.. note::
-This rule is disabled by default for BigQuery, Hive, Redshift, SOQL, and SparkSQL
-due to the support of things like structs and lateral views which trigger false
-positives. It can be enabled with the ``force_enable = True`` flag.
+This rule is disabled by default for BigQuery, Databricks, Hive,
+Redshift, SOQL and SparkSQL due to the support of things like
+structs and lateral views which trigger false positives. It can be
+enabled with the ``force_enable = True`` flag.
**Anti-pattern**
@@ -68,7 +70,14 @@ class Rule_L026(BaseRule):
groups = ("all", "core")
config_keywords = ["force_enable"]
crawl_behaviour = SegmentSeekerCrawler(set(_START_TYPES))
_dialects_disabled_by_default = ["bigquery", "hive", "redshift", "soql", "sparksql"]
_dialects_disabled_by_default = [
"bigquery",
"databricks",
"hive",
"redshift",
"soql",
"sparksql",
]

def _eval(self, context: RuleContext) -> EvalResultType:
# Config type hints
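Since L026 is now off by default for Databricks, opting back in goes through force_enable. A sketch, not part of the diff, assuming the nested-dict form of FluffConfig (the same shape as the YAML test-case configs at the end of this commit):

    from sqlfluff.core import FluffConfig, Linter

    cfg = FluffConfig(
        configs={
            "core": {"dialect": "databricks", "rules": "L026"},
            "rules": {"L026": {"force_enable": True}},
        }
    )
    # "vee" never appears in the FROM clause, so L026 should fire again.
    linted = Linter(config=cfg).lint_string("SELECT vee.a FROM foo\n")
    print([v.rule_code() for v in linted.violations])  # expected: ["L026"]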
12 changes: 8 additions & 4 deletions src/sqlfluff/rules/L057.py
@@ -131,7 +131,7 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
identifier = identifier[:-1]
identifier = identifier.replace(".", "")

-# SparkSQL file references for direct file query
+# Databricks & SparkSQL file references for direct file query
# are quoted in back ticks to allow for identifiers common
# in file paths and regex patterns for path globbing
# https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html
@@ -140,8 +140,11 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
# https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html#path-global-filter
#

-if context.dialect.name in ["sparksql"] and context.parent_stack:
-    # SparkSQL file references for direct file query
+if (
+    context.dialect.name in ["databricks", "sparksql"]
+    and context.parent_stack
+):
+    # Databricks & SparkSQL file references for direct file query
# are quoted in back ticks to allow for identifiers common
# in file paths and regex patterns for path globbing
# https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html
@@ -152,7 +155,8 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
if context.parent_stack[-1].is_type("file_reference"):
return None

-# SparkSQL properties keys used for setting table and runtime
+# Databricks & SparkSQL properties keys
+# used for setting table and runtime
# configurations denote namespace using dots, so these are
# removed before testing L057 to not trigger false positives
# Runtime configurations:
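A hedged check, not part of the diff, of the file-reference carve-out now covering Databricks, using the direct-file-query path style from the Spark docs linked above:

    import sqlfluff

    # The back-ticked file reference contains "/" and ".", which would
    # otherwise trip L057's special-character check on identifiers.
    sql = "SELECT * FROM parquet.`examples/src/main/resources/users.parquet`;\n"
    print(sqlfluff.lint(sql, dialect="databricks", rules=["L057"]))
    # expected: [] since the file_reference parent exempts the identifier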
4 changes: 3 additions & 1 deletion src/sqlfluff/rules/L064.py
@@ -26,7 +26,8 @@ class Rule_L064(BaseRule):
cannot interchange single and double quotes
This rule is only enabled for dialects that allow single *and* double quotes for
-quoted literals (currently ``bigquery``, ``hive``, ``mysql``, ``sparksql``).
+quoted literals
+(currently ``bigquery``, ``databricks``, ``hive``, ``mysql``, ``sparksql``).
It can be enabled for other dialects with the ``force_enable = True`` flag.
**Anti-pattern**
@@ -65,6 +66,7 @@ class Rule_L064(BaseRule):
is_fix_compatible = True
_dialects_with_double_quoted_strings = [
"bigquery",
"databricks",
"hive",
"mysql",
"sparksql",
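A sketch of L064 now firing for Databricks; this assumes the rule's documented default of preferred_quoted_literal_style = consistent, under which the first literal sets the expected quote style:

    import sqlfluff

    # 'a' establishes single quotes, so the double-quoted "b" is flagged.
    violations = sqlfluff.lint(
        "SELECT 'a', \"b\";\n", dialect="databricks", rules=["L064"]
    )
    print([v["code"] for v in violations])  # expected: ["L064"]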
2 changes: 1 addition & 1 deletion src/sqlfluff/rules/capitalisation/CP02.py
@@ -82,7 +82,7 @@ def _eval(self, context: RuleContext) -> Optional[List[LintResult]]:
# Data Feed
# https://docs.delta.io/2.0.0/delta-change-data-feed.html#enable-change-data-feed
if (
context.dialect.name in ["sparksql"]
context.dialect.name in ["databricks", "sparksql"]
and context.parent_stack
and context.parent_stack[-1].type == "property_name_identifier"
and context.segment.raw == "enableChangeDataFeed"
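This mirrors the updated CP02 test case at the bottom of the diff; a quick hedged check, not part of the commit:

    import sqlfluff

    # The Delta property key is camelCase by specification, so CP02's
    # capitalisation policy for unquoted identifiers now skips it here too.
    sql = "SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true;\n"
    print(sqlfluff.lint(sql, dialect="databricks", rules=["CP02"]))
    # expected: [] (no capitalisation violations)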
2 changes: 2 additions & 0 deletions test/fixtures/dialects/databricks/.sqlfluff
@@ -0,0 +1,2 @@
[sqlfluff]
dialect = databricks
3 changes: 3 additions & 0 deletions test/fixtures/dialects/databricks/databricks_keywords.sql
@@ -0,0 +1,3 @@
select *
from shopify_cz.order
;
25 changes: 25 additions & 0 deletions test/fixtures/dialects/databricks/databricks_keywords.yml
@@ -0,0 +1,25 @@
# YML test files are auto-generated from SQL files and should not be edited by
# hand. To help enforce this, the "hash" field in the file must match a hash
# computed by SQLFluff when running the tests. Please run
# `python test/generate_parse_fixture_yml.py` to generate them after adding or
# altering SQL files.
_hash: bf986346fed8101687984158446015a57adda6a314601c4bd98977bd5a5c3a8b
file:
statement:
select_statement:
select_clause:
keyword: select
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: from
from_expression:
from_expression_element:
table_expression:
table_reference:
- naked_identifier: shopify_cz
- dot: .
- naked_identifier: order
statement_terminator: ;
4 changes: 2 additions & 2 deletions test/fixtures/rules/std_rule_cases/CP02.yml
@@ -248,8 +248,8 @@ test_pass_bigquery_safe_does_not_trigger:
core:
dialect: bigquery

-test_pass_sparksql_case_sensitive_property:
+test_pass_databricks_case_sensitive_property:
pass_str: SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true;
configs:
core:
-dialect: sparksql
+dialect: databricks
