Add Databricks as a distinct dialect (#4438)
Co-authored-by: Alan Cruickshank <alan@designingoverload.com>
WittierDinosaur and alanmcruickshank committed Feb 27, 2023
1 parent b600627 commit 009725c
Showing 15 changed files with 102 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/add-issue-labels.yaml
@@ -10,5 +10,5 @@ jobs:
- uses: Naturalclar/issue-action@v2.0.2
with:
title-or-body: "title"
parameters: '[{"keywords": ["ansi"], "labels": ["ansi"]}, {"keywords": ["athena"], "labels": ["athena"]}, {"keywords": ["bigquery"], "labels": ["bigquery"]}, {"keywords": ["clickhouse"], "labels": ["clickhouse"]}, {"keywords": ["db2"], "labels": ["db2"]}, {"keywords": ["duckdb"], "labels": ["duckdb"]}, {"keywords": ["exasol"], "labels": ["exasol"]}, {"keywords": ["hive"], "labels": ["hive"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["oracle"], "labels": ["oracle"]}, {"keywords": ["postgres"], "labels": ["postgres"]}, {"keywords": ["redshift"], "labels": ["redshift"]}, {"keywords": ["snowflake"], "labels": ["snowflake"]}, {"keywords": ["soql"], "labels": ["soql"]}, {"keywords": ["sparksql"], "labels": ["sparksql"]}, {"keywords": ["sqlite"], "labels": ["sqlite"]}, {"keywords": ["t-sql", "tsql"], "labels": ["t-sql"]}, {"keywords": ["teradata"], "labels": ["teradata"]}]'
parameters: '[{"keywords": ["ansi"], "labels": ["ansi"]}, {"keywords": ["athena"], "labels": ["athena"]}, {"keywords": ["bigquery"], "labels": ["bigquery"]}, {"keywords": ["clickhouse"], "labels": ["clickhouse"]}, {"keywords": ["databricks"], "labels": ["databricks"]}, {"keywords": ["db2"], "labels": ["db2"]}, {"keywords": ["duckdb"], "labels": ["duckdb"]}, {"keywords": ["exasol"], "labels": ["exasol"]}, {"keywords": ["hive"], "labels": ["hive"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["mysql"], "labels": ["mysql"]}, {"keywords": ["oracle"], "labels": ["oracle"]}, {"keywords": ["postgres"], "labels": ["postgres"]}, {"keywords": ["redshift"], "labels": ["redshift"]}, {"keywords": ["snowflake"], "labels": ["snowflake"]}, {"keywords": ["soql"], "labels": ["soql"]}, {"keywords": ["sparksql"], "labels": ["sparksql"]}, {"keywords": ["sqlite"], "labels": ["sqlite"]}, {"keywords": ["t-sql", "tsql"], "labels": ["t-sql"]}, {"keywords": ["teradata"], "labels": ["teradata"]}]'
github-token: "${{ secrets.GITHUB_TOKEN }}"
12 changes: 1 addition & 11 deletions docs/source/dialects.rst
@@ -84,19 +84,9 @@ The dialect for `ClickHouse`_.
Databricks
----------

-The dialect `Databricks`_ is an alias for the :ref:`sparksql_dialect_ref`.
-
-Since Databricks `builds on top of`_ Apache Spark, the Spark SQL dialect
-holds most of the definitions of common commands and structures.
-
-Specifics to Databricks, such as Delta Live Table syntax, are added to the
-Spark SQL dialect to simplify implementation and prevent code duplication
-for minor syntax updates. This follows SQLFluff's philosophy of not being
-strict in adhering to dialect specifications to permit slightly wider set
-of functions than actually available in a given dialect.
+The dialect for `Databricks`_.

.. _`Databricks`: https://databricks.com/
-.. _`builds on top of` : https://www.databricks.com/spark/comparing-databricks-to-apache-spark

.. _db2_dialect_ref:

1 change: 1 addition & 0 deletions plugins/sqlfluff-templater-dbt/setup.cfg
@@ -43,6 +43,7 @@ keywords =
formatter
bigquery
clickhouse
+databricks
db2
duckdb
exasol
1 change: 1 addition & 0 deletions setup.cfg
@@ -45,6 +45,7 @@ keywords =
athena
bigquery
clickhouse
+databricks
db2
duckdb
exasol
2 changes: 1 addition & 1 deletion src/sqlfluff/core/dialects/__init__.py
@@ -24,7 +24,7 @@
"athena": ("dialect_athena", "athena_dialect"),
"bigquery": ("dialect_bigquery", "bigquery_dialect"),
"clickhouse": ("dialect_clickhouse", "clickhouse_dialect"),
"databricks": ("dialect_sparksql", "sparksql_dialect"),
"databricks": ("dialect_databricks", "databricks_dialect"),
"db2": ("dialect_db2", "db2_dialect"),
"duckdb": ("dialect_duckdb", "duckdb_dialect"),
"exasol": ("dialect_exasol", "exasol_dialect"),
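An aside, not part of this diff: the mapping above feeds SQLFluff's dialect loader, so "databricks" now resolves to its own module instead of aliasing sparksql. A minimal sketch to confirm this from Python, using the dialect_selector helper defined in the same __init__.py:

    from sqlfluff.core.dialects import dialect_selector

    # The tuple above names the module and the dialect object within it,
    # so this now loads dialect_databricks.databricks_dialect.
    databricks = dialect_selector("databricks")
    print(databricks.name)  # expected: "databricks" (previously "sparksql")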
20 changes: 20 additions & 0 deletions src/sqlfluff/dialects/dialect_databricks.py
@@ -0,0 +1,20 @@
"""The Databricks Dialect.
Functionally, it is quite similar to SparkSQL,
however it's much less strict on keywords.
It also has some extensions.
"""

from sqlfluff.core.dialects import load_raw_dialect

from sqlfluff.dialects.dialect_databricks_keywords import RESERVED_KEYWORDS

sparksql_dialect = load_raw_dialect("sparksql")
databricks_dialect = sparksql_dialect.copy_as("databricks")

databricks_dialect.sets("unreserved_keywords").update(
sparksql_dialect.sets("reserved_keywords")
)
databricks_dialect.sets("unreserved_keywords").difference_update(RESERVED_KEYWORDS)
databricks_dialect.sets("reserved_keywords").clear()
databricks_dialect.sets("reserved_keywords").update(RESERVED_KEYWORDS)
20 changes: 20 additions & 0 deletions src/sqlfluff/dialects/dialect_databricks_keywords.py
@@ -0,0 +1,20 @@
"""A list of databricks reserved keywords."""

RESERVED_KEYWORDS = [
"ANTI",
"CROSS",
"EXCEPT",
"FULL",
"INNER",
"INTERSECT",
"JOIN",
"LATERAL",
"LEFT",
"MINUS",
"NATURAL",
"ON",
"RIGHT",
"SEMI",
"UNION",
"USING",
]
17 changes: 13 additions & 4 deletions src/sqlfluff/rules/L026.py
@@ -39,9 +39,11 @@ class Rule_L026(BaseRule):
"""References cannot reference objects not present in ``FROM`` clause.
.. note::
-This rule is disabled by default for BigQuery, Hive, Redshift, SOQL, and SparkSQL
-due to the support of things like structs and lateral views which trigger false
-positives. It can be enabled with the ``force_enable = True`` flag.
+This rule is disabled by default for BigQuery, Databricks, Hive,
+Redshift, SOQL and SparkSQL due to the support of things like
+structs and lateral views which trigger false positives. It can be
+enabled with the ``force_enable = True`` flag.
**Anti-pattern**
@@ -68,7 +70,14 @@ class Rule_L026(BaseRule):
groups = ("all", "core")
config_keywords = ["force_enable"]
crawl_behaviour = SegmentSeekerCrawler(set(_START_TYPES))
_dialects_disabled_by_default = ["bigquery", "hive", "redshift", "soql", "sparksql"]
_dialects_disabled_by_default = [
"bigquery",
"databricks",
"hive",
"redshift",
"soql",
"sparksql",
]

def _eval(self, context: RuleContext) -> EvalResultType:
# Config type hints
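Since L026 is now off by default for Databricks, opting back in goes through force_enable. A sketch, not part of the diff, assuming the nested-dict form of FluffConfig (the same shape as the YAML test-case configs at the end of this commit):

    from sqlfluff.core import FluffConfig, Linter

    cfg = FluffConfig(
        configs={
            "core": {"dialect": "databricks", "rules": "L026"},
            "rules": {"L026": {"force_enable": True}},
        }
    )
    # "vee" never appears in the FROM clause, so L026 should fire again.
    linted = Linter(config=cfg).lint_string("SELECT vee.a FROM foo\n")
    print([v.rule_code() for v in linted.violations])  # expected: ["L026"]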
12 changes: 8 additions & 4 deletions src/sqlfluff/rules/L057.py
@@ -131,7 +131,7 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
identifier = identifier[:-1]
identifier = identifier.replace(".", "")

-# SparkSQL file references for direct file query
+# Databricks & SparkSQL file references for direct file query
# are quoted in back ticks to allow for identifiers common
# in file paths and regex patterns for path globbing
# https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html
@@ -140,8 +140,11 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
# https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html#path-global-filter
#

-if context.dialect.name in ["sparksql"] and context.parent_stack:
-    # SparkSQL file references for direct file query
+if (
+    context.dialect.name in ["databricks", "sparksql"]
+    and context.parent_stack
+):
+    # Databricks & SparkSQL file references for direct file query
# are quoted in back ticks to allow for identifiers common
# in file paths and regex patterns for path globbing
# https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-file.html
@@ -152,7 +155,8 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
if context.parent_stack[-1].is_type("file_reference"):
return None

-# SparkSQL properties keys used for setting table and runtime
+# Databricks & SparkSQL properties keys
+# used for setting table and runtime
# configurations denote namespace using dots, so these are
# removed before testing L057 to not trigger false positives
# Runtime configurations:
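A hedged check, not part of the diff, of the file-reference carve-out now covering Databricks, using the direct-file-query path style from the Spark docs linked above:

    import sqlfluff

    # The back-ticked file reference contains "/" and ".", which would
    # otherwise trip L057's special-character check on identifiers.
    sql = "SELECT * FROM parquet.`examples/src/main/resources/users.parquet`;\n"
    print(sqlfluff.lint(sql, dialect="databricks", rules=["L057"]))
    # expected: [] since the file_reference parent exempts the identifier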
4 changes: 3 additions & 1 deletion src/sqlfluff/rules/L064.py
@@ -26,7 +26,8 @@ class Rule_L064(BaseRule):
cannot interchange single and double quotes
This rule is only enabled for dialects that allow single *and* double quotes for
-quoted literals (currently ``bigquery``, ``hive``, ``mysql``, ``sparksql``).
+quoted literals
+(currently ``bigquery``, ``databricks``, ``hive``, ``mysql``, ``sparksql``).
It can be enabled for other dialects with the ``force_enable = True`` flag.
**Anti-pattern**
@@ -65,6 +66,7 @@ class Rule_L064(BaseRule):
is_fix_compatible = True
_dialects_with_double_quoted_strings = [
"bigquery",
"databricks",
"hive",
"mysql",
"sparksql",
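A sketch of L064 now firing for Databricks; this assumes the rule's documented default of preferred_quoted_literal_style = consistent, under which the first literal sets the expected quote style:

    import sqlfluff

    # 'a' establishes single quotes, so the double-quoted "b" is flagged.
    violations = sqlfluff.lint(
        "SELECT 'a', \"b\";\n", dialect="databricks", rules=["L064"]
    )
    print([v["code"] for v in violations])  # expected: ["L064"]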
2 changes: 1 addition & 1 deletion src/sqlfluff/rules/capitalisation/CP02.py
@@ -82,7 +82,7 @@ def _eval(self, context: RuleContext) -> Optional[List[LintResult]]:
# Data Feed
# https://docs.delta.io/2.0.0/delta-change-data-feed.html#enable-change-data-feed
if (
context.dialect.name in ["sparksql"]
context.dialect.name in ["databricks", "sparksql"]
and context.parent_stack
and context.parent_stack[-1].type == "property_name_identifier"
and context.segment.raw == "enableChangeDataFeed"
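This mirrors the updated CP02 test case at the bottom of the diff; a quick hedged check, not part of the commit:

    import sqlfluff

    # The Delta property key is camelCase by specification, so CP02's
    # capitalisation policy for unquoted identifiers now skips it here too.
    sql = "SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true;\n"
    print(sqlfluff.lint(sql, dialect="databricks", rules=["CP02"]))
    # expected: [] (no capitalisation violations)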
2 changes: 2 additions & 0 deletions test/fixtures/dialects/databricks/.sqlfluff
@@ -0,0 +1,2 @@
[sqlfluff]
dialect = databricks
3 changes: 3 additions & 0 deletions test/fixtures/dialects/databricks/databricks_keywords.sql
@@ -0,0 +1,3 @@
select *
from shopify_cz.order
;
25 changes: 25 additions & 0 deletions test/fixtures/dialects/databricks/databricks_keywords.yml
@@ -0,0 +1,25 @@
# YML test files are auto-generated from SQL files and should not be edited by
# hand. To help enforce this, the "hash" field in the file must match a hash
# computed by SQLFluff when running the tests. Please run
# `python test/generate_parse_fixture_yml.py` to generate them after adding or
# altering SQL files.
_hash: bf986346fed8101687984158446015a57adda6a314601c4bd98977bd5a5c3a8b
file:
statement:
select_statement:
select_clause:
keyword: select
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: from
from_expression:
from_expression_element:
table_expression:
table_reference:
- naked_identifier: shopify_cz
- dot: .
- naked_identifier: order
statement_terminator: ;
4 changes: 2 additions & 2 deletions test/fixtures/rules/std_rule_cases/CP02.yml
@@ -248,8 +248,8 @@ test_pass_bigquery_safe_does_not_trigger:
core:
dialect: bigquery

-test_pass_sparksql_case_sensitive_property:
+test_pass_databricks_case_sensitive_property:
pass_str: SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true;
configs:
core:
-dialect: sparksql
+dialect: databricks
