Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Core: Add MultiStringParser to match a collection of strings #3510

Merged
merged 10 commits into from Jun 30, 2022
8 changes: 7 additions & 1 deletion src/sqlfluff/core/parser/__init__.py
Expand Up @@ -31,7 +31,12 @@
OptionallyBracketed,
Conditional,
)
from sqlfluff.core.parser.parsers import StringParser, NamedParser, RegexParser
from sqlfluff.core.parser.parsers import (
StringParser,
NamedParser,
RegexParser,
MultiStringParser,
)
from sqlfluff.core.parser.markers import PositionMarker
from sqlfluff.core.parser.lexer import Lexer, StringLexer, RegexLexer
from sqlfluff.core.parser.parser import Parser
Expand Down Expand Up @@ -65,6 +70,7 @@
"OptionallyBracketed",
"Conditional",
"StringParser",
"MultiStringParser",
"NamedParser",
"RegexParser",
"PositionMarker",
Expand Down
43 changes: 42 additions & 1 deletion src/sqlfluff/core/parser/parsers.py
Expand Up @@ -4,7 +4,7 @@
"""

import regex
from typing import Type, Optional, List, Tuple, Union
from typing import Collection, Type, Optional, List, Tuple, Union

from sqlfluff.core.parser.context import ParseContext
from sqlfluff.core.parser.matchable import Matchable
Expand Down Expand Up @@ -100,6 +100,47 @@ def match(
return MatchResult.from_unmatched(segments)


class MultiStringParser(StringParser):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there some benefit to inheriting from StringParser? I notice that:

  • In the constructor, we delete one of the parent class' fields (template)
  • None of the methods use the superclass

Basically, I'm wondering if this class doesn't need to inherit from StringParser.

Copy link
Contributor Author

@judahrand judahrand Jun 30, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are other methods which are used from StringParser, for example match. Though, you are right that the inheritance is odd. I reckon there should be a BaseParser perhaps which inherits from Matchable and implements the reusable bits of StringParser given that it looks to me like all other Parsers inherit from StringParser?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had a go at this and keen for feedback.

"""An object which matches and returns raw segments on a collection of strings."""

def __init__(
self,
templates: Collection[str],
raw_class: Type[RawSegment],
name: Optional[str] = None,
type: Optional[str] = None,
optional: bool = False,
**segment_kwargs,
):
self.templates = {template.upper() for template in templates}
super().__init__(
template="",
raw_class=raw_class,
name=name,
type=type,
optional=optional,
**segment_kwargs,
)
# Delete attribute which is replaced by `self.templates` for this `Parser``
del self.template

def simple(self, parse_context: "ParseContext") -> Optional[List[str]]:
"""Return simple options for this matcher.

Because string matchers are not case sensitive we can
just return the templates here.
"""
return list(self.templates)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this function called frequently? I notice that we're creating and returning a new list each time, which could be slow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compromise here was whether to cache the set which is used for matching or to cache the list which is expected to be returned by simple. We could cache both as class attributes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've implemented a self._simple which is set in __init__ which should avoid creating the list multiple times (this goes for StringParser too).


def _is_first_match(self, segment: BaseSegment):
"""Does the segment provided match according to the current rules."""
# Is the target a match and IS IT CODE.
# The latter stops us accidentally matching comments.
if segment.is_code and segment.raw.upper() in self.templates:
judahrand marked this conversation as resolved.
Show resolved Hide resolved
return True
return False


class NamedParser(StringParser):
"""An object which matches and returns raw segments based on names."""

Expand Down
13 changes: 7 additions & 6 deletions src/sqlfluff/dialects/dialect_ansi.py
Expand Up @@ -48,6 +48,7 @@
StringParser,
SymbolSegment,
WhitespaceSegment,
MultiStringParser,
)
from sqlfluff.core.parser.segments.base import BracketedSegment
from sqlfluff.dialects.dialect_ansi_keywords import (
Expand Down Expand Up @@ -273,8 +274,8 @@
),
# The following functions can be called without parentheses per ANSI specification
BareFunctionSegment=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("bare_functions")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("bare_functions"),
CodeSegment,
name="bare_function",
type="bare_function",
Expand Down Expand Up @@ -318,16 +319,16 @@
),
# Ansi Intervals
DatetimeUnitSegment=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("datetime_units")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("datetime_units"),
CodeSegment,
name="date_part",
type="date_part",
)
),
DatePartFunctionName=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("date_part_function_name")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("date_part_function_name"),
CodeSegment,
name="function_name_identifier",
type="function_name_identifier",
Expand Down
5 changes: 3 additions & 2 deletions src/sqlfluff/dialects/dialect_bigquery.py
Expand Up @@ -35,6 +35,7 @@
StringLexer,
StringParser,
SymbolSegment,
MultiStringParser,
)
from sqlfluff.core.parser.segments.base import BracketedSegment
from sqlfluff.dialects.dialect_bigquery_keywords import (
Expand Down Expand Up @@ -160,8 +161,8 @@
),
),
ExtendedDatetimeUnitSegment=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("extended_datetime_units")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("extended_datetime_units"),
CodeSegment,
name="date_part",
type="date_part",
Expand Down
9 changes: 5 additions & 4 deletions src/sqlfluff/dialects/dialect_exasol.py
Expand Up @@ -29,6 +29,7 @@
StringParser,
RegexParser,
NewlineSegment,
MultiStringParser,
)
from sqlfluff.core.dialects import load_raw_dialect
from sqlfluff.core.parser.segments.generator import SegmentGenerator
Expand Down Expand Up @@ -170,16 +171,16 @@
"escaped_identifier", SymbolSegment, type="identifier"
),
SessionParameterSegment=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("session_parameters")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("session_parameters"),
CodeSegment,
name="session_parameter",
type="session_parameter",
)
),
SystemParameterSegment=SegmentGenerator(
lambda dialect: RegexParser(
r"^(" + r"|".join(dialect.sets("system_parameters")) + r")$",
lambda dialect: MultiStringParser(
dialect.sets("system_parameters"),
CodeSegment,
name="system_parameter",
type="system_parameter",
Expand Down
28 changes: 28 additions & 0 deletions test/core/parser/parser_test.py
@@ -0,0 +1,28 @@
"""The Test file for Parsers (Matchable Classes)."""

from sqlfluff.core.parser import (
KeywordSegment,
MultiStringParser,
)
from sqlfluff.core.parser.context import RootParseContext


def test__parser__multistringparser__match(generate_test_segments):
"""Test the MultiStringParser matchable."""
parser = MultiStringParser(["foo", "bar"], KeywordSegment)
with RootParseContext(dialect=None) as ctx:
# Check directly
seg_list = generate_test_segments(["foo", "fo"])
# Matches when it should
assert parser.match(seg_list[:1], parse_context=ctx).matched_segments == (
KeywordSegment("foo", seg_list[0].pos_marker),
)
# Doesn't match when it shouldn't
assert parser.match(seg_list[1:], parse_context=ctx).matched_segments == tuple()


def test__parser__multistringparser__simple():
"""Test the MultiStringParser matchable."""
parser = MultiStringParser(["foo", "bar"], KeywordSegment)
with RootParseContext(dialect=None) as ctx:
assert parser.simple(ctx)