diff --git a/README.md b/README.md index 85ec712..c348041 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,369 @@ If you know a particular flavor of regex and would like to contribute, feel free - The `invert` function can accept any regular expression, not just EZRegex expressions, if you want to use it independently of the rest of the library. +
+
+ Positionals + +#### These differentiate the *string* starting with a sequence, and a *line* starting with a sequence. Do note that the start of the string is also the start of a line. These can also be called without parameters to denote the start/end of a string/line without something specific having to be next to it. +- string_starts_with +- string_ends_with +- line_starts_with +- line_ends_with +- word_boundary + - Matches the boundary of a word, i.e. the empty space between a word character and not a word character, or the end of a string +- not_word_boundary + - The opposite of `word_boundary` + +
+ +
+ Literals + +#### +- tab +- space +- space_or_tab +- new_line +- carriage_return +- quote + - Matches ', ", and ` +- vertical_tab +- form_feed +- comma +- period +- underscore + +
+ +
+ Not Literals + +#### +- not_whitespace +- not_digit +- not_word + +
+ +
+ Categories + +#### +- whitespace +- whitechunk + - A "chunk" of whitespace. Just any amount of whitespace together +- digit +- number + - Matches multiple digits next to each other. Does not match negatives or decimals +- word +- word_char + - Matches just a single "word character", defined as any letter, number, or _ +- anything + - Matches any single character, except a newline. To also match a newline, use literally_anything +- chunk + - A "chunk": Any clump of characters up until the next newline +- uppercase +- lowercase +- letter + - Matches just a letter -- not numbers or _ like word_char +- hex_digit +- oct_digit +- punctuation +- controller + - Matches metadata ASCII characters +- printable + - Matches printable ASCII characters +- printable_and_space +- alpha_num +- unicode + - Matches a unicode character by name +- any_between(char: str, and_char: str) + - Match any char between `char` and `and_char`, using the ASCII table for reference + +
+ +
+ Amounts + +#### +- match_max(input: InputType) + - Match as many of `input` in the string as you can. This is equivalent to using the unary + operator. +If `input` is not provided, it works on the previous regex pattern. That's not recommended for +clarity's sake though +- match_num(num: int, input: InputType) + - Match `num` amount of `input` in the string +- more_than(min: int, input: InputType) + - Match more than `min` sequences of `input` in the string +- at_least(min: int, input: InputType) + - Match at least `min` sequences of `input` in the string +- at_most(max: int, input: InputType) + - Match at most `max` instances of `input` in the string +- between(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) + - Match between `min` and `max` sequences of `input` in the string. This also accepts `greedy` and `possessive` parameters +Max can be an empty string to indicate no maximum +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) + - Match at least one of `input` in the string. This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) + - Match 0 or more sequences of `input`. 
This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help + +
+ +
+ Choices + +#### +- optional(input: InputType, greedy: bool=True, possessive: bool=False) + - Match `input` if it's there. This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- either(input: InputType, or_input: InputType) +- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) + - Match any of the given `inputs`. Note that `inputs` can be multiple parameters, +or a single string. Can also accept parameters chars and split. If `chars` is set +to True, then `inputs` must only be a single string, it interprets `inputs` +as characters, and splits it up to find any of the chars in the string. If +split is set to true, it forces the ?(...) regex syntax instead of the [...] +syntax. It should act the same way, but your output regex will look different. +By default, it just optimizes it for you. +- any_char_except(*inputs: str) + - This matches any char that is NOT in `inputs`. `inputs` can be multiple parameters, or a single string of chars to split. +- any_except(input: InputType, type: InputType='.*') + - Matches anything other than `input`, which must be a single string or EZRegex chain, **not** a list. Also +optionally accepts the `type` parameter, which works like this: "Match any `type` other than `input`". For example, +"match any word which is not foo". Do note that this function is new, and I'm still working out the kinks. +- each(*inputs: InputType) + - Matches if the next part of the string can match all of the given inputs. Like the + operator, but out of order. + +
+ +
+ Conditionals + +#### These can only be used once in a given expression. They only match a given expression if the expression is/isn't followed/preceded by the given pattern +- if_proceded_by(input: InputType) + - Matches the pattern if it has `input` coming after it. Can only be used once in a given pattern, +as it only applies to the end +- if_not_proceded_by(input: InputType) + - Matches the pattern if it does **not** have `input` coming after it. Can only be used once in +a given pattern, as it only applies to the end +- if_preceded_by(input: InputType) + - Matches the pattern if it has `input` coming before it. Can only be used once in a given pattern, +as it only applies to the beginning +- if_not_preceded_by(input: InputType) + - Matches the pattern if it does **not** have `input` coming before it. Can only be used once +in a given pattern, as it only applies to the beginning +- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) + - Matches if the string has `open`, then `stuff`, then `close`, but only "matches" +stuff. Just a convenience combination of `if_proceded_by` and `if_preceded_by`. + +
+ +
+ Grouping + +#### +- group(input: InputType, name: str | None=None) + - Causes `input` to be captured as an unnamed group. Only useful when replacing regexs +- passive_group(input: InputType) + - As all regexs in EZRegex capture passively, this is entirely useless. But if you really want to, here it is +- earlier_group(num_or_name: int | str) + - Matches whatever the group referenced by `num_or_name` matched earlier. Must be *after* a +group which would match `num_or_name` +- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) + - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt` + +
+ +
+ Replacement + +#### In the interest of "I don't want to think about any syntax at all", I have included replace members. Do note that they are not interoperable with the other EZRegexs, and can only be used with other strings and each other. +- rgroup(num_or_name: str | int) + - Puts in its place the group specified, either by group number (for unnamed +groups) or group name (for named groups). Named groups are also counted by +number, I'm pretty sure. Groups are numbered starting from 1 +- replace_entire + - Puts in its place the entire match +- replace(string: str, rtn_str: bool=True) -> str | EZRegex + - Generates a valid regex replacement string, using Python f-string like syntax. + +Example: + ``` replace("named: {group}, numbered: {1}, entire: {0}") ``` + +Like Python f-strings, use {{ and }} to specify { and } + +Set the `rtn_str` parameter to True to have it return an EZRegex type instead of a string + +Note: Remember that index 0 is the entire match + +There are a few advantages to using this instead of just the regular regex replacement syntax: +- It's consistent between dialects +- It's closer to Python f-string syntax, which is cleaner and more familiar +- It handles numbered, named, and entire replacement types the same + +
+ +
+ Misc + +#### +- is_exactly(input: InputType) + - This matches the string if and only if the entire string is exactly equal to `input` +- literal(input: InputType) + - This is a redundant function. You should always be able to use `... + 'stuff'` just as easily as `... + literal('stuff')` +- raw(regex: str) + - If you already have some regular regex written, and you want to incorporate +it, this will allow you to include it without sanitizing all the backslashes +and such, which all the other EZRegexs do automatically + +
+ +
+ Premade + +#### These are some useful combinations that may be commonly used. They are not as stable, and may be changed and added to in later versions to make them more accurate +- literally_anything + - *Any* character, including newline +- signed + - a signed number, including 123, -123, and +123 +- unsigned + - Same as number. Will not match +123 +- plain_float + - Will match 123.45 and 123. +- full_float + - Will match plain_float as well as things like 1.23e-10 and 1.23e+10 +- int_or_float +- ow + - "Optional Whitechunk" + +
+ +
+ Flags + +#### These shadow python regex flags, and can just as easily be specified directly to the re library instead. They're provided here for compatibility with other regex dialects. See https://docs.python.org/3/library/re.html#flags for details +- perty +def ASCII(self) +- perty +def IGNORECASE(self) +- perty +def DOTALL(self) +- perty +def LOCALE(self) +- perty +def MULTILINE(self) +- perty +def UNICODE(self) +- __init__(self, definition: EZRegexDefinition, *, sanatize: bool=True, replacement: bool=False, flags: str='') -> None + - The workhorse of the EZRegex library. This represents a regex pattern that can be combined +with other EZRegexs and strings. Ideally, this should only be called internally, but it should +still work from the user's end +- _flag_func(self, final: str) -> str +- _escape(self, pattern: str) -> str + - This function was modified from the one in /usr/lib64/python3.12/re/__init__.py line 255 +- _sanitizeInput(self, i: InputType, addFlags: bool=False) -> str + - Instead of raising an error if passed a strange datatype, it now tries to cast it to a string +- _compile(self, addFlags=True) -> str +- _copy(self, definition: EZRegexDefinition=..., sanatize: bool=..., replacement: bool=..., flags: str=...) +- compile(self, addFlags=True) -> re.Pattern +- str(self) -> str +- debug(self) +- copy(self, addFlags=True) -> None +- test(self, testString: str=None, show=True, context=True) -> bool + - Tests the current regex expression to see if it's in @param testString. +Returns the match objects (None if there was no match) +- invert(self, amt=1, **kwargs) -> str +- inverse(self, amt=1, **kwargs) -> str + - "Inverts" the current Regex expression to give an example of a string it would match. +Useful for debugging purposes. 
+- group(self, name: str=None) +- named(self, name: str) +- perty +def unnamed(self) +- if_not_preceded_by(self, input: InputType) +- if_preceded_by(self, input: InputType) +- if_not_proceded_by(self, input: InputType) +- if_proceded_by(self, input: InputType) +- if_not_followed_by(self, input: InputType) +- if_followed_by(self, input: InputType) +- if_enclosed_with(self, open: str, closed: str | None=None) +- perty +def optional(self) +- perty +def repeat(self) +- perty +def exactly(self) +- at_least(self, min: int) +- more_than(self, min: int) +- amt(self, amt: int) +- at_most(self, max: int) +- between(self, min: int, max: int, greedy: bool=True, possessive: bool=False) +- at_least_one(self, greedy: bool=True, possessive: bool=False) +- at_least_none(self, greedy: bool=True, possessive: bool=False) +- or_(self, input: InputType) +- append(self, input: InputType) +- prepend(self, input: InputType) +- perty +def flags(self) -> str +- set_flags(self, to: str) +- add_flag(self, flag: str) +- remove_flag(self, flag: str) +- __call__(self, *args, **kwargs) -> EZRegex | str + - This should be called by the user to specify the specific parameters of this instance i.e. 
any_of('a', 'b') +- __str__(self, addFlags: bool=True) -> str +- __repr__(self) -> str +- __eq__(self, thing: InputType) -> bool +- __mul__(self, amt: int) +- __rmul__(self, amt: int) +- __imul__(self, amt: int) +- __add__(self, thing: InputType) +- __radd__(self, thing: InputType) +- __iadd__(self, thing: InputType) +- __and__(self, thing: InputType) +- __rand__(self, thing: InputType) +- __lshift__(self, thing: InputType) +- __rlshift__(self, thing: InputType) +- __ilshift__(self, thing: InputType) +- __rshift__(self, thing: InputType) +- __rrshift__(self, thing: InputType) +- __irshift__(self, thing: InputType) +- __invert__(self) -> str +- __pos__(self) + - TODO: Add documentation here +- __ror__(self, thing: InputType) + - TODO: Add documentation here +- __or__(self, thing: InputType) +- __xor__(self, thing: InputType) +- __rxor__(self, thing: InputType) +- __mod__(self, other: str) -> re.Match | None + - I would prefer __rmod__(), but it doesn't work on strings, since __mod__() is already specified for string formatting. +- __hash__(self) -> int +- __contains__(self, thing: str) -> bool +- __getitem__(self, args) +- __reversed__(self) -> str +- __rich__(self) -> str +- __pretty__(self) -> str +- __setattr__(self, name: str, value, ignore=False) +- __delattr__(self, *args) + +
+ +
perl
Positionals @@ -200,7 +563,7 @@ If you know a particular flavor of regex and would like to contribute, feel free
Literals -#### +#### - tab - space - space_or_tab @@ -219,7 +582,7 @@ If you know a particular flavor of regex and would like to contribute, feel free
Not Literals -#### +#### - not_whitespace - not_digit - not_word @@ -229,7 +592,7 @@ If you know a particular flavor of regex and would like to contribute, feel free
Catagories -#### +#### - whitespace - whitechunk - A "chunk" of whitespace. Just any amount of whitespace together @@ -258,7 +621,7 @@ If you know a particular flavor of regex and would like to contribute, feel free - alpha_num - unicode - Matches a unicode character by name -- any_between(char: str, and_char: str) +- any_between(char: str, and_char: str) - Match any char between `char` and `and_char`, using the ASCII table for reference
@@ -266,33 +629,33 @@ If you know a particular flavor of regex and would like to contribute, feel free
Amounts -#### -- match_max(input: InputType) +#### +- match_max(input: InputType) - Match as many of `input` in the string as you can. This is equivelent to using the unary + operator. If `input` is not provided, it works on the previous regex pattern. That's not recommended for clarity's sake though -- match_num(num: int, input: InputType) +- match_num(num: int, input: InputType) - Match `num` amount of `input` in the string -- match_more_than(min: int, input: InputType) +- more_than(min: int, input: InputType) - Match more than `min` sequences of `input` in the string -- match_at_least(min: int, input: InputType) +- at_least(min: int, input: InputType) - Match at least `min` sequences of `input` in the string -- match_at_most(max: int, input: InputType) +- at_most(max: int, input: InputType) - Match at most `max` instances of `input` in the string -- match_range(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) +- between(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) - Match between `min` and `max` sequences of `input` in the string. This also accepts `greedy` and `possessive` parameters Max can be an empty string to indicate no maximum `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) - Match at least one of `input` in the string. 
This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) - Match 0 or more sequences of `input`. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible @@ -304,15 +667,15 @@ see https://docs.python.org/3/library/re.html for more help
Choices -#### -- optional(input: InputType, greedy: bool=True, possessive: bool=False) +#### +- optional(input: InputType, greedy: bool=True, possessive: bool=False) - Match `input` if it's there. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- either(input: InputType, or_input: InputType) -- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) +- either(input: InputType, or_input: InputType) +- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) - Match any of the given `inputs`. Note that `inputs` can be multiple parameters, or a single string. Can also accept parameters chars and split. If char is set to True, then `inputs` must only be a single string, it interprets `inputs` @@ -320,13 +683,13 @@ as characters, and splits it up to find any of the chars in the string. If split is set to true, it forces the ?(...) regex syntax instead of the [...] syntax. It should act the same way, but your output regex will look different. By default, it just optimizes it for you. -- any_char_except(*inputs: str) +- any_char_except(*inputs: str) - This matches any char that is NOT in `inputs`. `inputs` can be multiple parameters, or a single string of chars to split. -- any_except(input: InputType, type: InputType='.*') +- any_except(input: InputType, type: InputType='.*') - Matches anything other than `input`, which must be a single string or EZRegex chain, **not** a list. Also optionally accepts the `type` parameter, which works like this: "Match any `type` other than `input`". For example, "match any word which is not foo". Do note that this function is new, and I'm still working out the kinks. 
-- each(*inputs: InputType) +- each(*inputs: InputType) - Matches if the next part of the string can match all of the given inputs. Like the + operator, but out of order.
@@ -335,19 +698,19 @@ optionally accepts the `type` parameter, which works like this: "Match any `type Conditionals #### These can only be used once in a given expression. They only match a given expression if the expression is/ins'tfollowed/preceeded by a the given pattern -- if_proceded_by(input: InputType) +- if_proceded_by(input: InputType) - Matches the pattern if it has `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_not_proceded_by(input: InputType) +- if_not_proceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_preceded_by(input: InputType) +- if_preceded_by(input: InputType) - Matches the pattern if it has `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_not_preceded_by(input: InputType) +- if_not_preceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) +- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) - Matches if the string has `open`, then `stuff`, then `close`, but only "matches" stuff. Just a convenience combination of ifProceededBy and ifPreceededBy. @@ -356,16 +719,16 @@ stuff. Just a convenience combination of ifProceededBy and ifPreceededBy.
Grouping -#### -- group(input: InputType, name: str | None=None) +#### +- group(input: InputType, name: str | None=None) - Causes `input` to be captured as an unnamed group. Only useful when replacing regexs -- passive_group(input: InputType) +- passive_group(input: InputType) - As all regexs in EZRegex capture passively, this is entirely useless. But if you really want to, here it is -- earlier_group(num_or_name: int | str) +- earlier_group(num_or_name: int | str) - Matches whatever the group referenced by `num_or_name` matched earlier. Must be *after* a group which would match `num_or_name` -- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) - - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt` +- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) + - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt`
@@ -373,7 +736,7 @@ group which would match `num_or_name` Replacement #### In the intrest of "I don't want to think about any syntax at all", I have included replace members. Do note that theyare not interoperable with the other EZRegexs, and can only be used with other strings and each other. -- rgroup(num_or_name: str | int) +- rgroup(num_or_name: str | int) - Puts in its place the group specified, either by group number (for unnamed groups) or group name (for named groups). Named groups are also counted by number, I'm pretty sure. Groups are numbered starting from 1 @@ -401,12 +764,12 @@ There's a few of advantages to using this instead of just the regular regex repl
Misc -#### -- is_exactly(input: InputType) +#### +- is_exactly(input: InputType) - This matches the string if and only if the entire string is exactly equal to `input` -- literal(input: InputType) +- literal(input: InputType) - This is a redundant function. You should always be able to use `... + 'stuff'` just as easily as `... + literal('stuff')` -- raw(regex: str) +- raw(regex: str) - If you already have some regular regex written, and you want to incorperate it, this will allow you to include it without sanatizing all the backslaches and such, which all the other EZRegexs do automatically @@ -430,12 +793,6 @@ and such, which all the other EZRegexs do automatically - int_or_float - ow - "Optional Whitechunk" -- email - - Matches an email -- version - - The *official* regex for matching version numbers from https://semver.org/. It includes 5 groups that can bematched/replaced: `major`, `minor`, `patch`, `prerelease`, and `buildmetadata` -- version_numbered - - Same as `version`, but it uses numbered groups for each version number instead of named groups
@@ -460,7 +817,7 @@ and such, which all the other EZRegexs do automatically
- javascript
+ python
Positionals #### These differentiate the *string* starting with a sequence, and a *line* starting with a sequence. Do note that the startof the string is also the start of a line. These can also be called without parameters to denote the start/end of astring/line without something specific having to be next to it. @@ -478,7 +835,7 @@ and such, which all the other EZRegexs do automatically
Literals -#### +#### - tab - space - space_or_tab @@ -497,7 +854,7 @@ and such, which all the other EZRegexs do automatically
Not Literals -#### +#### - not_whitespace - not_digit - not_word @@ -507,7 +864,7 @@ and such, which all the other EZRegexs do automatically
Catagories -#### +#### - whitespace - whitechunk - A "chunk" of whitespace. Just any amount of whitespace together @@ -536,7 +893,7 @@ and such, which all the other EZRegexs do automatically - alpha_num - unicode - Matches a unicode character by name -- any_between(char: str, and_char: str) +- any_between(char: str, and_char: str) - Match any char between `char` and `and_char`, using the ASCII table for reference
@@ -544,33 +901,33 @@ and such, which all the other EZRegexs do automatically
Amounts -#### -- match_max(input: InputType) +#### +- match_max(input: InputType) - Match as many of `input` in the string as you can. This is equivelent to using the unary + operator. If `input` is not provided, it works on the previous regex pattern. That's not recommended for clarity's sake though -- match_num(num: int, input: InputType) +- match_num(num: int, input: InputType) - Match `num` amount of `input` in the string -- match_more_than(min: int, input: InputType) +- more_than(min: int, input: InputType) - Match more than `min` sequences of `input` in the string -- match_at_least(min: int, input: InputType) +- at_least(min: int, input: InputType) - Match at least `min` sequences of `input` in the string -- match_at_most(max: int, input: InputType) +- at_most(max: int, input: InputType) - Match at most `max` instances of `input` in the string -- match_range(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) +- between(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) - Match between `min` and `max` sequences of `input` in the string. This also accepts `greedy` and `possessive` parameters Max can be an empty string to indicate no maximum `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) - Match at least one of `input` in the string. 
This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) - Match 0 or more sequences of `input`. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible @@ -582,15 +939,15 @@ see https://docs.python.org/3/library/re.html for more help
Choices -#### -- optional(input: InputType, greedy: bool=True, possessive: bool=False) +#### +- optional(input: InputType, greedy: bool=True, possessive: bool=False) - Match `input` if it's there. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- either(input: InputType, or_input: InputType) -- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) +- either(input: InputType, or_input: InputType) +- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) - Match any of the given `inputs`. Note that `inputs` can be multiple parameters, or a single string. Can also accept parameters chars and split. If char is set to True, then `inputs` must only be a single string, it interprets `inputs` @@ -598,13 +955,13 @@ as characters, and splits it up to find any of the chars in the string. If split is set to true, it forces the ?(...) regex syntax instead of the [...] syntax. It should act the same way, but your output regex will look different. By default, it just optimizes it for you. -- any_char_except(*inputs: str) +- any_char_except(*inputs: str) - This matches any char that is NOT in `inputs`. `inputs` can be multiple parameters, or a single string of chars to split. -- any_except(input: InputType, type: InputType='.*') +- any_except(input: InputType, type: InputType='.*') - Matches anything other than `input`, which must be a single string or EZRegex chain, **not** a list. Also optionally accepts the `type` parameter, which works like this: "Match any `type` other than `input`". For example, "match any word which is not foo". Do note that this function is new, and I'm still working out the kinks. 
-- each(*inputs: InputType) +- each(*inputs: InputType) - Matches if the next part of the string can match all of the given inputs. Like the + operator, but out of order.
@@ -613,19 +970,19 @@ optionally accepts the `type` parameter, which works like this: "Match any `type Conditionals #### These can only be used once in a given expression. They only match a given expression if the expression is/ins'tfollowed/preceeded by a the given pattern -- if_proceded_by(input: InputType) +- if_proceded_by(input: InputType) - Matches the pattern if it has `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_not_proceded_by(input: InputType) +- if_not_proceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_preceded_by(input: InputType) +- if_preceded_by(input: InputType) - Matches the pattern if it has `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_not_preceded_by(input: InputType) +- if_not_preceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) +- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) - Matches if the string has `open`, then `stuff`, then `close`, but only "matches" stuff. Just a convenience combination of ifProceededBy and ifPreceededBy. @@ -634,16 +991,16 @@ stuff. Just a convenience combination of ifProceededBy and ifPreceededBy.
Grouping -#### -- group(input: InputType, name: str | None=None) +#### +- group(input: InputType, name: str | None=None) - Causes `input` to be captured as an unnamed group. Only useful when replacing regexs -- passive_group(input: InputType) +- passive_group(input: InputType) - As all regexs in EZRegex capture passively, this is entirely useless. But if you really want to, here it is -- earlier_group(num_or_name: int | str) +- earlier_group(num_or_name: int | str) - Matches whatever the group referenced by `num_or_name` matched earlier. Must be *after* a group which would match `num_or_name` -- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) - - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt` +- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) + - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt`
@@ -651,7 +1008,7 @@ group which would match `num_or_name` Replacement #### In the intrest of "I don't want to think about any syntax at all", I have included replace members. Do note that theyare not interoperable with the other EZRegexs, and can only be used with other strings and each other. -- rgroup(num_or_name: str | int) +- rgroup(num_or_name: str | int) - Puts in its place the group specified, either by group number (for unnamed groups) or group name (for named groups). Named groups are also counted by number, I'm pretty sure. Groups are numbered starting from 1 @@ -679,12 +1036,12 @@ There's a few of advantages to using this instead of just the regular regex repl
Misc -#### -- is_exactly(input: InputType) +#### +- is_exactly(input: InputType) - This matches the string if and only if the entire string is exactly equal to `input` -- literal(input: InputType) +- literal(input: InputType) - This is a redundant function. You should always be able to use `... + 'stuff'` just as easily as `... + literal('stuff')` -- raw(regex: str) +- raw(regex: str) - If you already have some regular regex written, and you want to incorperate it, this will allow you to include it without sanatizing all the backslaches and such, which all the other EZRegexs do automatically @@ -708,12 +1065,6 @@ and such, which all the other EZRegexs do automatically - int_or_float - ow - "Optional Whitechunk" -- email - - Matches an email -- version - - The *official* regex for matching version numbers from https://semver.org/. It includes 5 groups that can bematched/replaced: `major`, `minor`, `patch`, `prerelease`, and `buildmetadata` -- version_numbered - - Same as `version`, but it uses numbered groups for each version number instead of named groups
@@ -733,12 +1084,21 @@ and such, which all the other EZRegexs do automatically - This is automatically inserted when using line_start and line_end, you shouldn't need to add it manually - UNICODE - Match using the full unicode standard, instead of just ASCII. Enabled by default, and therefore redundant. +- compile(self, add_flags: bool=True) +- search(self, string, pos: int=0, endpos: int=sys.maxsize) -> re.Match | None +- match(self, string, pos: int=0, endpos: int=sys.maxsize) -> re.Match | None +- fullmatch(self, string, pos: int=0, endpos: int=sys.maxsize) -> re.Match | None +- split(self, string, maxsplit: int=0) -> list +- findall(self, string, pos: int=0, endpos: int=sys.maxsize) -> list +- finditer(self, string, pos: int=0, endpos: int=sys.maxsize) -> Iterator[re.Match] +- sub(self, repl: Any | Callable[[re.Match], Any], string, count: int=0) +- subn(self, repl: Any | Callable[[re.Match], Any], string, count: int=0)
- python
+ javascript
Positionals #### These differentiate the *string* starting with a sequence, and a *line* starting with a sequence. Do note that the startof the string is also the start of a line. These can also be called without parameters to denote the start/end of astring/line without something specific having to be next to it. @@ -756,7 +1116,7 @@ and such, which all the other EZRegexs do automatically
Literals -#### +#### - tab - space - space_or_tab @@ -775,7 +1135,7 @@ and such, which all the other EZRegexs do automatically
Not Literals -#### +#### - not_whitespace - not_digit - not_word @@ -785,7 +1145,7 @@ and such, which all the other EZRegexs do automatically
Catagories -#### +#### - whitespace - whitechunk - A "chunk" of whitespace. Just any amount of whitespace together @@ -814,7 +1174,7 @@ and such, which all the other EZRegexs do automatically - alpha_num - unicode - Matches a unicode character by name -- any_between(char: str, and_char: str) +- any_between(char: str, and_char: str) - Match any char between `char` and `and_char`, using the ASCII table for reference
@@ -822,33 +1182,33 @@ and such, which all the other EZRegexs do automatically
Amounts -#### -- match_max(input: InputType) +#### +- match_max(input: InputType) - Match as many of `input` in the string as you can. This is equivelent to using the unary + operator. If `input` is not provided, it works on the previous regex pattern. That's not recommended for clarity's sake though -- match_num(num: int, input: InputType) +- match_num(num: int, input: InputType) - Match `num` amount of `input` in the string -- match_more_than(min: int, input: InputType) +- more_than(min: int, input: InputType) - Match more than `min` sequences of `input` in the string -- match_at_least(min: int, input: InputType) +- at_least(min: int, input: InputType) - Match at least `min` sequences of `input` in the string -- match_at_most(max: int, input: InputType) +- at_most(max: int, input: InputType) - Match at most `max` instances of `input` in the string -- match_range(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) +- between(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) - Match between `min` and `max` sequences of `input` in the string. This also accepts `greedy` and `possessive` parameters Max can be an empty string to indicate no maximum `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) - Match at least one of `input` in the string. 
This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) +- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) - Match 0 or more sequences of `input`. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible @@ -860,15 +1220,15 @@ see https://docs.python.org/3/library/re.html for more help
Choices -#### -- optional(input: InputType, greedy: bool=True, possessive: bool=False) +#### +- optional(input: InputType, greedy: bool=True, possessive: bool=False) - Match `input` if it's there. This also accepts `greedy` and `possessive` parameters `greedy` means it will try to match as many repititions as possible non-greedy will try to match as few repititions as possible `possessive` means it won't backtrack to try to find any repitions see https://docs.python.org/3/library/re.html for more help -- either(input: InputType, or_input: InputType) -- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) +- either(input: InputType, or_input: InputType) +- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) - Match any of the given `inputs`. Note that `inputs` can be multiple parameters, or a single string. Can also accept parameters chars and split. If char is set to True, then `inputs` must only be a single string, it interprets `inputs` @@ -876,13 +1236,13 @@ as characters, and splits it up to find any of the chars in the string. If split is set to true, it forces the ?(...) regex syntax instead of the [...] syntax. It should act the same way, but your output regex will look different. By default, it just optimizes it for you. -- any_char_except(*inputs: str) +- any_char_except(*inputs: str) - This matches any char that is NOT in `inputs`. `inputs` can be multiple parameters, or a single string of chars to split. -- any_except(input: InputType, type: InputType='.*') +- any_except(input: InputType, type: InputType='.*') - Matches anything other than `input`, which must be a single string or EZRegex chain, **not** a list. Also optionally accepts the `type` parameter, which works like this: "Match any `type` other than `input`". For example, "match any word which is not foo". Do note that this function is new, and I'm still working out the kinks. 
-- each(*inputs: InputType) +- each(*inputs: InputType) - Matches if the next part of the string can match all of the given inputs. Like the + operator, but out of order.
@@ -891,19 +1251,19 @@ optionally accepts the `type` parameter, which works like this: "Match any `type Conditionals #### These can only be used once in a given expression. They only match a given expression if the expression is/ins'tfollowed/preceeded by a the given pattern -- if_proceded_by(input: InputType) +- if_proceded_by(input: InputType) - Matches the pattern if it has `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_not_proceded_by(input: InputType) +- if_not_proceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming after it. Can only be used once in a given pattern, as it only applies to the end -- if_preceded_by(input: InputType) +- if_preceded_by(input: InputType) - Matches the pattern if it has `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_not_preceded_by(input: InputType) +- if_not_preceded_by(input: InputType) - Matches the pattern if it does **not** have `input` coming before it. Can only be used once in a given pattern, as it only applies to the beginning -- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) +- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) - Matches if the string has `open`, then `stuff`, then `close`, but only "matches" stuff. Just a convenience combination of ifProceededBy and ifPreceededBy. @@ -912,16 +1272,16 @@ stuff. Just a convenience combination of ifProceededBy and ifPreceededBy.
Grouping -#### -- group(input: InputType, name: str | None=None) +#### +- group(input: InputType, name: str | None=None) - Causes `input` to be captured as an unnamed group. Only useful when replacing regexs -- passive_group(input: InputType) +- passive_group(input: InputType) - As all regexs in EZRegex capture passively, this is entirely useless. But if you really want to, here it is -- earlier_group(num_or_name: int | str) +- earlier_group(num_or_name: int | str) - Matches whatever the group referenced by `num_or_name` matched earlier. Must be *after* a group which would match `num_or_name` -- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) - - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt` +- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) + - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt`
@@ -929,7 +1289,7 @@ group which would match `num_or_name` Replacement #### In the intrest of "I don't want to think about any syntax at all", I have included replace members. Do note that theyare not interoperable with the other EZRegexs, and can only be used with other strings and each other. -- rgroup(num_or_name: str | int) +- rgroup(num_or_name: str | int) - Puts in its place the group specified, either by group number (for unnamed groups) or group name (for named groups). Named groups are also counted by number, I'm pretty sure. Groups are numbered starting from 1 @@ -957,12 +1317,12 @@ There's a few of advantages to using this instead of just the regular regex repl
Misc -#### -- is_exactly(input: InputType) +#### +- is_exactly(input: InputType) - This matches the string if and only if the entire string is exactly equal to `input` -- literal(input: InputType) +- literal(input: InputType) - This is a redundant function. You should always be able to use `... + 'stuff'` just as easily as `... + literal('stuff')` -- raw(regex: str) +- raw(regex: str) - If you already have some regular regex written, and you want to incorperate it, this will allow you to include it without sanatizing all the backslaches and such, which all the other EZRegexs do automatically @@ -986,12 +1346,6 @@ and such, which all the other EZRegexs do automatically - int_or_float - ow - "Optional Whitechunk" -- email - - Matches an email -- version - - The *official* regex for matching version numbers from https://semver.org/. It includes 5 groups that can bematched/replaced: `major`, `minor`, `patch`, `prerelease`, and `buildmetadata` -- version_numbered - - Same as `version`, but it uses numbered groups for each version number instead of named groups
@@ -1009,6 +1363,265 @@ and such, which all the other EZRegexs do automatically - Try not to use this, and rely on unicode matching instead - MULTILINE - This is automatically inserted when using line_start and line_end, you shouldn't need to add it manually +- UNICODE + - Match using the full unicode standard, instead of just ASCII. Enabled by default, and therefore redundant. + +
+ +
+
+ R
+ Positionals + +#### These differentiate the *string* starting with a sequence, and a *line* starting with a sequence. Do note that the start of the string is also the start of a line. These can also be called without parameters to denote the start/end of a string/line without something specific having to be next to it. +- string_starts_with +- string_ends_with +- word_boundary + - Matches the boundary of a word, i.e. the empty space between a word character and not a word character, or the end of a string +- not_word_boundary + - The opposite of `wordBoundary` + +
+ +
+ Literals + +#### +- tab +- space +- space_or_tab +- new_line +- carriage_return +- quote + - Matches ', ", and ` +- vertical_tab +- form_feed +- comma +- period +- underscore + +
+ +
+ Not Literals + +#### +- not_newline + - Matches any single character except line break characters, like the dot, but is not affected by any options that make the dot match all characters including line breaks. + +
+ +
+ Categories + +#### +- whitespace +- whitechunk + - A "chunk" of whitespace. Just any amount of whitespace together +- digit +- number + - Matches multiple digits next to each other. Does not match negatives or decimals +- word +- word_char + - Matches just a single "word character", defined as any letter, number, or _ +- anything + - Matches any single character, except a newline. To also match a newline, use literally_anything +- chunk + - A "chunk": Any clump of characters up until the next newline +- uppercase +- lowercase +- letter + - Matches just a letter -- not numbers or _ like word_char +- hex_digit +- oct_digit +- punctuation +- controller + - Matches a metadata ASCII character +- printable + - Matches printable ASCII characters +- printable_and_space +- alpha_num +- unicode + - Matches a unicode character by name +- any_between(char: str, and_char: str) + - Match any char between `char` and `and_char`, using the ASCII table for reference + +
+ +
+ Amounts + +#### +- match_max(input: InputType) + - Match as many of `input` in the string as you can. This is equivalent to using the unary + operator. +If `input` is not provided, it works on the previous regex pattern. That's not recommended for +clarity's sake though +- match_num(num: int, input: InputType) + - Match `num` amount of `input` in the string +- more_than(min: int, input: InputType) + - Match more than `min` sequences of `input` in the string +- at_least(min: int, input: InputType) + - Match at least `min` sequences of `input` in the string +- at_most(max: int, input: InputType) + - Match at most `max` instances of `input` in the string +- between(min: int, max: int, input: InputType, greedy: bool=True, possessive: bool=False) + - Match between `min` and `max` sequences of `input` in the string. This also accepts `greedy` and `possessive` parameters +Max can be an empty string to indicate no maximum +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- at_least_one(input: InputType, greedy: bool=True, possessive: bool=False) + - Match at least one of `input` in the string. This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- at_least_none(input: InputType, greedy: bool=True, possessive: bool=False) + - Match 0 or more sequences of `input`. 
This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help + +
+ +
+ Choices + +#### +- optional(input: InputType, greedy: bool=True, possessive: bool=False) + - Match `input` if it's there. This also accepts `greedy` and `possessive` parameters +`greedy` means it will try to match as many repetitions as possible +non-greedy will try to match as few repetitions as possible +`possessive` means it won't backtrack to try to find any repetitions +see https://docs.python.org/3/library/re.html for more help +- either(input: InputType, or_input: InputType) +- any_of(*inputs: str, chars: bool | None=None, split: bool | None=None) + - Match any of the given `inputs`. Note that `inputs` can be multiple parameters, +or a single string. Can also accept parameters chars and split. If chars is set +to True, then `inputs` must only be a single string, it interprets `inputs` +as characters, and splits it up to find any of the chars in the string. If +split is set to true, it forces the ?(...) regex syntax instead of the [...] +syntax. It should act the same way, but your output regex will look different. +By default, it just optimizes it for you. +- any_char_except(*inputs: str) + - This matches any char that is NOT in `inputs`. `inputs` can be multiple parameters, or a single string of chars to split. +- any_except(input: InputType, type: InputType='.*') + - Matches anything other than `input`, which must be a single string or EZRegex chain, **not** a list. Also +optionally accepts the `type` parameter, which works like this: "Match any `type` other than `input`". For example, +"match any word which is not foo". Do note that this function is new, and I'm still working out the kinks. +- each(*inputs: InputType) + - Matches if the next part of the string can match all of the given inputs. Like the + operator, but out of order. + +
+ +
+ Conditionals + +#### These can only be used once in a given expression. They only match a given expression if the expression is/isn't followed/preceded by the given pattern +- if_proceded_by(input: InputType) + - Matches the pattern if it has `input` coming after it. Can only be used once in a given pattern, +as it only applies to the end +- if_not_proceded_by(input: InputType) + - Matches the pattern if it does **not** have `input` coming after it. Can only be used once in +a given pattern, as it only applies to the end +- if_preceded_by(input: InputType) + - Matches the pattern if it has `input` coming before it. Can only be used once in a given pattern, +as it only applies to the beginning +- if_not_preceded_by(input: InputType) + - Matches the pattern if it does **not** have `input` coming before it. Can only be used once +in a given pattern, as it only applies to the beginning +- if_enclosed_with(open: str, stuff: InputType, close: str | None=None) + - Matches if the string has `open`, then `stuff`, then `close`, but only "matches" +stuff. Just a convenience combination of ifProceededBy and ifPreceededBy. + +
+ +
+ Grouping + +#### +- group(input: InputType, name: str | None=None) + - Causes `input` to be captured as an unnamed group. Only useful when replacing regexs +- passive_group(input: InputType) + - As all regexs in EZRegex capture passively, this is entirely useless. But if you really want to, here it is +- earlier_group(num_or_name: int | str) + - Matches whatever the group referenced by `num_or_name` matched earlier. Must be *after* a +group which would match `num_or_name` +- if_exists(num_or_name: int | str, does: InputType, doesnt: InputType | None=None) + - Matches `does` if the group `num_or_name` exists, otherwise it matches `doesnt` + +
+ +
+ Replacement + +#### In the interest of "I don't want to think about any syntax at all", I have included replace members. Do note that they are not interoperable with the other EZRegexs, and can only be used with other strings and each other. +- rgroup(num_or_name: str | int) + - Puts in its place the group specified, either by group number (for unnamed +groups) or group name (for named groups). Named groups are also counted by +number, I'm pretty sure. Groups are numbered starting from 1 +- replace_entire + - Puts in its place the entire match +- replace(string: str, rtn_str: bool=True) -> str | EZRegex + - Generates a valid regex replacement string, using Python f-string like syntax. + +Example: + ``` replace("named: {group}, numbered: {1}, entire: {0}") ``` + +Like Python f-strings, use {{ and }} to specify { and } + +Set the `rtn_str` parameter to True to have it return an EZRegex type instead of a string + +Note: Remember that index 0 is the entire match + +There are a few advantages to using this instead of just the regular regex replacement syntax: +- It's consistent between dialects +- It's closer to Python f-string syntax, which is cleaner and more familiar +- It handles numbered, named, and entire replacement types the same + +
+ +
+ Misc + +#### +- is_exactly(input: InputType) + - This matches the string if and only if the entire string is exactly equal to `input` +- literal(input: InputType) + - This is a redundant function. You should always be able to use `... + 'stuff'` just as easily as `... + literal('stuff')` +- raw(regex: str) + - If you already have some regular regex written, and you want to incorporate +it, this will allow you to include it without sanitizing all the backslashes +and such, which all the other EZRegexs do automatically + +
+ +
+ Premade + +#### These are some useful combinations that may be commonly used. They are not as stable, and may be changed and added to in later versions to make them more accurate +- literally_anything + - *Any* character, including newlines +- signed + - a signed number, including 123, -123, and +123 +- unsigned + - Same as number. Will not match +123 +- plain_float + - Will match 123.45 and 123. +- full_float + - Will match plain_float as well as things like 1.23e-10 and 1.23e+10 +- int_or_float +- ow + - "Optional Whitechunk" + +
+ +
+ Flags + +#### These shadow Python regex flags, and can just as easily be specified directly to the re library instead. They're provided here for compatibility with other regex dialects. See https://docs.python.org/3/library/re.html#flags for details
diff --git a/ezregex/EZRegex.py b/ezregex/EZRegex.py index 00c7d3d..7489b66 100644 --- a/ezregex/EZRegex.py +++ b/ezregex/EZRegex.py @@ -39,6 +39,9 @@ def __init__(self, definition, *, sanatize=True, replacement=False, flags=''): def _flag_func(self, final:str) -> str: raise NotImplementedError('Subclasses need to implement _flag_func(final)') + def _final_func(self, s:str) -> str: + return s + def _escape(self, pattern:str): """ This function was modified from the one in /usr/lib64/python3.12/re/__init__.py line 255 """ _special_chars_map = {i: '\\' + chr(i) for i in self._escape_chars} @@ -81,6 +84,9 @@ def _compile(self, add_flags=True): if len(self._flags): regex = self._flag_func(regex) + + # This has to go in the add_flags scope so it only runs at the very end, like flags + regex = self._final_func(regex) return regex def _copy(self, definition=..., sanatize=..., replacement=..., flags=...): diff --git a/ezregex/R/REZRegex.py b/ezregex/R/REZRegex.py new file mode 100644 index 0000000..2894566 --- /dev/null +++ b/ezregex/R/REZRegex.py @@ -0,0 +1,13 @@ +from ..EZRegex import EZRegex + + +class REZRegex(EZRegex): + _escape_chars=b'()[]{}?*+-|^$\\.&~# ' + _end = '' + _beginning = '' + + def _flag_func(self, final): + return f'(?{self.flags}){final}' + + def _final_func(self, s:str) -> str: + return s.replace('\\', '\\\\') diff --git a/ezregex/R/__init__.py b/ezregex/R/__init__.py new file mode 100644 index 0000000..283a405 --- /dev/null +++ b/ezregex/R/__init__.py @@ -0,0 +1,5 @@ +""" Support for the Perl dialect of regular expressions""" +__version__ = '0.0.1' + +from .elements import * +from .REZRegex import REZRegex diff --git a/ezregex/R/elements.py b/ezregex/R/elements.py new file mode 100644 index 0000000..879aa68 --- /dev/null +++ b/ezregex/R/elements.py @@ -0,0 +1,27 @@ +# pyright: reportArgumentType = false +# pyright: reportUndefinedVariable = false +from ..base import load_base +from ..EZRegex import EZRegex +from .REZRegex import REZRegex + 
+globals().update(load_base(REZRegex, lambda num_or_name, cur=...: fr'{cur}\g{{{num_or_name}}}')) + +# I can't figure out how flags work in R, so I'm just ignoring them +del line_starts_with +del lineStartsWith +del line_start +del lineStart +del line_ends_with +del lineEndsWith +del line_end +del lineEnd + +del ASCII +del DOTALL +del IGNORECASE +del LOCALE +del MULTILINE +del UNICODE + +# Matches any single character except line break characters, like the dot, but is not affected by any options that make the dot match all characters including line breaks. +not_newline = REZRegex(r'\N') diff --git a/ezregex/R/elements.pyi b/ezregex/R/elements.pyi new file mode 100644 index 0000000..beb93ce --- /dev/null +++ b/ezregex/R/elements.pyi @@ -0,0 +1,23 @@ +from .REZRegex import REZRegex +from ..base.interface import * + +# I can't figure out how flags work in R, so I'm just ignoring them +del line_starts_with +del lineStartsWith +del line_start +del lineStart +del line_ends_with +del lineEndsWith +del line_end +del lineEnd + +del ASCII +del DOTALL +del IGNORECASE +del LOCALE +del MULTILINE +del UNICODE + +"Group: Not Literals" +not_newline: REZRegex = REZRegex(r'\N') +'Matches any single character except line break characters, like the dot, but is not affected by any options that make the dot match all characters including line breaks.' 
diff --git a/ezregex/_docs.py b/ezregex/_docs.py index 826480d..3a923ff 100644 --- a/ezregex/_docs.py +++ b/ezregex/_docs.py @@ -9,7 +9,8 @@ # ASSUMPTION: Groups are designated using the form "Group: \n" # ASSUMPTION: strings below variables act as the descriptions for those variables # ASSUMPTION: There aren't any extraneous variables or functions in the .pyi dialect files - +# TODO: This needs to add to groups instead of setting groups, so if we specify a group in a dialect, it doesn't reset +# the docs in that group to just the ones specified in the dialect, it also includes the base ones class DocGenerator(ast.NodeVisitor): """ This parses the .pyi file and gets all the relevant info out of it """ def __init__(self, node) -> None: diff --git a/tests/test_R.py b/tests/test_R.py new file mode 100644 index 0000000..50124e8 --- /dev/null +++ b/tests/test_R.py @@ -0,0 +1,8 @@ +import jstyleson +from ezregex.R import * +from ezregex import EZRegex, R + + +def test_R(): + print(word + group(digit + '45') + raw('\\w+')) + assert str(word + group(digit + '45') + raw('\\w+')) == r'\\w+(\\d45)\\w+'