Skip to content

Commit

Permalink
Support parsing URIs (http/https only)
Browse files Browse the repository at this point in the history
  • Loading branch information
stsewd committed Jun 4, 2023
1 parent 115e236 commit 5b3c920
Show file tree
Hide file tree
Showing 8 changed files with 712 additions and 301 deletions.
Binary file modified docs/js/tree-sitter-comment.wasm
Binary file not shown.
62 changes: 52 additions & 10 deletions grammar.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,21 @@
// Line terminator: an optional carriage return followed by a line feed.
const NEWLINE = /\r?\n/;

// Characters that mark the end of a sentence. They are joined into a regex
// character class, so the order matters.
const END_CHARS = [
  ".", ",", ":", ";", "!", "?",
  "\\", "'", '"',
  "}", "]", ")", ">",
  // The hyphen has to stay in the final position; anywhere else it would be
  // read as a range operator inside the character class.
  "-",
];

const STOP_CHARS = [
"/",
"'",
Expand All @@ -25,14 +42,16 @@ const STOP_CHARS = [
module.exports = grammar({
name: "comment",

externals: ($) => [$.name, $.invalid_token],

extras: ($) => [$.__newline, /\s/],
externals: ($) => [
$.name,
$.invalid_token
],

rules: {
source: ($) => repeat(
choice(
$.tag,
$._full_uri,
alias($._text, "text"),
),
),
Expand All @@ -49,21 +68,44 @@ module.exports = grammar({
")",
),

// This token is split into two parts so the end character isn't included in the URI itself.
_full_uri: ($) => seq($.uri, choice(alias($._end_char, "text"), /\s/)),

// This token needs to be a single regex, otherwise a partial match will result in an error.
uri: ($) => get_uri_regex(),

// Text tokens can be a single character, or a sequence of characters that aren't stop characters.
_text: ($) => choice($._stop_char, notmatching(STOP_CHARS)),
_stop_char: ($) => choice(...STOP_CHARS),

// HACK: for some reason this needs to be assigned to a token,
// otherwise it isn't recognized as an extra.
__newline: ($) => NEWLINE,
_end_char: ($) => choice(...END_CHARS),
},
});

/**
 * Build the regular expression used to recognize a URI.
 *
 * Here an "end character" is one of END_CHARS, i.e. a character that marks
 * the end of a sentence. The pattern accepts text that:
 *
 * - begins with http:// or https://,
 * - continues with any number of units, each being either a single character
 *   that is neither whitespace nor an end character, or an end character
 *   immediately followed by one or more letters/digits (e.g. ".com"),
 * - finishes on a character that is neither whitespace nor an end character,
 *   so trailing punctuation is left out of the URI.
 *
 * @returns {RegExp} The URI pattern, without flags.
 */
function get_uri_regex() {
  // Escaped body of the character class listing every end character.
  const end_class = escapeRegExp(END_CHARS.join(""));
  // A single character that may appear anywhere in the URI.
  const plain_char = `[^\\s${end_class}]`;
  // An end character is only kept when letters or digits follow it.
  const end_then_alnum = `[${end_class}][a-zA-Z0-9]+`;
  return new RegExp(`https?://(${plain_char}|${end_then_alnum})*${plain_char}`);
}

/**
 * Build a regex matching one or more consecutive characters that are neither
 * whitespace nor one of the given characters.
 *
 * @param {string[]} chars - Characters to exclude. They are joined and
 *   escaped before being placed in a negated character class, so a "-" entry
 *   should come last to avoid being read as a range.
 * @returns {RegExp} A pattern of the form `[^\s<chars>]+`.
 */
function notmatching(chars) {
  // `\s` already covers carriage return and line feed, so they do not need
  // to be listed separately in the class.
  const excluded = escapeRegExp(chars.join(""));
  return new RegExp(`[^\\s${excluded}]+`);
}

/**
Expand Down
104 changes: 96 additions & 8 deletions src/grammar.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
"type": "SYMBOL",
"name": "tag"
},
{
"type": "SYMBOL",
"name": "_full_uri"
},
{
"type": "ALIAS",
"content": {
Expand Down Expand Up @@ -69,6 +73,37 @@
}
]
},
"_full_uri": {
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "uri"
},
{
"type": "CHOICE",
"members": [
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_end_char"
},
"named": false,
"value": "text"
},
{
"type": "PATTERN",
"value": "\\s"
}
]
}
]
},
"uri": {
"type": "PATTERN",
"value": "https?:\\/\\/([^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>-]|[\\.,:;!\\?\\\\'\"\\}\\]\\)>-][a-zA-Z0-9]+)*[^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>-]"
},
"_text": {
"type": "CHOICE",
"members": [
Expand All @@ -78,7 +113,7 @@
},
{
"type": "PATTERN",
"value": "[^\\r\\n\\s/'\"<\\(\\[\\{\\.,:;!\\?\\\\\\}\\]\\)>-]+"
"value": "[^\\s/'\"<\\(\\[\\{\\.,:;!\\?\\\\\\}\\]\\)>-]+"
}
]
},
Expand Down Expand Up @@ -163,16 +198,69 @@
}
]
},
"__newline": {
"type": "PATTERN",
"value": "\\r?\\n"
"_end_char": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "."
},
{
"type": "STRING",
"value": ","
},
{
"type": "STRING",
"value": ":"
},
{
"type": "STRING",
"value": ";"
},
{
"type": "STRING",
"value": "!"
},
{
"type": "STRING",
"value": "?"
},
{
"type": "STRING",
"value": "\\"
},
{
"type": "STRING",
"value": "'"
},
{
"type": "STRING",
"value": "\""
},
{
"type": "STRING",
"value": "}"
},
{
"type": "STRING",
"value": "]"
},
{
"type": "STRING",
"value": ")"
},
{
"type": "STRING",
"value": ">"
},
{
"type": "STRING",
"value": "-"
}
]
}
},
"extras": [
{
"type": "SYMBOL",
"name": "__newline"
},
{
"type": "PATTERN",
"value": "\\s"
Expand Down
8 changes: 8 additions & 0 deletions src/node-types.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
{
"type": "tag",
"named": true
},
{
"type": "uri",
"named": true
}
]
}
Expand Down Expand Up @@ -110,6 +114,10 @@
"type": "name",
"named": true
},
{
"type": "uri",
"named": true
},
{
"type": "user",
"named": true
Expand Down

0 comments on commit 5b3c920

Please sign in to comment.