Merged

Changes from all commits (24 commits)
f3bf8c9
add raw string index location to tokens
inferrinizzard Aug 4, 2022
5159b38
update editorconfig
inferrinizzard Aug 4, 2022
8960511
fix token insertion cases
inferrinizzard Aug 4, 2022
52c4d24
add line + col token position
inferrinizzard Aug 4, 2022
b426b5f
import LINEBREAK_REGEX from regexUtils
inferrinizzard Aug 4, 2022
46656c3
add basic token position test
inferrinizzard Aug 4, 2022
01166b9
fix line counter bug
inferrinizzard Aug 4, 2022
5b5cf10
Merge branch 'master' into lexer/tokenizer-location
inferrinizzard Aug 4, 2022
31b71b6
reorder LINEBREAK_REGEX priority
inferrinizzard Aug 4, 2022
9eb302b
update token attribute comments
inferrinizzard Aug 4, 2022
edada2d
rename index to start
inferrinizzard Aug 7, 2022
f7e1944
add end attribute to token position
inferrinizzard Aug 7, 2022
8b5e021
add helper function to update line and col numbers
inferrinizzard Aug 7, 2022
39cd207
move position updates to after matchedToken creation
inferrinizzard Aug 7, 2022
5ff4499
add basic multi-line position test
inferrinizzard Aug 7, 2022
c07c10c
make line and col 1-based
inferrinizzard Aug 7, 2022
ef8a95e
rm use_tabs
inferrinizzard Aug 7, 2022
b0c95e5
Merge branch 'master' into lexer/tokenizer-location
inferrinizzard Aug 9, 2022
2eaf905
remove line and col attributes
inferrinizzard Aug 9, 2022
ed20517
update EOF token
inferrinizzard Aug 10, 2022
1aa7f20
rm tokenPosition.test.ts
inferrinizzard Aug 10, 2022
1ba93b0
rm LINEBREAK_REGEX
inferrinizzard Aug 10, 2022
41fbe74
fix position for bigquery nested angle tokens
inferrinizzard Aug 10, 2022
1d737e5
use Infinity
inferrinizzard Aug 10, 2022
2 changes: 2 additions & 0 deletions src/languages/bigquery/bigquery.formatter.ts
@@ -203,6 +203,8 @@ function combineParameterizedTypes(tokens: Token[]) {
         type: TokenType.IDENTIFIER,
         raw: typeDefTokens.map(formatTypeDefToken('raw')).join(''),
         text: typeDefTokens.map(formatTypeDefToken('text')).join(''),
+        start: token.start,
+        end: token.end + typeDefTokens.map(t => t.text.length).reduce((a, b) => a + b),
       });
       i = endIndex;
     } else {
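For readers following the new `end` arithmetic above, a worked example may help. Everything below is hypothetical: it assumes the tokenizer splits `ARRAY<INT64>` into an `ARRAY` token followed by `<`, `INT64`, and `>`.

```ts
// Hypothetical tokens for the input "ARRAY<INT64>":
const token = { raw: 'ARRAY', start: 0, end: 5 }; // the base identifier
const typeDefTokens = [{ text: '<' }, { text: 'INT64' }, { text: '>' }];

// Same computation as the diff: sum the text lengths of the merged tokens.
const end = token.end + typeDefTokens.map(t => t.text.length).reduce((a, b) => a + b);
// end === 5 + (1 + 5 + 1) === 12, i.e. one past the closing '>' in the source
```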
2 changes: 1 addition & 1 deletion src/languages/spark/spark.formatter.ts
@@ -151,7 +151,7 @@ function postProcess(tokens: Token[]) {
     if (token.text === 'ITEMS' && token.type === TokenType.RESERVED_KEYWORD) {
       if (!(prevToken.text === 'COLLECTION' && nextToken.text === 'TERMINATED')) {
         // this is a word and not COLLECTION ITEMS
-        return { type: TokenType.IDENTIFIER, raw: token.raw, text: token.raw };
+        return { ...token, type: TokenType.IDENTIFIER, text: token.raw };
       }
     }
 
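The switch to a spread here is what carries the new position fields through post-processing. A minimal sketch with hypothetical values; string literals stand in for the `TokenType` enum:

```ts
// Hypothetical token for the word "items" at offset 10 in some query:
const token = { type: 'RESERVED_KEYWORD', raw: 'items', text: 'ITEMS', start: 10, end: 15 };

// Old form: built a fresh object, silently dropping start/end.
const before = { type: 'IDENTIFIER', raw: token.raw, text: token.raw };

// New form: copies every existing field, then overrides type and text.
const after = { ...token, type: 'IDENTIFIER', text: token.raw };
// after.start === 10 && after.end === 15, while before.start is undefined
```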
18 changes: 11 additions & 7 deletions src/lexer/TokenizerEngine.ts
@@ -10,10 +10,9 @@ export interface TokenRule {
 export default class TokenizerEngine {
   private rules: Partial<Record<TokenType, TokenRule>>;
 
-  // The input SQL string to process
-  private input = '';
-  // Current position in string
-  private index = 0;
+  private input = ''; // The input SQL string to process
+
+  private index = 0; // Current position in string
 
   constructor(rules: Partial<Record<TokenType, TokenRule>>) {
     this.rules = rules;
@@ -52,6 +51,7 @@ export default class TokenizerEngine {
 
   private skipWhitespace(): void {
     WHITESPACE_REGEX.lastIndex = this.index;
+
     const matches = WHITESPACE_REGEX.exec(this.input);
     if (matches) {
       // Advance current position by matched whitespace length
@@ -145,13 +145,17 @@ export default class TokenizerEngine {
     if (matches) {
       const matchedToken = matches[0];
 
-      // Advance current position by matched token length
-      this.index += matchedToken.length;
-      return {
+      const outToken = {
         type,
         raw: matchedToken,
         text: transform ? transform(matchedToken) : matchedToken,
+        start: this.index,
+        end: this.index + matchedToken.length,
       };
+
+      // Advance current position by matched token length
+      this.index += matchedToken.length;
+      return outToken;
     }
     return undefined;
   }
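The key detail in this hunk is ordering: `start` and `end` are captured before `this.index` advances. A self-contained sketch of the same pattern, not the actual class:

```ts
// Minimal sketch of the match step: record the span first, then advance.
function matchAt(input: string, index: number, regex: RegExp) {
  regex.lastIndex = index; // regex must use the sticky ('y') flag
  const matches = regex.exec(input);
  if (!matches) {
    return undefined;
  }
  const raw = matches[0];
  // start/end are derived from the pre-advance cursor, as in the diff above.
  return { raw, start: index, end: index + raw.length };
}

const token = matchAt('SELECT foo', 7, /[a-z_]+/y);
// token => { raw: 'foo', start: 7, end: 10 }
// Invariant: input.slice(token.start, token.end) === token.raw
```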
10 changes: 9 additions & 1 deletion src/lexer/token.ts
@@ -35,13 +35,21 @@ export interface Token {
   raw: string; // The raw original text that was matched
   text: string; // Cleaned up text e.g. keyword converted to uppercase and extra spaces removed
   key?: string;
+  start: number; // 0-based index of the token in the whole query string
+  end: number; // 0-based index of where the token ends in the query string
 }
 
 /**
  * For use as a "missing token"
  * e.g. in lookAhead and lookBehind to avoid dealing with null values
  */
-export const EOF_TOKEN = { type: TokenType.EOF, raw: '«EOF»', text: '«EOF»' };
+export const EOF_TOKEN: Token = {
+  type: TokenType.EOF,
+  raw: '«EOF»',
+  text: '«EOF»',
+  start: Infinity,
+  end: Infinity,
+};
 
 /** Checks if two tokens are equivalent */
 export const testToken =
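One consequence of the new fields, sketched with a hypothetical helper that is not part of the PR: a token's original source text can always be recovered by slicing, and the `Infinity` sentinel keeps that safe for `EOF_TOKEN`.

```ts
// Hypothetical helper: recover the original source text for a token,
// independent of any cleanup applied to its `text` field.
function sourceOf(query: string, token: { start: number; end: number }): string {
  return query.slice(token.start, token.end);
}

// For a real token, slice(start, end) returns exactly token.raw.
// For EOF_TOKEN, start and end are Infinity, and slicing past the end of a
// string yields '', so the sentinel never throws or returns stale text.
sourceOf('SELECT 1', { start: Infinity, end: Infinity }); // => ''
```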
56 changes: 56 additions & 0 deletions test/unit/Parser.test.ts
@@ -29,7 +29,9 @@ describe('Parser', () => {
 "children": Array [
 Object {
 "token": Object {
+"end": 3,
 "raw": "foo",
+"start": 0,
 "text": "foo",
 "type": "IDENTIFIER",
 },
@@ -43,7 +45,9 @@
 "children": Array [
 Object {
 "token": Object {
+"end": 8,
 "raw": "bar",
+"start": 5,
 "text": "bar",
 "type": "IDENTIFIER",
 },
@@ -66,15 +70,19 @@
 "children": Array [
 Object {
 "nameToken": Object {
+"end": 11,
 "raw": "SQRT",
+"start": 7,
 "text": "SQRT",
 "type": "RESERVED_FUNCTION_NAME",
 },
 "parenthesis": Object {
 "children": Array [
 Object {
 "token": Object {
+"end": 13,
 "raw": "2",
+"start": 12,
 "text": "2",
 "type": "NUMBER",
 },
@@ -89,7 +97,9 @@
 },
 ],
 "nameToken": Object {
+"end": 6,
 "raw": "SELECT",
+"start": 0,
 "text": "SELECT",
 "type": "RESERVED_COMMAND",
 },
@@ -112,23 +122,29 @@
 "children": Array [
 Object {
 "arrayToken": Object {
+"end": 15,
 "raw": "my_array",
+"start": 7,
 "text": "my_array",
 "type": "IDENTIFIER",
 },
 "parenthesis": Object {
 "children": Array [
 Object {
 "nameToken": Object {
+"end": 22,
 "raw": "OFFSET",
+"start": 16,
 "text": "OFFSET",
 "type": "RESERVED_FUNCTION_NAME",
 },
 "parenthesis": Object {
 "children": Array [
 Object {
 "token": Object {
+"end": 24,
 "raw": "5",
+"start": 23,
 "text": "5",
 "type": "NUMBER",
 },
@@ -150,7 +166,9 @@
 },
 ],
 "nameToken": Object {
+"end": 6,
 "raw": "SELECT",
+"start": 0,
 "text": "SELECT",
 "type": "RESERVED_COMMAND",
 },
@@ -175,15 +193,19 @@
 "children": Array [
 Object {
 "token": Object {
+"end": 18,
 "raw": "birth_year",
+"start": 8,
 "text": "birth_year",
 "type": "IDENTIFIER",
 },
 "type": "token",
 },
 Object {
 "token": Object {
+"end": 20,
 "raw": "-",
+"start": 19,
 "text": "-",
 "type": "OPERATOR",
 },
@@ -193,23 +215,29 @@
 "children": Array [
 Object {
 "token": Object {
+"end": 34,
 "raw": "CURRENT_DATE",
+"start": 22,
 "text": "CURRENT_DATE",
 "type": "IDENTIFIER",
 },
 "type": "token",
 },
 Object {
 "token": Object {
+"end": 36,
 "raw": "+",
+"start": 35,
 "text": "+",
 "type": "OPERATOR",
 },
 "type": "token",
 },
 Object {
 "token": Object {
+"end": 38,
 "raw": "1",
+"start": 37,
 "text": "1",
 "type": "NUMBER",
 },
@@ -227,7 +255,9 @@
 },
 ],
 "nameToken": Object {
+"end": 6,
 "raw": "SELECT",
+"start": 0,
 "text": "SELECT",
 "type": "RESERVED_COMMAND",
 },
@@ -250,38 +280,50 @@
 "children": Array [
 Object {
 "token": Object {
+"end": 9,
 "raw": "age",
+"start": 6,
 "text": "age",
 "type": "IDENTIFIER",
 },
 "type": "token",
 },
 Object {
 "andToken": Object {
+"end": 24,
 "raw": "and",
+"start": 21,
 "text": "AND",
 "type": "RESERVED_LOGICAL_OPERATOR",
 },
 "betweenToken": Object {
+"end": 17,
 "raw": "BETWEEN",
+"start": 10,
 "text": "BETWEEN",
 "type": "RESERVED_KEYWORD",
 },
 "expr1": Object {
+"end": 20,
 "raw": "10",
+"start": 18,
 "text": "10",
 "type": "NUMBER",
 },
 "expr2": Object {
+"end": 27,
 "raw": "15",
+"start": 25,
 "text": "15",
 "type": "NUMBER",
 },
 "type": "between_predicate",
 },
 ],
 "nameToken": Object {
+"end": 5,
 "raw": "WHERE",
+"start": 0,
 "text": "WHERE",
 "type": "RESERVED_COMMAND",
 },
@@ -304,15 +346,19 @@
 "count": Array [
 Object {
 "token": Object {
+"end": 8,
 "raw": "10",
+"start": 6,
 "text": "10",
 "type": "NUMBER",
 },
 "type": "token",
 },
 ],
 "limitToken": Object {
+"end": 5,
 "raw": "LIMIT",
+"start": 0,
 "text": "LIMIT",
 "type": "RESERVED_COMMAND",
 },
@@ -335,22 +381,28 @@
 "count": Array [
 Object {
 "token": Object {
+"end": 13,
 "raw": "10",
+"start": 11,
 "text": "10",
 "type": "NUMBER",
 },
 "type": "token",
 },
 ],
 "limitToken": Object {
+"end": 5,
 "raw": "LIMIT",
+"start": 0,
 "text": "LIMIT",
 "type": "RESERVED_COMMAND",
 },
 "offset": Array [
 Object {
 "token": Object {
+"end": 9,
 "raw": "200",
+"start": 6,
 "text": "200",
 "type": "NUMBER",
 },
@@ -379,7 +431,9 @@
 },
 ],
 "nameToken": Object {
+"end": 6,
 "raw": "SELECT",
+"start": 0,
 "text": "SELECT",
 "type": "RESERVED_COMMAND",
 },
@@ -405,7 +459,9 @@
 },
 ],
 "nameToken": Object {
+"end": 15,
 "raw": "SELECT DISTINCT",
+"start": 0,
 "text": "SELECT DISTINCT",
 "type": "RESERVED_COMMAND",
 },
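All 56 snapshot additions above encode a single property. Since the standalone tokenPosition.test.ts was removed in 1aa7f20, a hypothetical restatement of that property could look like the following, where `tokenize` stands in for whatever entry point exposes lexer output:

```ts
// Hypothetical property test; `tokenize` is an assumed entry point.
declare function tokenize(query: string): { raw: string; start: number; end: number }[];

test('every token span slices back to its raw source text', () => {
  const query = 'SELECT age FROM tbl WHERE age BETWEEN 10 AND 15;';
  for (const token of tokenize(query)) {
    expect(query.slice(token.start, token.end)).toBe(token.raw);
  }
});
```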