-
Notifications
You must be signed in to change notification settings - Fork 127
/
N3Lexer.js
522 lines (484 loc) · 20.7 KB
/
N3Lexer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
// **N3Lexer** tokenizes N3 documents.
import { Buffer } from 'buffer';
import queueMicrotask from 'queue-microtask';
import namespaces from './IRIs';
const { xsd } = namespaces;
// Pattern for a single escape inside an N3 string or IRI:
// group 1 = four hex digits after \u, group 2 = eight hex digits after \U,
// group 3 = whichever single character follows any other backslash
const escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\([^])/g;

// Lookup from the character after a backslash to the character it denotes
const escapeReplacements = {
  // quoting characters
  '\\': '\\',
  "'": "'",
  '"': '"',
  // control characters
  n: '\n',
  r: '\r',
  t: '\t',
  f: '\f',
  b: '\b',
  // punctuation that stands for itself when escaped
  _: '_',
  '~': '~',
  '.': '.',
  '-': '-',
  '!': '!',
  $: '$',
  '&': '&',
  '(': '(',
  ')': ')',
  '*': '*',
  '+': '+',
  ',': ',',
  ';': ';',
  '=': '=',
  '/': '/',
  '?': '?',
  '#': '#',
  '@': '@',
  '%': '%',
};

// Characters that must not appear in an IRI once its escapes have been resolved
const illegalIriChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;

// Names of the lexer's regular-expression properties that stay active in line mode;
// membership is tested with `in`, so only the keys matter
const lineModeRegExps = {
  _iri: true, _unescapedIri: true,
  _simpleQuotedString: true, _langcode: true,
  _blank: true,
  _newline: true, _comment: true, _whitespace: true, _endOfFile: true,
};

// A pattern that cannot match anything: it demands a "0" after the end of the input
const invalidRegExp = /$0^/;
// ## Constructor
// The lexer turns a string or a stream of chunks into tokens of the shape
// `{ type, value, prefix, line, start, end }`, delivered one at a time
// through a Node-style `callback(error, token)`; the last token has type `eof`.
export default class N3Lexer {
  // `options.lineMode`: when truthy, only the patterns listed in `lineModeRegExps` stay active.
  // `options.n3`: N3 mode is on unless this is strictly `false` (ignored in line mode).
  // `options.comments`: when truthy, `comment` tokens are emitted as well.
  constructor(options) {
    // ## Regular expressions
    // It's slightly faster to have these as properties than as in-scope variables
    this._iri = /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/; // IRI with escape sequences; needs sanity check after unescaping
    this._unescapedIri = /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/; // IRI without escape sequences; no unescaping
    this._simpleQuotedString = /^"([^"\\\r\n]*)"(?=[^"])/; // string without escape sequences
    this._simpleApostropheString = /^'([^'\\\r\n]*)'(?=[^'])/;
    this._langcode = /^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i;
    this._prefix = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:(?=[#\s<])/;
    this._prefixed = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}"'<>]))/;
    this._variable = /^\?(?:(?:[A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?=[.,;!\^\s#()\[\]\{\}"'<>])/;
    this._blank = /^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?:[ \t]+|(?=\.?[,;:\s#()\[\]\{\}"'<>]))/;
    this._number = /^[\-+]?(?:(\d+\.\d*|\.?\d+)[eE][\-+]?|\d*(\.)?)\d+(?=\.?[,;:\s#()\[\]\{\}"'<>])/;
    this._boolean = /^(?:true|false)(?=[.,;\s#()\[\]\{\}"'<>])/;
    this._keyword = /^@[a-z]+(?=[\s#<:])/i;
    this._sparqlKeyword = /^(?:PREFIX|BASE|GRAPH)(?=[\s#<])/i;
    this._shortPredicates = /^a(?=[\s#()\[\]\{\}"'<>])/;
    this._newline = /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/;
    this._comment = /#([^\n\r]*)/;
    this._whitespace = /^[ \t]+/;
    this._endOfFile = /^(?:#[^\n\r]*)?$/;

    options = options || {};

    // In line mode (N-Triples or N-Quads), only simple features may be parsed
    if (this._lineMode = !!options.lineMode) {
      this._n3Mode = false;
      // Don't tokenize special literals:
      // every pattern that is not whitelisted is swapped for `invalidRegExp`
      for (const key in this) {
        if (!(key in lineModeRegExps) && this[key] instanceof RegExp)
          this[key] = invalidRegExp;
      }
    }
    // When not in line mode, enable N3 functionality by default
    else {
      this._n3Mode = options.n3 !== false;
    }
    // Don't output comment tokens by default
    this._comments = !!options.comments;
    // Cache the last tested closing position of long literals
    this._literalClosingPos = 0;
  }

  // ## Private methods

  // ### `_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback
  // `callback(error, token)` receives every token, or a single syntax error.
  // `inputFinished` tells whether `this._input` is complete; if it is not,
  // an undecidable tail is kept in `this._input` until more input arrives.
  // Returns the input that is left (`null` once `eof` was emitted),
  // or `undefined` after a syntax error was reported.
  _tokenizeToEnd(callback, inputFinished) {
    // Continue parsing as far as possible; the loop will return eventually
    let input = this._input;
    // Reference length for token offsets; re-recorded after every skipped newline
    let currentLineLength = input.length;
    while (true) {
      // Count and skip whitespace lines
      let whiteSpaceMatch, comment;
      while (whiteSpaceMatch = this._newline.exec(input)) {
        // Try to find a comment
        if (this._comments && (comment = this._comment.exec(whiteSpaceMatch[0])))
          emitToken('comment', comment[1], '', this._line, whiteSpaceMatch[0].length);
        // Advance the input
        input = input.slice(whiteSpaceMatch[0].length);
        currentLineLength = input.length;
        this._line++;
      }
      // Skip whitespace on current line
      if (!whiteSpaceMatch && (whiteSpaceMatch = this._whitespace.exec(input)))
        input = input.slice(whiteSpaceMatch[0].length);

      // Stop for now if we're at the end
      if (this._endOfFile.test(input)) {
        // If the input is finished, emit EOF
        if (inputFinished) {
          // Try to find a final comment
          if (this._comments && (comment = this._comment.exec(input)))
            emitToken('comment', comment[1], '', this._line, input.length);
          input = null;
          emitToken('eof', '', '', this._line, 0);
        }
        return this._input = input;
      }

      // Look for specific token types based on the first character
      const line = this._line, firstChar = input[0];
      let type = '', value = '', prefix = '',
          match = null, matchLength = 0, inconclusive = false;
      switch (firstChar) {
      case '^':
        // We need at least 3 tokens lookahead to distinguish ^^<IRI> and ^^pre:fixed
        if (input.length < 3)
          break;
        // Try to match a type
        else if (input[1] === '^') {
          this._previousMarker = '^^';
          // Move to type IRI or prefixed name
          input = input.slice(2);
          if (input[0] !== '<') {
            inconclusive = true;
            break;
          }
        }
        // If no type, it must be a path expression
        else {
          if (this._n3Mode) {
            matchLength = 1;
            type = '^';
          }
          break;
        }
        // Fall through in case the type is an IRI
      case '<':
        // Try to find a full IRI without escape sequences
        if (match = this._unescapedIri.exec(input))
          type = 'IRI', value = match[1];
        // Try to find a full IRI with escape sequences
        else if (match = this._iri.exec(input)) {
          value = this._unescape(match[1]);
          if (value === null || illegalIriChars.test(value))
            return reportSyntaxError(this);
          type = 'IRI';
        }
        // Try to find a nested triple
        else if (input.length > 1 && input[1] === '<')
          type = '<<', matchLength = 2;
        // Try to find a backwards implication arrow
        else if (this._n3Mode && input.length > 1 && input[1] === '=')
          type = 'inverse', matchLength = 2, value = '>';
        break;

      case '>':
        if (input.length > 1 && input[1] === '>')
          type = '>>', matchLength = 2;
        break;

      case '_':
        // Try to find a blank node. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a blank node.
        // Therefore, try inserting a space if we're at the end of the input.
        if ((match = this._blank.exec(input)) ||
            inputFinished && (match = this._blank.exec(`${input} `)))
          type = 'blank', prefix = '_', value = match[1];
        break;

      case '"':
        // Try to find a literal without escape sequences
        if (match = this._simpleQuotedString.exec(input))
          value = match[1];
        // Try to find a literal wrapped in three pairs of quotes
        else {
          ({ value, matchLength } = this._parseLiteral(input));
          if (value === null)
            return reportSyntaxError(this);
        }
        if (match !== null || matchLength !== 0) {
          type = 'literal';
          this._literalClosingPos = 0;
        }
        break;

      case "'":
        if (!this._lineMode) {
          // Try to find a literal without escape sequences
          if (match = this._simpleApostropheString.exec(input))
            value = match[1];
          // Try to find a literal wrapped in three pairs of quotes
          else {
            ({ value, matchLength } = this._parseLiteral(input));
            if (value === null)
              return reportSyntaxError(this);
          }
          if (match !== null || matchLength !== 0) {
            type = 'literal';
            this._literalClosingPos = 0;
          }
        }
        break;

      case '?':
        // Try to find a variable
        if (this._n3Mode && (match = this._variable.exec(input)))
          type = 'var', value = match[0];
        break;

      case '@':
        // Try to find a language code
        if (this._previousMarker === 'literal' && (match = this._langcode.exec(input)))
          type = 'langcode', value = match[1];
        // Try to find a keyword
        else if (match = this._keyword.exec(input))
          type = match[0];
        break;

      case '.':
        // Try to find a dot as punctuation
        if (input.length === 1 ? inputFinished : (input[1] < '0' || input[1] > '9')) {
          type = '.';
          matchLength = 1;
          break;
        }
        // Fall through to numerical case (could be a decimal dot)

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
      case '+':
      case '-':
        // Try to find a number. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a number.
        // Therefore, try inserting a space if we're at the end of the input.
        // (Each assignment is parenthesized so `match` only ever holds a match or null.)
        if ((match = this._number.exec(input)) ||
            inputFinished && (match = this._number.exec(`${input} `))) {
          type = 'literal', value = match[0];
          // Capture group 1 marks an exponent form, group 2 a decimal dot
          prefix = (typeof match[1] === 'string' ? xsd.double :
            (typeof match[2] === 'string' ? xsd.decimal : xsd.integer));
        }
        break;

      case 'B':
      case 'b':
      case 'p':
      case 'P':
      case 'G':
      case 'g':
        // Try to find a SPARQL-style keyword
        if (match = this._sparqlKeyword.exec(input))
          type = match[0].toUpperCase();
        else
          inconclusive = true;
        break;

      case 'f':
      case 't':
        // Try to match a boolean
        if (match = this._boolean.exec(input))
          type = 'literal', value = match[0], prefix = xsd.boolean;
        else
          inconclusive = true;
        break;

      case 'a':
        // Try to find an abbreviated predicate
        if (match = this._shortPredicates.exec(input))
          type = 'abbreviation', value = 'a';
        else
          inconclusive = true;
        break;

      case '=':
        // Try to find an implication arrow or equals sign
        if (this._n3Mode && input.length > 1) {
          type = 'abbreviation';
          if (input[1] !== '>')
            matchLength = 1, value = '=';
          else
            matchLength = 2, value = '>';
        }
        break;

      case '!':
        if (!this._n3Mode)
          break;
        // Fall through: in N3 mode, `!` is single-character punctuation
      case ',':
      case ';':
      case '[':
      case ']':
      case '(':
      case ')':
      case '}':
        if (!this._lineMode) {
          matchLength = 1;
          type = firstChar;
        }
        break;

      case '{':
        // We need at least 2 tokens lookahead to distinguish "{|" and "{ "
        if (!this._lineMode && input.length >= 2) {
          // Try to find a quoted triple annotation start
          if (input[1] === '|')
            type = '{|', matchLength = 2;
          else
            type = firstChar, matchLength = 1;
        }
        break;

      case '|':
        // We need 2 tokens lookahead to parse "|}"
        // Try to find a quoted triple annotation end
        if (input.length >= 2 && input[1] === '}')
          type = '|}', matchLength = 2;
        break;

      default:
        inconclusive = true;
      }

      // Some first characters do not allow an immediate decision, so inspect more
      if (inconclusive) {
        // Try to find a prefix
        if ((this._previousMarker === '@prefix' || this._previousMarker === 'PREFIX') &&
            (match = this._prefix.exec(input)))
          type = 'prefix', value = match[1] || '';
        // Try to find a prefixed name. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a prefixed name.
        // Therefore, try inserting a space if we're at the end of the input.
        else if ((match = this._prefixed.exec(input)) ||
                 inputFinished && (match = this._prefixed.exec(`${input} `)))
          type = 'prefixed', prefix = match[1] || '', value = this._unescape(match[2]);
      }

      // A type token is special: it can only be emitted after an IRI or prefixed name is read
      if (this._previousMarker === '^^') {
        switch (type) {
        case 'prefixed': type = 'type';    break;
        case 'IRI':      type = 'typeIRI'; break;
        default:         type = '';
        }
      }

      // What if nothing of the above was found?
      if (!type) {
        // We could be in streaming mode, and then we just wait for more input to arrive.
        // Otherwise, a syntax error has occurred in the input.
        // One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
        if (inputFinished || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
          return reportSyntaxError(this);
        else
          return this._input = input;
      }

      // Emit the parsed token
      const length = matchLength || match[0].length;
      const token = emitToken(type, value, prefix, line, length);
      this.previousToken = token;
      this._previousMarker = type;
      // Advance to next part to tokenize
      input = input.slice(length);
    }

    // Emits the token through the callback;
    // `start` and `end` are offsets relative to `currentLineLength`
    function emitToken(type, value, prefix, line, length) {
      const start = input ? currentLineLength - input.length : currentLineLength;
      const end = start + length;
      const token = { type, value, prefix, line, start, end };
      callback(null, token);
      return token;
    }
    // Signals the syntax error through the callback,
    // quoting the run of non-whitespace characters at the current position
    function reportSyntaxError(self) { callback(self._syntaxError(/^\S*/.exec(input)[0])); }
  }

  // ### `_unescape` replaces N3 escape codes by their corresponding characters
  // Returns the unescaped string, or `null` if `item` contains a backslash sequence
  // that is neither a `\u`/`\U` escape nor listed in `escapeReplacements`,
  // or a `\U` escape whose value lies beyond U+10FFFF.
  _unescape(item) {
    let invalid = false;
    const replaced = item.replace(escapeSequence, (sequence, unicode4, unicode8, escapedChar) => {
      // 4-digit unicode character
      if (typeof unicode4 === 'string')
        return String.fromCharCode(Number.parseInt(unicode4, 16));
      // 8-digit unicode character
      if (typeof unicode8 === 'string') {
        const codePoint = Number.parseInt(unicode8, 16);
        // Eight hex digits can encode values that are not Unicode code points;
        // those cannot be represented, so the whole item is rejected
        if (codePoint > 0x10FFFF) {
          invalid = true;
          return '';
        }
        // Yields one code unit up to U+FFFF and a surrogate pair above it
        return String.fromCodePoint(codePoint);
      }
      // fixed escape sequence
      if (escapedChar in escapeReplacements)
        return escapeReplacements[escapedChar];
      // invalid escape sequence
      invalid = true;
      return '';
    });
    return invalid ? null : replaced;
  }

  // ### `_parseLiteral` parses a literal into an unescaped value
  // `input` must start at the opening quote(s).
  // Returns `{ value, matchLength }`; `matchLength` is 0 when no closing quotes
  // are found (yet), and `value` is `null` when unescaping fails.
  // Advances `this._line` by the number of line breaks inside the literal.
  _parseLiteral(input) {
    // Ensure we have enough lookahead to identify triple-quoted strings
    if (input.length >= 3) {
      // Identify the opening quote(s)
      const opening = input.match(/^(?:"""|"|'''|'|)/)[0];
      const openingLength = opening.length;

      // Find the next candidate closing quotes,
      // resuming where an earlier attempt on a shorter input gave up
      let closingPos = Math.max(this._literalClosingPos, openingLength);
      while ((closingPos = input.indexOf(opening, closingPos)) > 0) {
        // Count backslashes right before the closing quotes
        let backslashCount = 0;
        while (input[closingPos - backslashCount - 1] === '\\')
          backslashCount++;

        // An even number of backslashes (in particular 0)
        // means these are actual, non-escaped closing quotes
        if (backslashCount % 2 === 0) {
          // Extract and unescape the value
          const raw = input.substring(openingLength, closingPos);
          const lines = raw.split(/\r\n|\r|\n/).length - 1;
          const matchLength = closingPos + openingLength;
          // Only triple-quoted strings can be multi-line
          if (openingLength === 1 && lines !== 0 ||
              openingLength === 3 && this._lineMode)
            break;
          this._line += lines;
          return { value: this._unescape(raw), matchLength };
        }
        closingPos++;
      }
      this._literalClosingPos = input.length - openingLength + 1;
    }
    return { value: '', matchLength: 0 };
  }

  // ### `_syntaxError` creates a syntax error for the given issue
  // Also sets `this._input` to `null`, which makes the stream handlers ignore further data.
  _syntaxError(issue) {
    this._input = null;
    const err = new Error(`Unexpected "${issue}" on line ${this._line}.`);
    err.context = {
      token: undefined,
      line: this._line,
      previousToken: this.previousToken,
    };
    return err;
  }

  // ### Strips off any starting UTF BOM mark.
  _readStartingBom(input) {
    return input.startsWith('\ufeff') ? input.slice(1) : input;
  }

  // ## Public methods

  // ### `tokenize` starts the transformation of an N3 document into an array of tokens.
  // The input can be a string or a stream.
  // - String with a callback: tokens are delivered asynchronously through `callback(error, token)`.
  // - String without a callback: returns the array of tokens, or throws the syntax error.
  // - Anything else is treated as a stream exposing `on('data' | 'end' | 'error', handler)`;
  //   tokens are delivered through `callback` as chunks arrive.
  tokenize(input, callback) {
    this._line = 1;

    // If the input is a string, continuously emit tokens through the callback until the end
    if (typeof input === 'string') {
      this._input = this._readStartingBom(input);
      // If a callback was passed, asynchronously call it
      if (typeof callback === 'function')
        queueMicrotask(() => this._tokenizeToEnd(callback, true));
      // If no callback was passed, tokenize synchronously and return
      else {
        const tokens = [];
        let error;
        this._tokenizeToEnd((e, t) => e ? (error = e) : tokens.push(t), true);
        if (error) throw error;
        return tokens;
      }
    }
    // Otherwise, the input must be a stream
    else {
      // Start from a clean slate: a previous run leaves `_input` at `null`,
      // which would cause every chunk of this stream to be ignored
      this._input = undefined;
      this._pendingBuffer = null;
      if (typeof input.setEncoding === 'function')
        input.setEncoding('utf8');
      // Adds the data chunk to the buffer and parses as far as possible
      input.on('data', data => {
        if (this._input !== null && data.length !== 0) {
          // Prepend any previous pending writes
          if (this._pendingBuffer) {
            data = Buffer.concat([this._pendingBuffer, data]);
            this._pendingBuffer = null;
          }
          // Hold if the buffer ends in an incomplete unicode sequence
          // (approximated by "last byte has its high bit set")
          if (data[data.length - 1] & 0x80) {
            this._pendingBuffer = data;
          }
          // Otherwise, tokenize as far as possible
          else {
            // Only read a BOM at the start
            if (typeof this._input === 'undefined')
              this._input = this._readStartingBom(typeof data === 'string' ? data : data.toString());
            else
              this._input += data;
            this._tokenizeToEnd(callback, false);
          }
        }
      });
      // Parses until the end
      input.on('end', () => {
        // A chunk that was held back will never be completed by a later one,
        // so decode it now; otherwise a stream ending in a non-ASCII character loses its tail
        if (this._pendingBuffer !== null && this._input !== null) {
          const remainder = this._pendingBuffer.toString();
          this._input = typeof this._input === 'undefined' ?
            this._readStartingBom(remainder) : this._input + remainder;
        }
        this._pendingBuffer = null;
        if (typeof this._input === 'string')
          this._tokenizeToEnd(callback, true);
      });
      input.on('error', callback);
    }
  }
}