Skip to content
This repository has been archived by the owner on Aug 31, 2023. It is now read-only.

Commit

Permalink
feat(rome_json_parser): JSON Lexer (#3809)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaReiser committed Nov 23, 2022
1 parent e526178 commit d9c5c14
Show file tree
Hide file tree
Showing 606 changed files with 311,254 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .gitattributes
@@ -1,5 +1,5 @@
* text=auto eol=lf
crates/rome_js_parser/src/lexer/tables.rs linguist-generated=true text=auto eol=lf
crates/rome_js_unicode_table/src/tables.rs linguist-generated=true text=auto eol=lf
**/generated/* linguist-generated=true text=auto eol=lf
crates/rome_js_analyze/src/analyzers.rs linguist-generated=true text=auto eol=lf
crates/rome_js_analyze/src/assists.rs linguist-generated=true text=auto eol=lf
Expand Down
20 changes: 20 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/rome_js_parser/Cargo.toml
Expand Up @@ -12,6 +12,7 @@ rome_console = { path = "../rome_console" }
rome_diagnostics = { path = "../rome_diagnostics" }
rome_js_syntax = { path = "../rome_js_syntax" }
rome_js_factory = { path = "../rome_js_factory" }
rome_js_unicode_table = { path = "../rome_js_unicode_table" }
rome_rowan = { path = "../rome_rowan" }
drop_bomb = "0.1.5"
bitflags = "1.3.2"
Expand Down
20 changes: 4 additions & 16 deletions crates/rome_js_parser/src/lexer/mod.rs
Expand Up @@ -16,31 +16,27 @@
#![allow(clippy::or_fun_call)]

#[rustfmt::skip]
mod tables;
mod errors;
mod tests;

pub mod buffered_lexer;
mod bytes;
#[cfg(feature = "highlight")]
mod highlight;

use bitflags::bitflags;
#[cfg(feature = "highlight")]
pub use highlight::*;

use tables::derived_property::*;

pub(crate) use buffered_lexer::BufferedLexer;
pub use rome_js_syntax::*;

use self::bytes::{
lookup_byte,
Dispatch::{self, *},
};
use crate::ParseDiagnostic;
use rome_diagnostics::location::FileId;
use rome_js_syntax::JsSyntaxKind::*;
use rome_js_unicode_table::{
is_id_continue, is_id_start, lookup_byte,
Dispatch::{self, *},
};

use self::errors::invalid_digits_after_unicode_escape_sequence;

Expand All @@ -61,14 +57,6 @@ const UNICODE_SPACES: [char; 19] = [
'\u{205F}', '\u{3000}', '\u{FEFF}',
];

/// Returns `true` when `c` may begin an identifier: `_`, `$`, or any character
/// accepted by `ID_Start`.
fn is_id_start(c: char) -> bool {
    matches!(c, '_' | '$') || ID_Start(c)
}

/// Returns `true` when `c` may appear after the first character of an identifier:
/// `$`, U+200D (zero width joiner), U+200C (zero width non-joiner), or any
/// character accepted by `ID_Continue`.
fn is_id_continue(c: char) -> bool {
    matches!(c, '$' | '\u{200d}' | '\u{200c}') || ID_Continue(c)
}

/// Context in which the lexer should lex the next token
#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
pub enum LexContext {
Expand Down
28 changes: 14 additions & 14 deletions crates/rome_js_parser/src/lib.rs
Expand Up @@ -497,13 +497,13 @@ impl ParseDiagnostic {
/// ## Examples
///
/// ```
/// use rome_console::fmt::{Termcolor};
/// use rome_console::markup;
/// use rome_diagnostics::{DiagnosticExt, FileId, PrintDiagnostic, console::fmt::Formatter};
/// use rome_js_parser::ParseDiagnostic;
/// use rome_js_syntax::TextRange;
/// use rome_rowan::TextSize;
/// use std::fmt::Write;
/// # use rome_console::fmt::{Termcolor};
/// # use rome_console::markup;
/// # use rome_diagnostics::{DiagnosticExt, FileId, PrintDiagnostic, console::fmt::Formatter};
/// # use rome_js_parser::ParseDiagnostic;
/// # use rome_js_syntax::TextRange;
/// # use rome_rowan::TextSize;
/// # use std::fmt::Write;
///
/// let source = "const a";
/// let range = TextRange::new(TextSize::from(0), TextSize::from(5));
Expand Down Expand Up @@ -555,13 +555,13 @@ impl ParseDiagnostic {
/// ## Examples
///
/// ```
/// use rome_console::fmt::{Termcolor};
/// use rome_console::markup;
/// use rome_diagnostics::{DiagnosticExt, FileId, PrintDiagnostic, console::fmt::Formatter};
/// use rome_js_parser::ParseDiagnostic;
/// use rome_js_syntax::TextRange;
/// use rome_rowan::TextSize;
/// use std::fmt::Write;
/// # use rome_console::fmt::{Termcolor};
/// # use rome_console::markup;
/// # use rome_diagnostics::{DiagnosticExt, FileId, PrintDiagnostic, console::fmt::Formatter};
/// # use rome_js_parser::ParseDiagnostic;
/// # use rome_js_syntax::TextRange;
/// # use rome_rowan::TextSize;
/// # use std::fmt::Write;
///
/// let source = "const a";
/// let range = TextRange::new(TextSize::from(0), TextSize::from(5));
Expand Down
12 changes: 12 additions & 0 deletions crates/rome_js_unicode_table/Cargo.toml
@@ -0,0 +1,12 @@
[package]
edition = "2021"
name = "rome_js_unicode_table"
version = "0.0.0"
authors = ["Rome Tools and Contributors"]
license = "MIT"
description = "Unicode table for JavaScript IDs"
repository = "https://github.com/rome/tools"

[dependencies]

[dev-dependencies]
@@ -1,56 +1,117 @@
use Dispatch::*;

/// Returns the `Dispatch` entry that `DISPATCHER` stores for `byte`.
pub(crate) fn lookup_byte(byte: u8) -> Dispatch {
    let index = usize::from(byte);
    // SAFETY: `DISPATCHER` has 256 entries, one for every possible `u8` value,
    // so `index` is always in bounds.
    unsafe { *DISPATCHER.get_unchecked(index) }
}

// Every handler a byte coming in could be mapped to
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
/// Every handler an incoming byte can be mapped to.
///
/// Each variant names the category a single input byte falls into; the
/// `DISPATCHER` lookup table holds one of these values per possible `u8`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(u8)]
pub(crate) enum Dispatch {
pub enum Dispatch {
/// Error token
ERR,

/// Whitespace
WHS,

/// Exclamation mark `!`
EXL,

/// Single quote `'` or double quote `"`
QOT,

/// ASCII identifier, or `$`, `_`
IDT,

/// Hash `#`
HAS,

/// Percentage `%`
PRC,

/// Ampersand `&`
AMP,

/// Left paren `(`
PNO,

/// Right paren `)`
PNC,

/// Multiply `*`
MUL,

/// Plus `+`
PLS,

/// Comma `,`
COM,

/// Minus `-`
MIN,

/// Dot `.`
PRD,

/// Slash `/`
SLH,

/// Zero `0`
ZER,

/// Digit (`1`-`9`)
DIG,

/// Colon `:`
COL,

/// Semicolon `;`
SEM,

/// Less than `<`
LSS,

/// Equal `=`
EQL,

/// More than `>`
MOR,
/// Question mark `?`
QST,
/// At `@`
AT_,

/// Left bracket `[`
BTO,

/// Backslash `\`
BSL,

/// Right bracket `]`
BTC,

/// Caret `^`
CRT,

/// Backtick `` ` ``
TPL,

/// Left curly bracket `{`
BEO,

/// Pipe `|`
PIP,

/// Right curly bracket `}`
BEC,

/// Tilde `~`
TLD,

/// Unicode range (non ASCII)
UNI,
}

// A lookup table mapping any incoming byte to a handler function
// This is taken from the ratel project lexer and modified
// FIXME: Should we ignore the first ascii control chars which are nearly never seen instead of returning Err?
static DISPATCHER: [Dispatch; 256] = [
pub(crate) static DISPATCHER: [Dispatch; 256] = [
//0 1 2 3 4 5 6 7 8 9 A B C D E F //
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, WHS, WHS, WHS, WHS, WHS, ERR, ERR, // 0
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1
Expand Down
26 changes: 26 additions & 0 deletions crates/rome_js_unicode_table/src/lib.rs
@@ -0,0 +1,26 @@
use crate::bytes::DISPATCHER;
use crate::tables::derived_property::{ID_Continue, ID_Start};

mod bytes;
mod tables;

pub use crate::bytes::Dispatch;

/// Checks whether `c` can be the first character of an identifier.
///
/// Accepts `_` and `$` in addition to every character for which `ID_Start`
/// returns `true`.
#[inline]
pub fn is_id_start(c: char) -> bool {
    matches!(c, '_' | '$') || ID_Start(c)
}

/// Checks whether `c` can follow the first character of an identifier.
///
/// Accepts `$`, U+200D (zero width joiner) and U+200C (zero width non-joiner)
/// in addition to every character for which `ID_Continue` returns `true`.
#[inline]
pub fn is_id_continue(c: char) -> bool {
    matches!(c, '$' | '\u{200d}' | '\u{200c}') || ID_Continue(c)
}

/// Looks up the [`Dispatch`] entry for `byte` in the lookup table.
///
/// This never panics: the table has one entry for every possible `u8` value.
#[inline]
pub fn lookup_byte(byte: u8) -> Dispatch {
    // `DISPATCHER` is a `[Dispatch; 256]` and a `u8` is at most 255, so plain
    // indexing is always in bounds. No `unsafe` is required; the optimizer is
    // expected to drop the bounds check because the index range is known.
    DISPATCHER[usize::from(byte)]
}

0 comments on commit d9c5c14

Please sign in to comment.