Permalink
Browse files

Initial commit

  • Loading branch information...
0 parents commit ed20f90f02900cfac093e23351d28d98d190e8dd @kmcallister kmcallister committed Mar 11, 2014
Showing with 344 additions and 0 deletions.
  1. +28 −0 extract-from-spec.py
  2. +15 −0 html5.rs
  3. +165 −0 tokenizer/mod.rs
  4. +70 −0 tokenizer/states.rs
  5. +66 −0 tokenizer/tokens.rs
@@ -0,0 +1,28 @@
#!/usr/bin/env python
import re
from bs4 import BeautifulSoup

# Extract information from the WHATWG webapp spec.


def parse_spec():
    """Parse the locally saved spec (webapps.html) and return the
    sections we care about, keyed by name."""
    # `file()` is a Python-2-only builtin, removed in Python 3;
    # `open()` works on both.
    with open('webapps.html') as f:
        soup = BeautifulSoup(f)

    return {
        # The heading text "Tokenization" sits inside the <div> that
        # wraps the whole tokenization section.
        'tokenization': soup.find(text='Tokenization').find_parent('div'),
    }


def extract_tokenizer_states(spec):
    """Generate tokenizer/states.rs: one CamelCase enum variant per
    tokenizer state defined in the spec's Tokenization section."""
    with open('tokenizer/states.rs', 'w') as f:
        f.write('pub enum State {\n')

        # Each state is defined in an <h5><dfn>... state</dfn></h5> heading.
        for statedefn in spec['tokenization'].select('h5 > dfn'):
            statename = statedefn.text.lower()
            # Every state heading ends in "state"; drop that suffix and
            # CamelCase the remaining words to form the variant name.
            assert statename.endswith('state')
            words = re.sub(r'[^a-z]', ' ', statename[:-5]).split()
            f.write('    %s,\n' % (''.join(w.title() for w in words),))

        f.write('}\n')


if __name__ == '__main__':
    spec = parse_spec()
    extract_tokenizer_states(spec)
@@ -0,0 +1,15 @@
+pub mod tokenizer;
+
+// Trivial TokenSink that dumps every token to stdout via debug formatting.
+struct TokenPrinter;
+
+impl tokenizer::TokenSink for TokenPrinter {
+ // Called by the tokenizer once per emitted token.
+ fn process_token(&mut self, token: tokenizer::Token) {
+ println!("{:?}", token);
+ }
+}
+
+// Smoke test: tokenize a small HTML fragment and print the tokens.
+fn main() {
+ let mut sink = TokenPrinter;
+ let mut tok = tokenizer::Tokenizer::new(&mut sink);
+ tok.feed("<div>Hello, world!</div>");
+}
@@ -0,0 +1,165 @@
+pub use self::tokens::{Doctype, Attributes, TagKind, StartTag, EndTag, Tag, Token};
+pub use self::tokens::{DoctypeToken, TagToken, CommentToken, CharacterToken};
+
+mod tokens;
+mod states;
+
+// Consumer of the token stream: the tokenizer pushes each completed
+// token into a TokenSink rather than returning them.
+pub trait TokenSink {
+ // Receive one token. Tokens are delivered in document order.
+ fn process_token(&mut self, token: Token);
+}
+
+fn letter_to_ascii_lowercase(c: char) -> Option<char> {
+ c.to_ascii_opt()
+ .filtered(|a| a.is_alpha())
+ .map(|a| a.to_lower().to_char())
+}
+
+
+// The HTML tokenizer state machine. Borrows a sink for the lifetime
+// 'sink and pushes tokens into it as they are completed.
+pub struct Tokenizer<'sink, Sink> {
+ priv sink: &'sink mut Sink,
+ // Current state; see tokenizer/states.rs (one variant per spec state).
+ priv state: states::State,
+
+ // FIXME: The state machine guarantees the tag exists when
+ // we need it, so we could eliminate the Option overhead.
+ // Leaving it as Option for now, to find bugs.
+ priv current_tag: Option<Tag>,
+}
+
+// Result of process_char: whether the character was consumed or must
+// be fed to the state machine again (spec "reconsume" behavior).
+#[deriving(Eq)]
+enum ConsumeCharResult {
+ Reconsume,
+ Finished,
+}
+
+impl<'sink, Sink: TokenSink> Tokenizer<'sink, Sink> {
+ // Build a tokenizer starting in the Data state, emitting into `sink`.
+ pub fn new(sink: &'sink mut Sink) -> Tokenizer<'sink, Sink> {
+ Tokenizer {
+ sink: sink,
+ state: states::Data,
+ current_tag: None,
+ }
+ }
+
+ // Drive the state machine over `input`, one character at a time.
+ // States that need more than single-character dispatch are not
+ // implemented yet and abort via fail!.
+ pub fn feed(&mut self, input: &str) {
+ debug!("feeding {:s}", input);
+ let mut it = input.chars(); //.peekable();
+ loop {
+ match self.state {
+ // These states do something other than consume a single character.
+ states::CharacterReferenceInData | states::CharacterReferenceInRcdata
+ | states::CharacterReferenceInAttributeValue | states::BogusComment
+ | states::MarkupDeclarationOpen | states::CdataSection => {
+ fail!("FIXME: state {:?} not implemented", self.state);
+ }
+
+ _ => {
+ match it.next() {
+ // End of this chunk of input; more may be fed later.
+ None => return,
+ Some(c) => {
+ // Re-run the same character until the state
+ // machine reports it consumed.
+ while self.process_char(c) == Reconsume {
+ // reconsume
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Hand a completed token to the sink.
+ fn emit(&mut self, token: Token) {
+ self.sink.process_token(token);
+ }
+
+ // Begin a new current tag of `kind`, with `c` as the first character
+ // of its name. Asserts that no tag is already in progress.
+ fn create_tag(&mut self, kind: TagKind, c: char) {
+ assert!(self.current_tag.is_none());
+ let mut t = Tag::new(kind);
+ t.name.push_char(c);
+ self.current_tag = Some(t);
+ }
+
+ // Append `c` to the in-progress tag's name. Fails if no tag is in
+ // progress (see the FIXME on `current_tag`).
+ fn append_to_tag_name(&mut self, c: char) {
+ self.current_tag.get_mut_ref().name.push_char(c);
+ }
+
+ // FIXME: explicitly represent the EOF character?
+ // For now the plan is to handle EOF in a separate match.
+ //
+ // Process one character in the current state. Returns Reconsume when
+ // the caller must feed the same character again (the state changed
+ // without consuming it), Finished otherwise.
+ fn process_char(&mut self, c: char) -> ConsumeCharResult {
+ let parse_error = || {
+ error!("Parse error: saw {:?} in state {:?}", c, self.state);
+ };
+
+ debug!("Processing {:?} in state {:?}", c, self.state);
+ match self.state {
+ states::Data => match c {
+ '&' => { self.state = states::CharacterReferenceInData; }
+ '<' => { self.state = states::TagOpen; }
+ // NUL in Data is a parse error but is emitted unchanged.
+ '\0' => {
+ parse_error();
+ self.emit(CharacterToken('\0'));
+ }
+ _ => { self.emit(CharacterToken(c)); }
+ },
+
+ states::TagOpen => match c {
+ '!' => { self.state = states::MarkupDeclarationOpen; }
+ '/' => { self.state = states::EndTagOpen; }
+ '?' => {
+ parse_error();
+ self.state = states::BogusComment;
+ }
+ // An ASCII letter starts a start tag; tag names are
+ // case-folded to lowercase here.
+ _ => match letter_to_ascii_lowercase(c) {
+ Some(cl) => {
+ self.create_tag(StartTag, cl);
+ self.state = states::TagName;
+ }
+ // Not a tag after all: emit the '<' literally and
+ // reprocess this character in the Data state.
+ None => {
+ parse_error();
+ self.emit(CharacterToken('<'));
+ self.state = states::Data;
+ return Reconsume;
+ }
+ }
+ },
+
+ states::TagName => match c {
+ '\t' | '\n' | '\x0C' | ' ' => { self.state = states::BeforeAttributeName; }
+ '/' => { self.state = states::SelfClosingStartTag; }
+ // '>' completes the tag: emit it and return to Data.
+ '>' => {
+ let tok = self.current_tag.take().unwrap();
+ self.emit(TagToken(tok));
+ self.state = states::Data;
+ }
+ // NUL in a tag name is replaced with U+FFFD.
+ '\0' => {
+ parse_error();
+ self.append_to_tag_name('\ufffd');
+ }
+ _ => match letter_to_ascii_lowercase(c) {
+ Some(cl) => { self.append_to_tag_name(cl); }
+ None => { self.append_to_tag_name(c); }
+ }
+ },
+
+ states::EndTagOpen => match c {
+ // "</>" is a parse error; nothing is emitted.
+ '>' => {
+ parse_error();
+ self.state = states::Data;
+ }
+ _ => match letter_to_ascii_lowercase(c) {
+ Some(cl) => {
+ self.create_tag(EndTag, cl);
+ self.state = states::TagName;
+ }
+ None => {
+ parse_error();
+ self.state = states::BogusComment;
+ }
+ }
+ },
+
+ s => fail!("FIXME: state {:?} not implemented", s),
+ }
+
+ Finished
+
+ }
+}
@@ -0,0 +1,70 @@
+// Tokenizer states, one variant per state in the WHATWG spec's
+// Tokenization section. This file is generated by extract-from-spec.py;
+// edit that script rather than this file.
+pub enum State {
+ Data,
+ CharacterReferenceInData,
+ Rcdata,
+ CharacterReferenceInRcdata,
+ Rawtext,
+ ScriptData,
+ Plaintext,
+ TagOpen,
+ EndTagOpen,
+ TagName,
+ RcdataLessThanSign,
+ RcdataEndTagOpen,
+ RcdataEndTagName,
+ RawtextLessThanSign,
+ RawtextEndTagOpen,
+ RawtextEndTagName,
+ ScriptDataLessThanSign,
+ ScriptDataEndTagOpen,
+ ScriptDataEndTagName,
+ ScriptDataEscapeStart,
+ ScriptDataEscapeStartDash,
+ ScriptDataEscaped,
+ ScriptDataEscapedDash,
+ ScriptDataEscapedDashDash,
+ ScriptDataEscapedLessThanSign,
+ ScriptDataEscapedEndTagOpen,
+ ScriptDataEscapedEndTagName,
+ ScriptDataDoubleEscapeStart,
+ ScriptDataDoubleEscaped,
+ ScriptDataDoubleEscapedDash,
+ ScriptDataDoubleEscapedDashDash,
+ ScriptDataDoubleEscapedLessThanSign,
+ ScriptDataDoubleEscapeEnd,
+ BeforeAttributeName,
+ AttributeName,
+ AfterAttributeName,
+ BeforeAttributeValue,
+ AttributeValueDoubleQuoted,
+ AttributeValueSingleQuoted,
+ AttributeValueUnquoted,
+ CharacterReferenceInAttributeValue,
+ AfterAttributeValueQuoted,
+ SelfClosingStartTag,
+ BogusComment,
+ MarkupDeclarationOpen,
+ CommentStart,
+ CommentStartDash,
+ Comment,
+ CommentEndDash,
+ CommentEnd,
+ CommentEndBang,
+ Doctype,
+ BeforeDoctypeName,
+ DoctypeName,
+ AfterDoctypeName,
+ AfterDoctypePublicKeyword,
+ BeforeDoctypePublicIdentifier,
+ DoctypePublicIdentifierDoubleQuoted,
+ DoctypePublicIdentifierSingleQuoted,
+ AfterDoctypePublicIdentifier,
+ BetweenDoctypePublicAndSystemIdentifiers,
+ AfterDoctypeSystemKeyword,
+ BeforeDoctypeSystemIdentifier,
+ DoctypeSystemIdentifierDoubleQuoted,
+ DoctypeSystemIdentifierSingleQuoted,
+ AfterDoctypeSystemIdentifier,
+ BogusDoctype,
+ CdataSection,
+}
@@ -0,0 +1,66 @@
+use std::hashmap::HashMap;
+use std::str;
+
+// FIXME: already exists in Servo DOM
+// A DOCTYPE token's payload. All fields start "missing" (None/false)
+// and are filled in as the doctype is parsed.
+pub struct Doctype {
+ name: Option<~str>,
+ public_id: Option<~str>,
+ system_id: Option<~str>,
+ force_quirks: bool,
+}
+
+impl Doctype {
+ // An empty doctype: no name or identifiers, quirks not forced.
+ pub fn new() -> Doctype {
+ Doctype {
+ name: None,
+ public_id: None,
+ system_id: None,
+ force_quirks: false,
+ }
+ }
+}
+
+
+// A tag's attributes, as a name -> value map.
+// NOTE(review): a HashMap does not preserve source order and silently
+// collapses duplicate attribute names — confirm that's acceptable here.
+pub struct Attributes {
+ data: HashMap<~str, ~str>,
+}
+
+impl Attributes {
+ // An empty attribute set.
+ pub fn new() -> Attributes {
+ Attributes {
+ data: HashMap::new(),
+ }
+ }
+}
+
+// Whether a Tag token is an opening (<a>) or closing (</a>) tag.
+pub enum TagKind {
+ StartTag,
+ EndTag,
+}
+
+// A start or end tag token: kind, lowercased name, self-closing flag,
+// and its attributes.
+pub struct Tag {
+ kind: TagKind,
+ name: ~str,
+ self_closing: bool,
+ attrs: Attributes,
+}
+
+impl Tag {
+ // A fresh tag of the given kind with an empty name, not self-closing,
+ // and no attributes.
+ pub fn new(kind: TagKind) -> Tag {
+ Tag {
+ kind: kind,
+ name: str::with_capacity(8), // FIXME: justify this
+ self_closing: false,
+ attrs: Attributes::new(),
+ }
+ }
+}
+
+
+// Output of the tokenizer: the token kinds delivered to a TokenSink.
+pub enum Token {
+ DoctypeToken(Doctype),
+ TagToken(Tag),
+ CommentToken(~str),
+ // A single character of text.
+ CharacterToken(char),
+ EOFToken,
+}

0 comments on commit ed20f90

Please sign in to comment.