Skip to content
Turns a string into tokens.
Branch: master
Clone or download
Fetching latest commit…
Cannot retrieve the latest commit at this time.
Permalink
Type Name Latest commit message Commit time
Failed to load latest commit information.
test
.gitignore
LICENSE
README.md
TokenizeThis.js
gulpfile.js
package.json
tokenize-this.min.js

README.md

TokenizeThis

Quickstart

It turns a string into tokens.

var tokenizer = new TokenizeThis();
var str = 'Tokenize this!';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Tokenize', 'this', '!']);

By default, it can tokenize math-based strings.

var tokenizer = new TokenizeThis();
var str = '5 + 6 -(4/2) + gcd(10, 5)';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});

equals(tokens, [5, '+', 6, '-', '(', 4, '/', 2, ')', '+', 'gcd', '(', 10, ',', 5, ')']);

...Or SQL.

var tokenizer = new TokenizeThis();
var str = 'SELECT COUNT(id), 5+6 FROM `users` WHERE name = "shaun persad" AND hobby IS NULL';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    
    if (surroundedBy) {
        tokens.push(surroundedBy+token+surroundedBy);
    } else {
        tokens.push(token);
    }
});
equals(tokens, [
    'SELECT',
    'COUNT', '(', 'id', ')',
    ',',
    5, '+', 6,
    'FROM', '`users`',
    'WHERE',
    'name', '=', '"shaun persad"',
    'AND',
    'hobby', 'IS', null
]);

Installation

npm install tokenize-this.

// or if in the browser: <script src="tokenize-this/tokenize-this.min.js"></script>

Usage

require it, create a new instance, then call tokenize.

// var TokenizeThis = require('tokenize-this');
// OR
// var TokenizeThis = require('tokenize-this/tokenize-this.min.js'); // for node.js < 4.0
// OR
// <script src="tokenize-this/tokenize-this.min.js"></script> <!-- if in browser -->

var tokenizer = new TokenizeThis();

var str = 'Hi!, I want to add 5+6';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Hi', '!', ',', 'I', 'want', 'to', 'add', 5, '+', 6]);

Advanced Usage

Supplying a config object to the constructor

See here for all options

This can be used to tokenize many forms of data, like JSON into key-value pairs.

var jsonConfig = {
    shouldTokenize: ['{', '}', '[', ']'],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t", ':', ','],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(jsonConfig);
var str = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['[', '{', 'name', 'Shaun Persad', 'id', 5, '}', '{', 'gender', null, '}', ']']);

Here it is tokenizing XML like a boss.

var xmlConfig = {
    shouldTokenize: ['<?', '?>', '<!', '<', '</', '>', '/>', '='],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(xmlConfig);
var str = `
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
   <product description="Cardigan Sweater" product_image="cardigan.jpg">
      <size description="Large" />
      <color_swatch image="red_cardigan.jpg">
        Red
      </color_swatch>
   </product>
</catalog>                
`;
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens,
    [
        '<?', 'xml-stylesheet', 'href', '=', 'catalog.xsl', 'type', '=', 'text/xsl', '?>',
        '<!', 'DOCTYPE', 'catalog', 'SYSTEM', 'catalog.dtd', '>',
        '<', 'catalog', '>',
        '<', 'product', 'description', '=', 'Cardigan Sweater', 'product_image', '=', 'cardigan.jpg', '>',
        '<', 'size', 'description', '=', 'Large', '/>',
        '<', 'color_swatch', 'image', '=', 'red_cardigan.jpg', '>',
        'Red',
        '</', 'color_swatch', '>',
        '</', 'product', '>',
        '</', 'catalog', '>'
    ]
);

The above examples are the first steps in writing parsers for those formats. The next would be parsing the stream of tokens based on the format-specific rules, e.g. SQL.

API

Methods

#tokenize(str:String, forEachToken:Function)

sends each token to the forEachToken(token:String, surroundedBy:String) callback.

var tokenizer = new TokenizeThis();
var str = 'Tokenize "this"!';
var tokens = [];
var forEachToken = function(token, surroundedBy) {
    tokens.push(surroundedBy+token+surroundedBy);
};
tokenizer.tokenize(str, forEachToken);
equals(tokens, ['Tokenize', '"this"', '!']);

it converts true, false, null, and numbers into their literal versions.

var tokenizer = new TokenizeThis();
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, [true, false, null, true, false, null, 1, 2, 3.4, 5.6789]);

.defaultConfig:Object

The default config object used when no config is supplied.

var config = {
    shouldTokenize: ['(', ')', ',', '*', '/', '%', '+', '-', '=', '!=', '!', '<', '>', '<=', '>=', '^'],
    shouldMatch: ['"', "'", '`'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true,
    escapeCharacter: "\\"
};
equals(TokenizeThis.defaultConfig, config);

You can change converting to literals with the convertLiterals config option.

var config = {
    convertLiterals: false
};
var tokenizer = new TokenizeThis(config);
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['true', 'false', 'null', 'TRUE', 'FALSE', 'NULL', '1', '2', '3.4', '5.6789']);

Any strings surrounded by the quotes specified in the shouldMatch option are treated as whole tokens.

var config = {
    shouldMatch: ['"', '`', '#']
};
var tokenizer = new TokenizeThis(config);
var str = '"hi there" `this is a test` #of quotes#';
var tokens = [];
var tokensQuoted = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
    tokensQuoted.push(surroundedBy+token+surroundedBy);
});
equals(tokens, ['hi there', 'this is a test', 'of quotes']);
equals(tokensQuoted, ['"hi there"', '`this is a test`', '#of quotes#']);

Quotes can be escaped via a backslash.

var tokenizer = new TokenizeThis();
var str = 'These are "\\"quotes\\""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});

equals(tokens, ['These', 'are', '"quotes"']);

The escape character can be specified with the escapeCharacter option.

var config = {
    escapeCharacter: '#'
};
var tokenizer = new TokenizeThis(config);
var str = 'These are "#"quotes#""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['These', 'are', '"quotes"']);
You can’t perform that action at this time.