Skip to content
Browse files

Initial checkin.

  • Loading branch information...
0 parents commit ddeac878df59a4e529c9a68a559d96ea991b7fc2 @sfrancisx committed May 3, 2012
Showing with 2,162 additions and 0 deletions.
  1. +649 −0 DupFind.java
  2. +4 −0 Makefile
  3. +100 −0 README
  4. +363 −0 Token.java
  5. +650 −0 Tokenizer.java
  6. +317 −0 Utils.java
  7. +4 −0 dupfind
  8. +73 −0 dupfind.cfg
  9. +2 −0 dupfind.inf
  10. BIN dupfind.jar
649 DupFind.java
@@ -0,0 +1,649 @@
+/*
+Copyright (c) 2012, Yahoo! Inc. All rights reserved.
+
+Redistribution and use of this software in source and binary forms,
+with or without modification, are permitted provided that the following
+conditions are met:
+
+* Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+* Neither the name of Yahoo! Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior
+ written permission of Yahoo! Inc.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+import java.io.*;
+import java.util.*;
+
+class DupFind
+{
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Program entry point: loads the configuration, selects the sources to
+  * scan, tokenizes every matching file, searches for duplicated token runs
+  * from cfg.max tokens down to cfg.min tokens, and prints a per-file summary.
+  *
+  * @param args names of configured file sets and/or file patterns; when
+  *             empty, the sources flagged as default in the configuration
+  *             are scanned
+  * @throws IOException kept in the signature for existing callers
+  * @throws Throwable   anything raised while reading the configuration or
+  *                     the source files
+  */
+ public static void main(String[] args)
+     throws IOException, Throwable
+ {
+     int totalTokens = 0;
+
+     // Read the config file
+     Config cfg = readConfigFile();
+
+     // args.length != 0 --> Only process sources/files from the command line
+     if (args.length != 0)
+     {
+         for (int i = 0; i < cfg.sources.length; i++)
+             cfg.sources[i].def = false;
+
+         for (int i = 0; i < args.length; i++)
+         {
+             boolean found = false;
+             for (int j = 0; j < cfg.sources.length; j++)
+             {
+                 String name = cfg.sources[j].name;
+                 if (name != null && name.equals(args[i]))
+                 {
+                     found = true;
+                     cfg.sources[j].def = true;
+                 }
+             }
+
+             // Not the name of a known file set: treat the argument as a file pattern.
+             if (!found)
+                 cfg.addSource(args[i]);
+         }
+     }
+
+     StringBuilder info = new StringBuilder();
+     for (int i = 0; i < cfg.sources.length; i++)
+     {
+         Source source = cfg.sources[i];
+         if (!source.def)
+             continue;
+
+         if (info.length() > 0)
+             info.append(", ");
+
+         // Sources without a name (e.g. the built-in default) are shown by their root
+         // instead of printing "null".
+         info.append(source.name != null ? source.name : source.root);
+     }
+     System.out.println("Reading source from " + info);
+
+     Hashtable paths = new Hashtable();
+
+     // Read all the sources
+     for (int i = 0; i < cfg.sources.length; i++)
+     {
+         if (cfg.sources[i].def)
+             totalTokens += readFiles(paths, cfg.sources[i]);
+     }
+
+     // A zero or negative increment would never bring numToCheck below cfg.min,
+     // so the search would loop forever; fall back to a step of one token.
+     int step = Math.max(1, cfg.increment);
+
+     int totalMatches = 0;
+     for (int numToCheck = cfg.max; numToCheck >= cfg.min; numToCheck -= step)
+         totalMatches += checkForDuplicatedCodeSegments(paths, numToCheck, cfg.fuzzy);
+
+     ArrayList files = new ArrayList();
+     Enumeration e = paths.elements();
+     while (e.hasMoreElements())
+         files.add(e.nextElement());
+
+     System.out.println("\n\nDuplicated tokens by file:\n--------------------------");
+
+     // JsFile doubles as the Comparator (descending duplicated count), so any
+     // instance can be used to sort the list.
+     if (!files.isEmpty())
+         Collections.sort(files, (JsFile)files.get(0));
+
+     int matchedTokens = 0;
+     for (int i = 0; i < files.size(); i++)
+     {
+         JsFile f = (JsFile)files.get(i);
+
+         matchedTokens += f.duplicated;
+         if (f.duplicated > 0)
+             System.out.println(f.duplicated + ":" + f.path);
+     }
+
+     // perc holds tenths of a percent. The multiplication is done in long
+     // arithmetic to avoid int overflow, and an empty scan reports 0.0%
+     // instead of dividing by zero.
+     float perc = (totalTokens == 0) ? 0 : (float)(matchedTokens * 1000L / totalTokens);
+     System.out.println("\nFound " + totalMatches + " duplicated segments. "
+         + matchedTokens + " duplicated tokens out of " + totalTokens + " total (" + (perc/10) + "%).");
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ private static int checkForDuplicatedCodeSegments(Hashtable paths, int tokensToMatch, boolean fuzzy)
+ {
+ Hashtable segments = new Hashtable();
+
+ int i;
+
+ // Enumerate over all files, computing a hash for every segment of 'tokensToMatch' consecutive tokens
+ Enumeration e = paths.elements();
+ while (e.hasMoreElements())
+ {
+ JsFile m = (JsFile)e.nextElement();
+
+ Token[] tokens = m.tokens;
+
+ int tp = 1;
+ int ts = 0;
+ if (tokens.length < tokensToMatch)
+ continue;
+
+ for (i = 0; i < tokensToMatch-1; i++)
+ {
+ tp *= tokens[i].type; // tp => token product. The product of all token types
+ ts += tokens[i].type; // ts => token sum. The sum of all token types
+ }
+
+ do
+ {
+ ts += tokens[i].type;
+ tp *= tokens[i].type;
+
+ CodeSegment cs = new CodeSegment(m, i+1-tokensToMatch, ts, tp);
+
+ ArrayList a = (ArrayList)segments.get(ts*tp + ts + tp);
+ if (a == null)
+ {
+ a = new ArrayList();
+ segments.put(ts*tp + ts + tp, a);
+ }
+ a.add(cs);
+
+ i++;
+
+ ts -= tokens[i-tokensToMatch].type;
+ tp /= tokens[i-tokensToMatch].type;
+
+ } while (i < tokens.length);
+ }
+
+ int totalMatches = 0;
+ e = segments.elements();
+ while (e.hasMoreElements())
+ {
+ ArrayList a = (ArrayList)e.nextElement();
+
+ //Utils.debug("Completed " + count + " of " + segments.size() + ". Next length = " + a.size() + ". Matches so far: " + totalMatches);
+
+ for (i = 0; i < a.size()-1; i++)
+ {
+ CodeSegment cs1 = (CodeSegment)a.get(i);
+ if (cs1.foundDup)
+ continue;
+
+ boolean reported = false;
+ for (int j = 0; j < tokensToMatch; j++)
+ {
+ if (cs1.info.reported[cs1.offset + j])
+ {
+ reported = true;
+ break;
+ }
+ }
+ if (reported)
+ continue;
+
+ Token[] t1 = cs1.info.tokens;
+ String dup = "";
+ int count = 0;
+
+ for (int j = i+1; j < a.size(); j++)
+ {
+ CodeSegment cs2 = (CodeSegment)a.get(j);
+
+ if (cs1.tokenSum != cs2.tokenSum || cs1.tokenProduct != cs2.tokenProduct)
+ continue;
+
+ Token[] t2 = cs2.info.tokens;
+
+ boolean match = true;
+ boolean prevDot = false;
+ for (int k = 0; k < tokensToMatch; k++)
+ {
+ if (cs2.info.reported[cs2.offset+k])
+ {
+ match = false;
+ break;
+ }
+
+ Token token1 = t1[cs1.offset+k];
+ Token token2 = t2[cs2.offset+k];
+
+ if (token1.type != token2.type)
+ {
+ match = false;
+ break;
+ }
+
+ if (fuzzy)
+ {
+ if (!prevDot && token1.type == Token.NAME)
+ continue;
+ }
+
+ if (token1.src.compareTo(token2.src) != 0)
+ {
+ match = false;
+ break;
+ }
+
+ prevDot = false;
+ if (token1.type == Token.DOT)
+ prevDot = true;
+ }
+
+ if (match)
+ {
+ if (!cs1.foundDup)
+ {
+ cs1.foundDup = true;
+ String code = dumpCode(cs1, tokensToMatch, count == 0);
+ count++;
+ dup = code + "\n" + dup;
+ }
+
+ cs2.foundDup = true;
+ String code = dumpCode(cs2, tokensToMatch, count == 0);
+ count++;
+ dup = code + "\n" + dup;
+ }
+ }
+
+ if (count > 1)
+ {
+ totalMatches += count;
+ System.out.print("-------- " + tokensToMatch + " tokens --------\n" + dup);
+ }
+ }
+ }
+
+ return totalMatches;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Marks the 'len' tokens of a segment as reported, adds 'len' to the owning
+  * file's duplicated count, and returns a printable description of the segment.
+  *
+  * @param cs         the segment to describe
+  * @param len        number of tokens in the segment
+  * @param includeSrc true to append the segment's reconstructed source text
+  *                   on a second line
+  * @return "path, line N", optionally followed by a newline and the source text
+  */
+ private static String dumpCode(CodeSegment cs, int len, boolean includeSrc)
+ {
+     JsFile file = cs.info;
+     int first = cs.offset;
+
+     file.duplicated += len;
+
+     StringBuilder text = new StringBuilder();
+     for (int k = first; k < first + len; k++)
+     {
+         file.reported[k] = true;
+
+         Token token = file.tokens[k];
+         text.append(token.src);
+
+         // These keywords need a separating blank to stay readable once re-joined.
+         switch (token.type)
+         {
+             case Token.FUNCTION:
+             case Token.RETURN:
+             case Token.VAR:
+                 text.append(" ");
+                 break;
+             default:
+                 break;
+         }
+     }
+
+     String location = file.path + ", line " + file.tokens[first].lineNum;
+     return includeSrc ? location + "\n" + text : location;
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Reads every file belonging to one source into 'paths'.
+  *
+  * @param paths table that receives canonical path -> JsFile entries
+  * @param s     the source (root, optional subdirectories, filters) to scan
+  * @return the number of tokens read
+  */
+ private static int readFiles(Hashtable paths, Source s)
+     throws Throwable
+ {
+     String[] subdirs = s.directories;
+
+     // No subdirectories listed: scan from the root itself.
+     if (subdirs.length == 0)
+         return readFilesFromDir(paths, s.root, s);
+
+     int tokenCount = 0;
+     for (String subdir : subdirs)
+         tokenCount += readFilesFromDir(paths, Utils.getPath(s.root, subdir), s);
+
+     return tokenCount;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Reads 'dir' (a file or a directory) into 'paths', recursing into
+  * directories.
+  *
+  * Exclude patterns are matched against the canonical path of both files and
+  * directories; a matching directory is not descended into. Include patterns,
+  * when the source has any, are matched against the bare file name of regular
+  * files only.
+  *
+  * @param paths table that receives canonical path -> JsFile entries
+  * @param dir   path of the file or directory to process
+  * @param s     the source supplying the include/exclude patterns
+  * @return the number of tokens read beneath 'dir'
+  */
+ private static int readFilesFromDir(Hashtable paths, String dir, Source s)
+     throws Throwable
+ {
+     File entry = new File(dir);
+
+     java.util.regex.Pattern[] wanted = s.include;
+     java.util.regex.Pattern[] unwanted = s.exclude;
+
+     for (java.util.regex.Pattern rule : unwanted)
+     {
+         if (rule.matcher(entry.getCanonicalPath()).matches())
+             return 0;
+     }
+
+     if (entry.isDirectory())
+     {
+         int tokenCount = 0;
+
+         String[] children = entry.list();
+         if (children != null)
+         {
+             for (String child : children)
+                 tokenCount += readFilesFromDir(paths, Utils.getPath(dir, child), s);
+         }
+
+         return tokenCount;
+     }
+
+     // Regular file: it has to match at least one include pattern, if there are any.
+     if (wanted != null)
+     {
+         boolean accepted = false;
+         for (int k = 0; k < wanted.length && !accepted; k++)
+             accepted = wanted[k].matcher(entry.getName()).matches();
+
+         if (!accepted)
+             return 0;
+     }
+
+     String canonical = entry.getCanonicalPath();
+
+     if (paths.get(canonical) != null)
+     {
+         System.out.println("File specified twice: " + canonical);
+         return 0;
+     }
+
+     return readFile(canonical, paths);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Tokenizes one file and registers it in 'paths' under its name.
+  *
+  * @param name  path of the file to read; also used as the key in 'paths'
+  * @param paths table that receives the new JsFile
+  * @return the number of tokens in the file, or 0 when Utils.getTokens
+  *         returned null (in which case nothing is registered)
+  */
+ private static int readFile(String name, Hashtable paths)
+     throws Throwable
+ {
+     Token[] parsed = Utils.getTokens(name);
+
+     if (parsed != null)
+     {
+         paths.put(name, new JsFile(name, parsed));
+         return parsed.length;
+     }
+
+     return 0;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ /**
+  * Loads the configuration from the first location that yields tokens:
+  * ./dupfind.cfg, then .dupfind.cfg in the user's home directory, then
+  * /home/y/conf/dupfind/dupfind.cfg.
+  *
+  * @return the parsed configuration, or the built-in defaults when none of
+  *         the locations could be read
+  */
+ private static Config readConfigFile()
+     throws Throwable
+ {
+     // java.io does not expand "~", so the home directory is resolved through
+     // the user.home system property. If the property is missing, the literal
+     // "~" is kept, which is what was passed before.
+     // NOTE(review): if Utils.getTokens expands "~" itself this is equivalent -
+     // that helper is not visible from here.
+     String homeConfig = new File(System.getProperty("user.home", "~"), ".dupfind.cfg").getPath();
+
+     String[] candidates =
+     {
+         "dupfind.cfg",
+         homeConfig,
+         "/home/y/conf/dupfind/dupfind.cfg"
+     };
+
+     // Read the config file into a generic representation
+     Token[] tokens = null;
+     for (int i = 0; i < candidates.length && tokens == null; i++)
+         tokens = Utils.getTokens(candidates[i]);
+
+     if (tokens == null)
+         return new Config();
+
+     Hashtable obj = new Hashtable();
+     Utils.parseObject(tokens, 0, tokens.length, obj);
+
+     // Put the generic representation into something easier to use
+     return new Config(obj);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+class JsFile
+ implements Comparator
+{
+ public JsFile(String path, Token[] tokens)
+ {
+ this.path = path;
+ this.tokens = tokens;
+
+ reported = new boolean[tokens.length];
+
+ duplicated = 0;
+ }
+
+ public int compare(Object o1, Object o2)
+ {
+ return (((JsFile)o2).duplicated - ((JsFile)o1).duplicated);
+ }
+
+ public String path;
+ public Token[] tokens;
+ public boolean[] reported;
+
+ public int duplicated;
+}
+
+///////////////////////////////////////////////////////////////////////////
+/**
+ * A candidate window of consecutive tokens inside one file, together with the
+ * caller-computed hash values used to pre-filter comparisons.
+ */
+class CodeSegment
+{
+    public JsFile info;               // the file containing the segment
+    public int offset;                // index of the segment's first token in info.tokens
+    public int tokenSum;              // hash component supplied by the creator
+    public int tokenProduct;          // hash component supplied by the creator
+    public boolean foundDup = false;  // set once the segment was reported as a duplicate
+
+    /**
+     * @param file         the file containing the segment
+     * @param offset       index of the segment's first token
+     * @param tokenSum     first hash component of the segment
+     * @param tokenProduct second hash component of the segment
+     */
+    public CodeSegment(JsFile file, int offset, int tokenSum, int tokenProduct)
+    {
+        info = file;
+        this.offset = offset;
+        this.tokenSum = tokenSum;
+        this.tokenProduct = tokenProduct;
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+/**
+ * Run-time settings: the range of window sizes to try, the fuzzy flag, and
+ * the list of sources to scan.
+ */
+class Config
+{
+    int min;        // smallest number of consecutive tokens to check
+    int max;        // largest number of consecutive tokens to check
+    int increment;  // amount the window shrinks between passes
+    boolean fuzzy;  // true to ignore differences in names
+
+    Source[] sources;
+
+    /**
+     * Builds the configuration from the parsed config-file object. Members
+     * missing from 'obj' keep the defaults set by init(). When no source is
+     * flagged as default, all of them are treated as default. A missing
+     * "sources" member yields the same single built-in source as the no-arg
+     * constructor instead of failing with a NullPointerException.
+     *
+     * @param obj the config file as a table of member name -> value
+     * @throws NumberFormatException if min, max or increment is not an integer
+     */
+    public Config(Hashtable obj)
+    {
+        init();
+
+        min = intMember(obj, "min", min);
+        max = intMember(obj, "max", max);
+        increment = intMember(obj, "increment", increment);
+
+        // Anything other than absent, "0" or "false" switches fuzzy matching on.
+        String s = (String)obj.get("fuzzy");
+        fuzzy = !(s == null || s.equals("0") || s.equals("false"));
+
+        ArrayList configured = (ArrayList)obj.get("sources");
+        if (configured == null)
+        {
+            sources = new Source[] { new Source() };
+            return;
+        }
+
+        sources = new Source[configured.size()];
+
+        boolean anyDefault = false;
+        for (int i = 0; i < sources.length; i++)
+        {
+            sources[i] = new Source((Hashtable)configured.get(i));
+            anyDefault |= sources[i].def;
+        }
+
+        if (!anyDefault)
+        {
+            for (int i = 0; i < sources.length; i++)
+                sources[i].def = true;
+        }
+    }
+
+    /** Default settings with a single built-in source. */
+    public Config()
+    {
+        init();
+
+        sources = new Source[] { new Source() };
+    }
+
+    /**
+     * Appends a source built from a command-line file pattern.
+     *
+     * @param source the file pattern
+     */
+    public void addSource(String source)
+    {
+        Source[] grown = new Source[sources.length+1];
+        System.arraycopy(sources, 0, grown, 0, sources.length);
+        grown[grown.length-1] = new Source(source);
+
+        sources = grown;
+    }
+
+    /** Resets the numeric settings and the fuzzy flag to their defaults. */
+    void init()
+    {
+        min = 30;
+        max = 500;
+        increment = 10;
+        fuzzy = false;
+    }
+
+    /** Returns the integer stored under 'key', or 'fallback' when the member is absent. */
+    private static int intMember(Hashtable obj, String key, int fallback)
+    {
+        String s = (String)obj.get(key);
+        return (s == null) ? fallback : Integer.decode(s).intValue();
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+/**
+ * One set of files to scan: a root directory, optional subdirectories, and
+ * include/exclude patterns.
+ */
+class Source
+{
+    public String name;                        // file-set name; null for the built-in default
+    public String root;                        // directory the scan starts from
+    public String[] directories;               // subdirectories of root to scan; empty = scan root itself
+    public java.util.regex.Pattern[] include;  // patterns a file must match; null = accept every file
+    public java.util.regex.Pattern[] exclude;  // patterns that exclude files and directories
+    public boolean def;                        // true if scanned when no arguments are given
+
+    /**
+     * Builds a source from one entry of the config file's "sources" array.
+     * A missing "root" defaults to "." so that a null path is never handed to
+     * the directory scan. "def" is false when absent, "0" or "false".
+     *
+     * @param action the entry as a table of member name -> value
+     */
+    public Source(Hashtable action)
+    {
+        String configuredRoot = (String)action.get("root");
+        this.root = (configuredRoot != null) ? configuredRoot : ".";
+
+        this.name = (String)action.get("name");
+
+        String flag = (String)action.get("def");
+        this.def = !(flag == null || flag.equals("0") || flag.equals("false"));
+
+        ArrayList dirs = (ArrayList)action.get("directories");
+        if (dirs == null)
+            directories = new String[0];
+        else
+        {
+            directories = new String[dirs.size()];
+            for (int i = 0; i < dirs.size(); i++)
+                directories[i] = (String)dirs.get(i);
+        }
+
+        // 'include' deliberately stays null when the member is absent.
+        ArrayList inc = (ArrayList)action.get("include");
+        if (inc != null)
+            include = toPatterns(inc);
+
+        ArrayList exc = (ArrayList)action.get("exclude");
+        exclude = (exc == null) ? new java.util.regex.Pattern[0] : toPatterns(exc);
+    }
+
+    /** The built-in default: every *.js file in or below the current directory. */
+    public Source()
+    {
+        root = ".";
+        directories = new String[0];
+        include = new java.util.regex.Pattern[] { getFilePattern("*.js") };
+        exclude = new java.util.regex.Pattern[0];
+
+        def = true;
+    }
+
+    /**
+     * Builds a source from a command-line pattern. Everything up to the last
+     * '/' is the root and the remainder is the include pattern. A pattern
+     * ending in '/' is used as the root with "*.js" as the include pattern,
+     * and a pattern without '/' is matched below the current directory.
+     *
+     * @param pattern the command-line argument
+     */
+    public Source(String pattern)
+    {
+        name = pattern;
+        directories = new String[0];
+        include = new java.util.regex.Pattern[1];
+
+        int index = pattern.lastIndexOf('/');
+
+        if (index == pattern.length() - 1)
+        {
+            root = pattern;
+            include[0] = getFilePattern("*.js");
+        }
+        else if (index != -1)
+        {
+            // "/foo.js" lives in the filesystem root; substring(0, 0) would
+            // produce an empty root that is not a directory.
+            root = (index == 0) ? "/" : pattern.substring(0, index);
+            include[0] = getFilePattern(pattern.substring(index+1));
+        }
+        else
+        {
+            root = ".";
+            include[0] = getFilePattern(pattern);
+        }
+
+        exclude = new java.util.regex.Pattern[0];
+
+        def = true;
+    }
+
+    /////////////////////////////////////////////////////////////////////
+    /**
+     * Converts a file pattern to a compiled regex: '.' becomes a literal dot,
+     * '*' becomes '.*' and '?' becomes '.'. All other characters are passed to
+     * the regex compiler unchanged, so regex syntax such as character classes
+     * keeps working inside a pattern.
+     *
+     * @param pattern the file pattern
+     * @return the compiled regular expression
+     */
+    static java.util.regex.Pattern getFilePattern(String pattern)
+    {
+        String regex = pattern.replace(".", "\\.")
+                              .replace("*", ".*")
+                              .replace("?", ".");
+
+        return java.util.regex.Pattern.compile(regex);
+    }
+
+    /** Compiles a list of file-pattern strings. */
+    private static java.util.regex.Pattern[] toPatterns(ArrayList patterns)
+    {
+        java.util.regex.Pattern[] compiled = new java.util.regex.Pattern[patterns.size()];
+        for (int i = 0; i < compiled.length; i++)
+            compiled[i] = getFilePattern((String)patterns.get(i));
+
+        return compiled;
+    }
+}
4 Makefile
@@ -0,0 +1,4 @@
+# Build dupfind.jar: compile the sources, package every .class file into the
+# jar using dupfind.inf as the manifest, then delete the intermediate .class
+# files. Only DupFind.java and Tokenizer.java are named on the javac command
+# line; presumably the remaining sources are compiled because these two
+# reference them.
+dupfind.jar: *.java
+ javac DupFind.java Tokenizer.java
+ jar -cmvf dupfind.inf dupfind.jar *.class
+ rm -f *.class
100 README
@@ -0,0 +1,100 @@
+Version 1.0.1
+
+dupfind: Scan JavaScript code and find duplicated sections.
+
+Synopsis
+--------
+dupfind [file_set ...] [file_pattern ...]
+
+Description
+-----------
+dupfind will read multiple JS files and look for duplicated segments of code.
+It matches tokens, so it will ignore comments and it's insensitive to
+whitespace. It can do a fuzzy match, which will also ignore changes to
+variable names. You can define filesets in a configuration file, for often
+used files.
+
+When run with no arguments, default filesets from the configuration file will
+be scanned. When run with arguments, the arguments are either the names of
+filesets in the configuration file, or file patterns to match.
+
+Configuration
+-------------
+You can create a configuration file for dupfind to use. dupfind will search
+for the configuration file in these locations, in this order:
+
+ ./dupfind.cfg
+ ~/.dupfind.cfg
+ /home/y/conf/dupfind/dupfind.cfg
+
+If it doesn't find a configuration file, it will search all .js files in or
+below the current directory.
+
+The configuration file should contain JSON that looks like this:
+
+{
+ min: 30,
+ max: 500,
+ increment: 10,
+ fuzzy: true,
+
+ sources:
+ [
+ {
+ name: "yui",
+ def: true,
+ root: "/home/y/share/htdocs/yui3",
+ directories:
+ [
+ "build"
+ ],
+ include:
+ [
+ "*.js"
+ ],
+ exclude:
+ [
+ "*/.svn",
+ "*simpleyui.js",
+ "*-[^/]*.js",
+ "*yui.js",
+ "*datatype*"
+ ]
+ }
+ ]
+}
+
+min: The minimum number of consecutive duplicated tokens required to
+ report the duplication.
+max: The maximum number of consecutive tokens to check.
+increment: dupfind looks for duplication multiple times. The first time, it
+ looks for 'max' consecutive tokens. It reduces the number by
+ 'increment' and checks again (I guess 'increment' should really be
+ 'decrement'...) It continues until the number being checked is
+ less than 'min'.
+fuzzy: 'true' to do a fuzzy match. A fuzzy match ignores changes to
+ variable names.
+sources: An array of objects describing the files to scan.
+
+Each object in the sources array contains:
+
+name: The name of the file set. This name can be provided as an
+ argument on the command line to limit scanning to this fileset.
+def: 'true' if this is a default fileset. All default filesets will be
+ scanned when dupfind is executed with no arguments.
+root: The root directory for the fileset.
+directories: An array of strings. These are subdirectories under the root to
+ scan. If this member isn't present, all subdirectories are
+ scanned.
+include: Files to include. This is a DOS style regular expression - '.'
+ means '.', '*' means '.*' and '?' means '.'. The expression is
+ matched against the name of each regular file (not its full path,
+ and not directories), and it has to match for the file to be scanned.
+exclude: Files and directories to exclude. This is also a DOS style
+ regular expression. Matching files are excluded. Matching
+ directories aren't scanned at all (meaning they're not recursed
+ into, either.)
+
+-Steve Francis
+sfrancisx@yahoo.com
+
363 Token.java
@@ -0,0 +1,363 @@
+/*
+Copyright (c) 2012, Yahoo! Inc. All rights reserved.
+
+Redistribution and use of this software in source and binary forms,
+with or without modification, are permitted provided that the following
+conditions are met:
+
+* Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+* Neither the name of Yahoo! Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior
+ written permission of Yahoo! Inc.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+public class Token
+{
+ public int type; // token type
+ public String src; // the token's string in the source
+
+ // The token's value. For STRING & REGEXP tokens this is src without its first
+ // and last character; for every other type it is src itself.
+ public String value; // The token's value (for STRING & REGEXP tokens)
+ public int intValue; // The token's integer value (for NUMBER tokens)
+
+ // The constructor sets start to (offset - 1).
+ public int start; // Index of 1st character in the source
+ public int end; // Index of 1st character after the token in the source
+ public int length; // end - start
+
+ public int lineNum; // Line number of the token
+ public int charOnLine; // Column number of token start
+ public int lineStart; // Index of the start of the line containing the token
+
+ /**
+  * Creates a token.
+  *
+  * @param type       one of the token type constants of this class
+  * @param string     the token's text exactly as it appears in the source
+  * @param offset     source offset of the token; 'start' is set to offset - 1
+  * @param lineNum    line number of the token
+  * @param charOnLine column at which the token starts
+  * @param lineStart  source index of the start of the token's line
+  */
+ Token(int type, String string, int offset, int lineNum, int charOnLine, int lineStart)
+ {
+     this.type = type;
+     this.src = string;
+
+     this.start = offset - 1;
+     this.length = string.length();
+     this.end = this.start + this.length;
+
+     if (this.type == STRING || this.type == REGEXP)
+     {
+         // Strip the delimiters (quotes or slashes).
+         string = string.substring(1, string.length()-1);
+         if (this.type == STRING)
+         {
+             // TODO: Handle other escaped characters
+             string = string.replace("\\\'", "'");
+             string = string.replace("\\\"", "\"");
+             string = string.replace("\\\\", "\\");
+         }
+
+         this.value = string;
+     }
+     else
+         this.value = this.src;
+
+     this.value = this.value.intern();
+
+     if (this.type == NUMBER)
+     {
+         try
+         {
+             // Integer.parseInt replaces the deprecated new Integer(String);
+             // both parse a signed decimal integer.
+             this.intValue = Integer.parseInt(string);
+         }
+         catch (NumberFormatException ignored)
+         {
+             // Deliberately ignored: literals that are not plain decimal
+             // integers leave intValue at its default of 0.
+         }
+     }
+
+     this.lineNum = lineNum;
+     this.charOnLine = charOnLine;
+     this.lineStart = lineStart;
+ }
+
+// public static final int
+// CONTEXTUAL = 1,
+// RESERVED = 2,
+// LITERAL = 3;
+
+// public Token makeIdentifier(int reserved)
+// {
+// switch (type)
+// {
+// //case BREAK: case CASE: case CAST: case CATCH: case CLASS: case CONST:
+// //case CONTINUE: case DEBUGGER: case DEFAULT: case DELETE: case DO:
+// //case DYNAMIC: case ELSE: case FALSE: case FINAL: case FINALLY: case FOR:
+// //case FUNCTION: case IF: case IN: case INSTANCEOF: case INTERFACE:
+// //case IS: case LET: case LIKE: case NAMESPACE: case NATIVE: case NEW:
+// //case NULL: case OVERRIDE: case RETURN: case STATIC: case SUPER:
+// //case SWITCH: case THIS: case THROW: case TRUE: case TRY: case TYPE:
+// //case TYPEOF: case USE: case VAR: case VOID: case WHILE: case WITH:
+// //case YIELD: case PROTO:
+// case BREAK: case CASE: case CATCH:
+// case CONTINUE: case DEBUGGER: case DEFAULT: case DELETE: case DO:
+// case ELSE: case FALSE: case FINALLY: case FOR:
+// case FUNCTION: case IF: case IN: case INSTANCEOF:
+// case NEW:
+// case NULL: case RETURN:
+// case SWITCH: case THIS: case THROW: case TRUE: case TRY:
+// case TYPEOF: case VAR: case WHILE: case WITH:
+// if (reserved != RESERVED)
+// return null;
+
+//// case NAME: case EACH: case EXTENDS: case GENERATOR: case GET:
+//// case IMPLEMENTS: case SET: case STANDARD: case STRICT: case UNDEFINED:
+// case NAME:
+// case UNDEFINED:
+// return new Token(NAME, value_, offset, lineNum, charOnLine, lineStart);
+
+// case STRING:
+// if (reserved == LITERAL)
+// return new Token(NAME, value_, offset, lineNum, charOnLine, lineStart);
+
+// default:
+// return null;
+// }
+// }
+
+// public boolean isCompoundAssignment()
+// {
+// if (type == MULASSIGN || type == DIVASSIGN || type == MODASSIGN
+// || type == ADDASSIGN || type == SUBASSIGN || type == LSHASSIGN
+// || type == URSHASSIGN || type == RSHASSIGN || type == ANDASSIGN
+// || type == XORASSIGN || type == ORASSIGN || type == LOGANDASSIGN
+// || type == LOGORASSIGN)
+// return true;
+
+// return false;
+// }
+
+// public static String getTokenString(int type)
+// {
+// switch (type)
+// {
+// case SEMI:
+// return ";";
+// case OR:
+// return "||";
+// case AND:
+// return "&&";
+// case RP:
+// return ")";
+
+// default:
+// return "(unimplemented for token " + type + ")";
+// }
+// }
+
+ /**
+  * Creates the keyword token for 'name'.
+  *
+  * @param name       the identifier text to look up in the keyword table
+  * @param offset     source offset passed on to the Token constructor
+  * @param line       line number of the token
+  * @param charOnLine column at which the token starts
+  * @param lineStart  source index of the start of the token's line
+  * @return a token of the keyword's type, or null if 'name' is not a keyword
+  */
+ public static Token stringToKeyword(String name, int offset, int line, int charOnLine, int lineStart)
+ {
+     Integer keywordType = (Integer) keywords.get(name);
+
+     return (keywordType != null)
+         ? new Token(keywordType.intValue(), name, offset, line, charOnLine, lineStart)
+         : null;
+ }
+
+ // Token type codes, stored in Token.type.
+ //   1-3      literal-like tokens (NAME, NUMBER, STRING)
+ //   20-99    keywords (FIRST_KEYWORD .. LAST_KEYWORD); the commented-out
+ //            entries are keywords that are currently not assigned a code
+ //   107-155  punctuation and operators (REGEXP is also numbered in this range)
+ //   156-157  extensions
+ // All codes are non-zero.
+ public static final int
+ NAME = 1,
+ NUMBER = 2,
+ STRING = 3,
+
+ FIRST_KEYWORD = 20,
+
+ BREAK = 20,
+ CASE = 21,
+ CONTINUE = 22,
+ DEFAULT = 23,
+ DELETE = 24,
+ DO = 25,
+ ELSE = 26,
+ FALSE = 28,
+ FOR = 29,
+ FUNCTION = 30,
+ IF = 31,
+ IN = 32,
+ NEW = 33,
+ NULL = 34,
+ RETURN = 35,
+ SWITCH = 36,
+ THIS = 37,
+ TRUE = 38,
+ TYPEOF = 39,
+ VAR = 40,
+ VOID = 41,
+ WHILE = 42,
+ WITH = 43,
+
+ FIRST_JS2_KEYWORD = 50,
+
+// CAST = 50,
+ CATCH = 51,
+// CLASS = 52,
+// CONST = 53,
+ DEBUGGER = 54,
+// DYNAMIC = 55,
+// FINAL = 56,
+ FINALLY = 57,
+ INSTANCEOF = 58,
+// INTERFACE = 59,
+// IS = 60,
+// LET = 61,
+// LIKE = 62,
+// NAMESPACE = 63,
+// NATIVE = 64,
+// OVERRIDE = 65,
+// STATIC = 66,
+// SUPER = 67,
+ THROW = 68,
+ TRY = 69,
+// TYPE = 70,
+// USE = 71,
+// YIELD = 72,
+// PROTO = 73,
+
+ // JS2 contextual keywords
+// EACH = 74,
+// EXTENDS = 75,
+// GENERATOR = 76,
+// GET = 77,
+// IMPLEMENTS = 78,
+// SET = 79,
+// STANDARD = 80,
+// STRICT = 81,
+ UNDEFINED = 82,
+
+ LAST_KEYWORD = 99,
+
+ SEMI = 107, // ;
+ LB = 108, // [
+ RB = 109, // ]
+ LC = 110, // {
+ RC = 111, // }
+ LP = 112, // (
+ RP = 113, // )
+ COMMA = 114, // ,
+ HOOK = 115, // ?
+ COLON = 116, // :
+ DOT = 117, // .
+ OR = 118, // ||
+ ORASSIGN = 119, // ||=
+ BITOR = 120, // |
+ XORASSIGN = 121,
+ BITXOR = 122,
+ AND = 123,
+ ANDASSIGN = 124,
+ BITAND = 125,
+ SHEQ = 126, // ===
+ EQ = 127, // ==
+ ASSIGN = 128, // =
+ SHNE = 129, // !==
+ NE = 130, // !=
+ NOT = 131,
+ LSHASSIGN = 132,
+ LSH = 133, // <<
+ LE = 134, // <=
+ LT = 135, // <
+ URSHASSIGN = 136,
+ URSH = 137, // >>>
+ RSHASSIGN = 138,
+ RSH = 139, // >>
+ GE = 140, // >=
+ GT = 141, // >
+ MULASSIGN = 142,
+ MUL = 143, // *
+ REGEXP = 144,
+ DIVASSIGN = 145,
+ DIV = 146, // /
+ MODASSIGN = 147,
+ MOD = 148, // %
+ BITNOT = 149,
+ ADDASSIGN = 150, // +=
+ INC = 151, // ++
+ ADD = 152, // +
+ SUBASSIGN = 153, // -=
+ DEC = 154, // --
+ SUB = 155, // -
+
+ // Extensions
+ BS = 156, // \
+ COMMENT = 157;
+
+ // Keyword spelling -> token type (as an Integer); consulted by stringToKeyword().
+ private static java.util.Hashtable keywords;
+ static
+ {
+     // 'strings' and 'values' are parallel arrays: strings[i] is the spelling
+     // of the keyword whose token type is values[i]. Keep them in step.
+     String[] strings =
+     {
+         // JS1 keywords
+         "break", "case", "continue", "default", "delete",
+         "do", "else", "false", "for", "function",
+         "if", "in", "new", "null", "return", "switch",
+         "this", "true", "typeof", "var", "void", "while",
+         "with",
+
+         // JS2 keywords
+         //"cast", "catch", "class", "const", "debugger", "dynamic",
+         //"final", "finally", "instanceof", "interface", "is", "let",
+         //"like", "namespace", "native", "override", "static", "super",
+         //"throw", "try", "type", "use", "yield", "__proto__",
+
+         "catch", "debugger",
+         "finally", "instanceof",
+         "throw", "try",
+
+         // JS2 contextual keywords
+         //"each", "extends", "generator", "get", "implements", "set",
+         //"standard", "strict", "undefined"
+         "undefined"
+     };
+
+     int[] values =
+     {
+         // JS1
+         BREAK, CASE, CONTINUE, DEFAULT, DELETE,
+         DO, ELSE, FALSE, FOR, FUNCTION,
+         IF, IN, NEW, NULL, RETURN, SWITCH,
+         THIS, TRUE, TYPEOF, VAR, VOID, WHILE,
+         WITH,
+
+         // JS2
+         //CAST, CATCH, CLASS, CONST, DEBUGGER, DYNAMIC,
+         //FINAL, FINALLY, INSTANCEOF, INTERFACE, IS, LET,
+         //LIKE, NAMESPACE, NATIVE, OVERRIDE, STATIC, SUPER,
+         //THROW, TRY, TYPE, USE, YIELD, PROTO,
+         CATCH, DEBUGGER,
+         FINALLY, INSTANCEOF,
+         THROW, TRY,
+
+         // JS2 contextual keywords
+         //EACH, EXTENDS, GENERATOR, GET, IMPLEMENTS, SET,
+         //STANDARD, STRICT, UNDEFINED
+         UNDEFINED
+     };
+
+     keywords = new java.util.Hashtable(strings.length);
+     for (int i=0; i < strings.length; i++)
+     {
+         // Integer.valueOf replaces the deprecated new Integer(int).
+         keywords.put(strings[i], Integer.valueOf(values[i]));
+     }
+ }
+
+}
+
650 Tokenizer.java
@@ -0,0 +1,650 @@
+/*
+Copyright (c) 2012, Yahoo! Inc. All rights reserved.
+
+Redistribution and use of this software in source and binary forms,
+with or without modification, are permitted provided that the following
+conditions are met:
+
+* Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+* Neither the name of Yahoo! Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior
+ written permission of Yahoo! Inc.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+public class Tokenizer
+{
+ public int getOffset() { return in.offset; }
+
+ void init(char[] in, boolean extended, boolean getRegex)
+ {
+ this.in = new TokenizeReader(in);
+ this.extended = extended;
+ this.getRegex = getRegex;
+
+ if (getRegex)
+ flags = TSF_REGEXP;
+ }
+
+ public Tokenizer(char[] in, boolean extended, boolean getRegex)
+ {
+ init(in, extended, getRegex);
+ }
+
+ public Tokenizer(char[] in)
+ {
+ init(in, false, false);
+ }
+
+ public Tokenizer(String in)
+ {
+ init(in.toCharArray(), false, false);
+ }
+
+ public Tokenizer(String in, boolean extended, boolean getRegex)
+ {
+ init(in.toCharArray(), extended, getRegex);
+ }
+
+ public int TSF_REGEXP = 1;
+ public int flags = 0;//TSF_REGEXP;
+ public boolean extended;
+ public boolean getRegex;
+
+ Token lastToken;
+
+ public Token peekToken()
+ {
+ if (lastToken == null)
+ lastToken = getToken();
+
+ return lastToken;
+ }
+
+ public Token nextToken()
+ {
+ Token next = peekToken();
+ lastToken = null;
+ return next;
+ }
+
+ /*
+ *
+ */
+ public Token getToken()
+ {
+ Token t = internalGetToken();
+ if (t == null)
+ return t;
+
+ if (!getRegex &&
+ ((t.type == Token.NAME)
+ || (t.type == Token.NUMBER)
+ || (t.type == Token.RB)
+ || (t.type == Token.RP)
+ || (t.type == Token.DOT)))
+ flags = 0;
+ else
+ flags = TSF_REGEXP;
+
+ return t;
+ }
+
+ private int tokenStart;
+ private int charOnLineStart;
+
+ private Token internalGetToken()
+ {
+ int c;
+
+ do
+ {
+ in.mark(0);
+ c = in.read();
+ tokenStart = in.offset;
+ charOnLineStart = in.charOnLine;
+ } while (isJSSpace(c) || (c == '\n'));
+
+ if (c == EOF_CHAR)
+ return null;
+
+ int lineNum = in.line;
+
+ // Identifier?
+ if (Character.isJavaIdentifierStart((char)c))
+ {
+ while (Character.isJavaIdentifierPart((char)in.read()))
+ ;
+ in.unread();
+
+ String str = in.getString(0);
+ Token result;
+ if ((result = Token.stringToKeyword(str, tokenStart, in.line, charOnLineStart, in.lineStart)) != null)
+ return result;
+ return createToken(Token.NAME, str, lineNum);
+ }
+
+ // Number?
+ if (isDigit(c) || (c == '.' && isDigit(in.peek())))
+ {
+ int base = 10;
+
+ if (c == '0')
+ {
+ c = in.read();
+ if (c == 'x' || c == 'X')
+ {
+ c = in.read();
+ base = 16;
+ }
+ }
+
+ while (isXDigit(c))
+ {
+ if (base < 16 && isAlpha(c))
+ break;
+
+ c = in.read();
+ }
+
+ if (base == 10 && (c == '.' || c == 'e' || c == 'E'))
+ {
+ if (c == '.')
+ {
+ do
+ {
+ c = in.read();
+ } while (isDigit(c));
+ }
+
+ if (c == 'e' || c == 'E')
+ {
+ c = in.read();
+ if (c == '+' || c == '-')
+ {
+ c = in.read();
+ }
+
+ if (!isDigit(c))
+ return createToken(ERROR, "Missing exponent: " + in.getString(0), lineNum);
+
+ do
+ {
+ c = in.read();
+ } while (isDigit(c));
+ }
+ }
+
+ in.unread();
+ return createToken(Token.NUMBER, in.getString(0), lineNum);
+ }
+
+ // String?
+ if (c == '"' || c == '\'')
+ {
+ int q = c;
+
+ c = in.read();
+
+ while (c != q)
+ {
+ if (c == '\n' || c == EOF_CHAR)
+ {
+ in.unread();
+ return createToken(ERROR, "Unterminated string literal: " + in.getString(0), lineNum);
+ }
+
+ if (c == '\\')
+ in.read();
+
+ c = in.read();
+ }
+
+ return createToken(Token.STRING, in.getString(0), lineNum);
+ }
+
+ int t = getTokenType(c);
+ switch (t)
+ {
+ case RETRY:
+ return getToken();
+
+ default:
+ return createToken(t, in.getString(0), lineNum);
+ }
+ }
+
+ private Token createToken(int t, String v, int lineNum)
+ {
+ return new Token(t, v, tokenStart, lineNum, charOnLineStart, in.lineStart);
+ }
+
+ private int getTokenType(int c)
+ {
+ switch (c)
+ {
+ case ';': return Token.SEMI;
+ case '[': return Token.LB;
+ case ']': return Token.RB;
+ case '{': return Token.LC;
+ case '}': return Token.RC;
+ case '(': return Token.LP;
+ case ')': return Token.RP;
+ case ',': return Token.COMMA;
+ case '?': return Token.HOOK;
+
+
+ case '.':
+ return Token.DOT;
+
+ case ':':
+ return Token.COLON;
+
+ case '|':
+ if (in.match('|'))
+ return Token.OR;
+ if (in.match('='))
+ return Token.ORASSIGN;
+ return Token.BITOR;
+
+ case '^':
+ if (in.match('='))
+ return Token.XORASSIGN;
+ return Token.BITXOR;
+
+ case '&':
+ if (in.match('&'))
+ return Token.AND;
+ if (in.match('='))
+ return Token.ANDASSIGN;
+ return Token.BITAND;
+
+ case '=':
+ if (in.match('='))
+ {
+ if (in.match('='))
+ return Token.SHEQ;
+ return Token.EQ;
+ }
+
+ return Token.ASSIGN;
+
+ case '!':
+ if (in.match('='))
+ {
+ if (in.match('='))
+ return Token.SHNE;
+ return Token.NE;
+ }
+
+ return Token.NOT;
+
+ case '<':
+ if (in.match('<'))
+ {
+ if (in.match('='))
+ return Token.LSHASSIGN;
+
+ return Token.LSH;
+ }
+
+ if (in.match('='))
+ return Token.LE;
+
+ return Token.LT;
+
+ case '>':
+ if (in.match('>'))
+ {
+ if (in.match('>'))
+ {
+ if (in.match('='))
+ return Token.URSHASSIGN;
+
+ return Token.URSH;
+ }
+
+ if (in.match('='))
+ return Token.RSHASSIGN;
+
+ return Token.RSH;
+ }
+
+ if (in.match('='))
+ return Token.GE;
+
+ return Token.GT;
+
+ case '*':
+ if (in.match('='))
+ return Token.MULASSIGN;
+
+ return Token.MUL;
+
+ case '/':
+ in.mark(1);
+
+ // is it a // comment?
+ if (in.match('/'))
+ {
+ /* skip to end of line */
+ while ((c = in.read()) != EOF_CHAR && c != '\n')
+ ;
+
+ if (c != EOF_CHAR)
+ in.unread();
+
+ if (extended)
+ return Token.COMMENT;//System.out.println("Comment: " + in.getString(0));
+ return RETRY;
+ }
+
+ // Is it a /* comment?
+ if (in.match('*'))
+ {
+ while ((c = in.read()) != -1 && !(c == '*' && in.match('/')))
+ {
+ if (c == '/' && in.match('*'))
+ {
+ if (in.match('/'))
+ {
+ if (extended)
+ return Token.COMMENT;//System.out.println("Comment: " + in.getString(0));
+ return RETRY;
+ }
+
+ return ERR_NESTED_COMMENT;
+ }
+ }
+
+ if (c == EOF_CHAR)
+ return ERR_UNTERMINATED_COMMENT;
+
+ if (extended)
+ return Token.COMMENT;//System.out.println("Comment: " + in.getString(0));
+ return RETRY;
+ }
+
+ // is it a regexp?
+ if ((flags & TSF_REGEXP) != 0)
+ {
+ while ((c = in.read()) != '/')
+ {
+ if (c == '\n' || c == EOF_CHAR)
+ {
+ in.unread();
+ return ERR_UNTERMINATED_REGEXP;
+ }
+
+ if (c == '\\')
+ in.read();
+ }
+
+ while (true)
+ {
+ if (!in.match('g') && !in.match('i') && !in.match('m'))
+ break;
+ }
+
+ if (isAlpha(in.peek()))
+ return ERR_INVALID_REGEXP_FLAG;
+
+ return Token.REGEXP;
+ }
+
+ if (in.match('='))
+ return Token.DIVASSIGN;
+
+ return Token.DIV;
+
+ case '%':
+ if (in.match('='))
+ return Token.MODASSIGN;
+ return Token.MOD;
+
+ case '~':
+ return Token.BITNOT;
+
+ case '+':
+ if (in.match('='))
+ return Token.ADDASSIGN;
+ if (in.match('+'))
+ return Token.INC;
+ return Token.ADD;
+
+ case '-':
+ if (in.match('='))
+ return Token.SUBASSIGN;
+ if (in.match('-'))
+ return Token.DEC;
+ return Token.SUB;
+
+ case '\\':
+ if (extended)
+ return Token.BS;
+ break;
+ }
+
+ return ERR_UNKNOWN_TOKEN;
+ }
+
+
+ /*
+ *
+ */
+ private static boolean isJSIdentifier(String s)
+ {
+ int length = s.length();
+
+ if (length == 0 || !Character.isJavaIdentifierStart(s.charAt(0)))
+ return false;
+
+ for (int i=1; i<length; i++)
+ {
+ char c = s.charAt(i);
+ if (!Character.isJavaIdentifierPart(c))
+ {
+ if (c == '\\')
+ {
+ if (! ((i + 5) < length)
+ && (s.charAt(i + 1) == 'u')
+ && isXDigit(s.charAt(i + 2))
+ && isXDigit(s.charAt(i + 3))
+ && isXDigit(s.charAt(i + 4))
+ && isXDigit(s.charAt(i + 5)))
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ private static boolean isAlpha(int c)
+ {
+ return ((c >= 'a' && c <= 'z')
+ || (c >= 'A' && c <= 'Z'));
+ }
+
+ static boolean isDigit(int c)
+ {
+ return (c >= '0' && c <= '9');
+ }
+
+ static boolean isXDigit(int c)
+ {
+ return ((c >= '0' && c <= '9')
+ || (c >= 'a' && c <= 'f')
+ || (c >= 'A' && c <= 'F'));
+ }
+
+ private static boolean isJSSpace(int c)
+ {
+ return (c == '\u0020' || c == '\u0009'
+ || c == '\u000C' || c == '\u000B'
+ || c == '\u00A0'
+ || Character.getType((char)c) == Character.SPACE_SEPARATOR);
+ }
+
+ private static boolean isJSLineTerminator(int c)
+ {
+ return (c == '\n' || c == '\r'
+ || c == 0x2028 || c == 0x2029);
+ }
+
+ private TokenizeReader in;
+ private final static int EOF_CHAR = -1;
+
+ private static final int
+ ERROR = -1,
+ EOF = 0,
+ RETRY = 500,
+ ERR_UNTERM_COMMENT = 501,
+ ERR_NESTED_COMMENT = 502,
+ ERR_UNTERMINATED_COMMENT = 503,
+ ERR_UNTERMINATED_REGEXP = 504,
+ ERR_INVALID_REGEXP_FLAG = 505,
+ ERR_UNKNOWN_TOKEN = 506;
+}
+
+/*
+ *
+ */
+class TokenizeReader
+{
+ public TokenizeReader(char[] in)
+ {
+ this.in = in;
+ offset = 0;
+
+ line = 1;
+ charOnLine = 0;
+ lineStart = 0;
+ }
+
+ // Read the next character. For \r\n, return just \n.
+ public int read()
+ {
+ // mark 19 is used for unread
+ mark(19);
+
+ if (offset == in.length)
+ return -1;
+
+ if (in[offset] == '\r')
+ {
+ if ((offset + 1) < in.length)
+ {
+ if (in[offset + 1] == '\n')
+ offset++;
+ }
+ }
+
+ if (in[offset] == '\n')
+ {
+ line++;
+ charOnLine = 0;
+
+ offset++;
+ lineStart = offset;
+
+ return '\n';
+ }
+
+ if (in[offset] == '\t')
+ {
+ charOnLine = charOnLine/4;
+ charOnLine++;
+ charOnLine = charOnLine*4;
+// charOnLine += 4;
+ }
+ else
+ charOnLine++;
+
+ return in[offset++];
+ }
+
+ public void unread()
+ {
+ reset(19);
+ }
+
+ public boolean match(char c)
+ {
+ if (offset == in.length)
+ return false;
+
+ if (in[offset] == c)
+ {
+ offset++;
+ charOnLine++;
+ return true;
+ }
+
+ return false;
+ }
+
+ // Peek at the next character, without changing anything
+ public int peek()
+ {
+ if (offset == in.length)
+ return -1;
+ return in[offset];
+ }
+
+ public void mark(int num)
+ {
+ markOffset[num] = offset;
+ markLine[num] = line;
+ markCharOnLine[num] = charOnLine;
+ markLineStart[num] = lineStart;
+ }
+
+ public void reset(int num)
+ {
+ offset = markOffset[num];
+ line = markLine[num];
+ charOnLine = markCharOnLine[num];
+ lineStart = markLineStart[num];
+ }
+
+ public String getString(int num)
+ {
+ return new String(in, markOffset[num], offset - markOffset[num]);
+ }
+
+ // The buffer we're reading from
+ public char[] in;
+ public int offset;
+// private int prevOffset;
+
+ // Keep track of where we are for error reporting
+ public int line; // line number
+ public int charOnLine;
+ public int lineStart; // start of current line
+
+ // mark offsets. I only use 2, but it's cheap to allocate a bunch
+ private int[] markOffset = new int[20];
+ private int[] markLine = new int[20];
+ private int[] markCharOnLine = new int[20];
+ private int[] markLineStart = new int[20];
+}
317 Utils.java
@@ -0,0 +1,317 @@
+/*
+Copyright (c) 2012, Yahoo! Inc. All rights reserved.
+
+Redistribution and use of this software in source and binary forms,
+with or without modification, are permitted provided that the following
+conditions are met:
+
+* Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the
+ following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+* Neither the name of Yahoo! Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior
+ written permission of Yahoo! Inc.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+import java.io.*;
+import java.util.*;
+
+class Utils
+{
+ ///////////////////////////////////////////////////////////////////////////
+ public static String getPath(String root, String path)
+ throws IOException
+ {
+ File f;
+
+ if (path == null)
+ return null;
+
+ if (path.indexOf('/') == 0)
+ f = new File(path);
+ else if (root.lastIndexOf('/') != root.length()-1)
+ f = new File(root + "/" + path);
+ else
+ f = new File(root + path);
+
+ //debug(f.getCanonicalPath());
+
+ return f.getCanonicalPath();
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static int parseObject(Token[] tokens, int first, int last, Hashtable obj)
+ {
+ Token t = tokens[first];
+ if (t.type != Token.LC)
+ error("error");
+
+ int l2 = findCloser(tokens, first);
+ if (l2 > last)
+ error("Expected } before line %d", tokens[last-1].lineNum);
+ last = l2;
+
+ first++;
+ t = tokens[first++];
+
+ while (first < last)
+ {
+ if (t.type != Token.NAME && t.type != Token.STRING)
+ error("Expected name on line %d", t.lineNum);
+
+ String name = t.value;
+
+ t = tokens[first++];
+
+ if (t.type != Token.COLON)
+ error("Expected ':' on line %d", t.lineNum);
+
+ t = tokens[first];
+ switch (t.type)
+ {
+ case Token.STRING:
+ case Token.TRUE:
+ case Token.FALSE:
+ case Token.NUMBER:
+ first++;
+ obj.put(name, t.value);
+ break;
+
+ case Token.LB:
+ ArrayList a = new ArrayList();
+ first = parseArray(tokens, first, last, a);
+ obj.put(name, a);
+ break;
+
+ case Token.LC:
+ Hashtable o = new Hashtable();
+ first = parseObject(tokens, first, last, o);
+ obj.put(name, o);
+ break;
+
+ case Token.FUNCTION:
+ while (t.type != Token.LC)
+ t = tokens[++first];
+ first = findCloser(tokens, first);
+
+ debug("found function in object on line " + t.lineNum);
+ break;
+ }
+
+ t = tokens[first++];
+ if (t.type == Token.RC)
+ {
+ if (first != last)
+ error("Found unexpected } on line %d", t.lineNum);
+ return first;
+ }
+
+ if (t.type != Token.COMMA)
+ error("Expected comma on line %d", t.lineNum);
+
+ t = tokens[first++];
+ }
+
+ return first;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ public static int parseArray(Token[] tokens, int first, int last, ArrayList arr)
+ {
+ Token t = tokens[first];
+ if (t.type != Token.LB)
+ error("error");
+
+ int l2 = findCloser(tokens, first);
+ if (l2 > last)
+ error("Expected ] before line %d", tokens[last-1].lineNum);
+ last = l2;
+ first++;
+
+ while (first < last)
+ {
+ t = tokens[first];
+ switch (t.type)
+ {
+ case Token.STRING:
+ case Token.TRUE:
+ case Token.FALSE:
+ first++;
+ arr.add(t.value);
+ break;
+
+ case Token.LB:
+ ArrayList a = new ArrayList();
+ first = parseArray(tokens, first, last, a);
+ arr.add(a);
+ break;
+
+ case Token.LC:
+ Hashtable o = new Hashtable();
+ first = parseObject(tokens, first, last, o);
+ arr.add(o);
+ break;
+ }
+
+ t = tokens[first++];
+ if (t.type == Token.RB)
+ {
+ if (first != last)
+ error("Found unexpected ] on line %d", t.lineNum);
+ return first;
+ }
+
+ if (t.type != Token.COMMA)
+ error("Expected comma on line %d", t.lineNum);
+ }
+
+ error("] not found");
+ return 0;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ //parseValue(Token[] tokens, int first, int last, ArrayList arr)
+ //{
+ //}
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static int findCloser(Token[] tokens, int first)
+ {
+ int nesting = 1;
+ int searchFor = 0;
+ Token opener = tokens[first++];
+
+ if (opener.type == Token.LP)
+ searchFor = Token.RP;
+ else if (opener.type == Token.LC)
+ searchFor = Token.RC;
+ else if (opener.type == Token.LB)
+ searchFor = Token.RB;
+ else
+ return first;
+
+ while (first < tokens.length)
+ {
+ Token t = tokens[first++];
+ if (t.type == searchFor)
+ nesting--;
+ if (t.type == opener.type)
+ nesting++;
+
+ if (nesting == 0)
+ return first;
+ }
+
+ error("Unmatched %s on line %d", opener.value, opener.lineNum);
+ return 0;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static Token[] tokenize(String src)
+ {
+ Tokenizer tz = new Tokenizer(src);
+
+ Token t;
+
+ ArrayList tokens = new ArrayList(10000);
+ while ((t = tz.getToken()) != null)
+ tokens.add(t);
+
+ Token[] tokenArray = new Token[tokens.size()];
+ for (int i = 0; i < tokens.size(); i++)
+ tokenArray[i] = (Token)tokens.get(i);
+
+ return tokenArray;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static Token[] getTokens(String filename)
+ throws Throwable
+ {
+ int len = (int)(new File(filename)).length();
+ if (len == 0)
+ return null;
+
+ char[] src = new char[len];
+ FileReader fr = new FileReader(filename);
+ len = fr.read(src, 0, len);
+ fr.close();
+
+ // The file length (from file.length()) may not match the number
+ // of characters read - the file length is in bytes, but the file
+ // can use UTF-8 encoding, meaning there can be fewer characters
+ // than bytes in the file. This line re-allocates the src
+ // array so it will only include file data, with no left over space.
+ return tokenize(new String(src, 0, len));
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static void message(String name, Object... args)
+ {
+ String msg = (String)messages.get(name);
+ if (msg == null)
+ msg = name;
+
+ System.out.println(String.format(msg, args));
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static void debug(String name, Object... args)
+ {
+ message(name, args);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static void warn(String name, Object... args)
+ {
+ message(name, args);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ public static void error(String name, Object... args)
+ {
+ message(name, args);
+ System.exit(0);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ private static Hashtable messages;
+
+ static
+ {
+ String[] msgs =
+ {
+ "Read %d tokens",
+ "Unable to read config file 'compactor.cfg'"
+ };
+
+ String[] names =
+ {
+ "configTokens",
+ "noConfigFile"
+ };
+
+ messages = new Hashtable(msgs.length);
+ for (int i = 0; i < msgs.length; i++)
+ messages.put(names[i], msgs[i]);
+ }
+}
4 dupfind
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+java -Xmx512m -jar /home/y/bin/dupfind.jar "$@"
+
73 dupfind.cfg
@@ -0,0 +1,73 @@
+{
+ min: 30,
+ max: 500,
+ increment: 10,
+ fuzzy: true,
+
+ sources:
+ [
+ {
+ // Name of this collection of sources
+ name: "neo",
+
+ // Whether or not it's part of the default set
+ def: true,
+
+ // Source directory
+ root: "/home/sfrancis/dev/yahoo/ymail/neo/src",
+
+ // List of directories under the root.
+ directories:
+ [
+ "common",
+ "comms",
+ "contacts",
+ "galaxy",
+ "launch",
+ "mail",
+ "mods",
+ "om",
+ "templates",
+ "intl"
+ ],
+
+ // Files to include. Regexes accepted
+ include:
+ [
+ //"*.mu",
+ //"*.inc",
+ "*.js"
+ ],
+
+ // Files to exclude. Regexes accepted
+ exclude:
+ [
+ "*/.svn",
+ "*/src/launch/mock*",
+ "*/mail/ui/*",
+ "*/lang_*.js",
+ "*/mods/neo.js"
+ ]
+ },
+ {
+ name: "yui",
+
+ root: "/home/y/share/htdocs/yui3/build",
+
+ include:
+ [
+ "*.js"
+ ],
+
+ exclude:
+ [
+ "*/.svn",
+ "*simpleyui.js",
+ "*-[^/]*.js",
+ "*selector.js",
+ "*yui.js",
+ "*datatype*"
+ ]
+ }
+ ]
+}
2 dupfind.inf
@@ -0,0 +1,2 @@
+Main-Class: DupFind
+
BIN dupfind.jar
Binary file not shown.

0 comments on commit ddeac87

Please sign in to comment.
Something went wrong with that request. Please try again.