Permalink
Browse files

First check-in of the new, easier-to-modify HTMLPageParser

  • Loading branch information...
1 parent 4d1f8a4 commit b1af2a227c29b22095d0ca90d816f54c11d610a4 @joewalnes joewalnes committed Sep 20, 2004
View
@@ -45,3 +45,6 @@ blank.dir = ${etc.dir}/blank
Name = OpenSymphony SiteMesh
name = sitemesh
version = 2.1
+
+# Path to BYacc/J parser generator binary. Available from: http://byaccj.sourceforge.net/
+yacc.exe = ${lib.dir}/yacc
View
@@ -10,7 +10,49 @@
</fileset>
</path>
- <target name="compile">
+ <!-- Runs before "compile": regenerates the generated Lexer.java and Parser.java sources when their grammar files are newer. -->
+ <target name="pre-compile">
+
+ <!-- Set lexer.uptodate when Lexer.java is newer than lexer.flex; the "jflex" target called below skips itself when that property is set. -->
+ <uptodate
+ property="lexer.uptodate"
+ srcfile="${java.dir}/com/opensymphony/module/sitemesh/parser/html/lexer.flex"
+ targetfile="${java.dir}/com/opensymphony/module/sitemesh/parser/html/Lexer.java"/>
+ <antcall target="jflex"/>
+
+ <!-- Set parser.uptodate when Parser.java is newer than parser.yacc; the "yacc" target called below skips itself when that property is set. -->
+ <uptodate
+ property="parser.uptodate"
+ srcfile="${java.dir}/com/opensymphony/module/sitemesh/parser/html/parser.yacc"
+ targetfile="${java.dir}/com/opensymphony/module/sitemesh/parser/html/Parser.java"/>
+ <antcall target="yacc"/>
+
+ </target>
+
+ <!-- Generates the HTML lexer from lexer.flex by running JFlex.Main from ${lib.dir}/jflex.jar in a forked JVM. Skipped when lexer.uptodate is set. -->
+ <target name="jflex" unless="lexer.uptodate">
+ <echo message="Generating HTML lexer using JFlex"/>
+ <java classpath="${lib.dir}/jflex.jar" classname="JFlex.Main" fork="yes">
+ <!-- "-d" is followed by the directory argument: the same package directory that holds lexer.flex. -->
+ <arg value="-d"/>
+ <arg value="${java.dir}/com/opensymphony/module/sitemesh/parser/html"/>
+ <arg value="${java.dir}/com/opensymphony/module/sitemesh/parser/html/lexer.flex"/>
+ </java>
+ <!-- Remove Lexer.java~ from the source tree; presumably a backup of the previous output left by JFlex (TODO confirm). -->
+ <delete file="${java.dir}/com/opensymphony/module/sitemesh/parser/html/Lexer.java~"/>
+ </target>
+
+ <!-- Generates the HTML parser from parser.yacc by running the BYacc/J binary named by ${yacc.exe} inside the parser package directory. Skipped when parser.uptodate is set; the build fails if the tool fails (failonerror). -->
+ <target name="yacc" unless="parser.uptodate">
+ <echo message="Generating HTML parser using BYacc/J"/>
+ <exec executable="${yacc.exe}" dir="${java.dir}/com/opensymphony/module/sitemesh/parser/html" failonerror="true">
+ <arg value="-Jnorun"/>
+ <arg value="-Jnoconstruct"/>
+ <arg value="-Jclass=Parser"/>
+ <arg value="-Jextends=Lexer"/>
+ <!-- NOTE(review): -Jsemantic is passed twice with different types (String, then Value); presumably only one takes effect. Confirm against the BYacc/J documentation and drop the redundant argument. -->
+ <arg value="-Jsemantic=String"/>
+ <arg value="-Jsemantic=Value"/>
+ <arg value="-Jpackage=com.opensymphony.module.sitemesh.parser.html"/>
+ <!-- Relative to the "dir" attribute of the exec task above. -->
+ <arg value="parser.yacc"/>
+ </exec>
+ </target>
+
+ <target name="compile" depends="pre-compile">
<mkdir dir="${build.dir}/classes"/>
<javac destdir="${build.dir}/classes"
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
@@ -30,7 +30,7 @@
* add all the required information.</p>
*
* @author <a href="joe@truemesh.com">Joe Walnes</a>
- * @version $Revision: 1.3 $
+ * @version $Revision: 1.4 $
*
* @see com.opensymphony.module.sitemesh.Page
*/
@@ -42,7 +42,7 @@
private Map properties = new HashMap();
/** Data of page contents. */
- char[] pageData = new char[0];
+ protected char[] pageData = new char[0];
/** RequestURI of original Page. */
private HttpServletRequest request;
@@ -0,0 +1,50 @@
+package com.opensymphony.module.sitemesh.parser.html;
+
+import com.opensymphony.module.sitemesh.parser.AbstractHTMLPage;
+
+import java.io.Writer;
+import java.io.StringWriter;
+import java.io.IOException;
+
+/**
+ * Page implementation filled in by the parser in this package: callers append
+ * fragments of the head and body sections as they are discovered, and the
+ * accumulated text can later be written out or, for the head, returned as a String.
+ */
+class HTMLPage extends AbstractHTMLPage {
+
+    // Declared as StringWriter (not Writer) because StringWriter.write(String)
+    // throws no IOException, so appending needs no exception handling at all.
+
+    /** Accumulated contents of the head section. */
+    private final StringWriter head = new StringWriter();
+
+    /** Accumulated contents of the body section. */
+    private final StringWriter body = new StringWriter();
+
+    /**
+     * Creates a page backed by the given characters.
+     *
+     * @param original complete original page data; stored by reference, not copied
+     */
+    public HTMLPage(char[] original) {
+        this.pageData = original;
+    }
+
+    /**
+     * Appends a fragment to the head section.
+     *
+     * @param s text to append
+     */
+    public void appendToHead(String s) {
+        head.write(s);
+    }
+
+    /**
+     * Appends a fragment to the body section.
+     *
+     * @param s text to append
+     */
+    public void appendToBody(String s) {
+        body.write(s);
+    }
+
+    /**
+     * Writes everything appended to the head so far.
+     *
+     * @param out destination writer
+     * @throws IOException if writing to <code>out</code> fails
+     */
+    public void writeHead(Writer out) throws IOException {
+        out.write(head.toString());
+    }
+
+    /**
+     * Writes everything appended to the body so far.
+     *
+     * @param out destination writer
+     * @throws IOException if writing to <code>out</code> fails
+     */
+    public void writeBody(Writer out) throws IOException {
+        out.write(body.toString());
+    }
+
+    /**
+     * Returns everything appended to the head so far; this is the same text
+     * that {@link #writeHead(Writer)} writes.
+     *
+     * @return the accumulated head contents, never <code>null</code>
+     */
+    public String getHead() {
+        return head.toString();
+    }
+
+    /**
+     * Always reports <code>false</code>.
+     * NOTE(review): nothing in this class records frameset information, so this
+     * looks like a placeholder; confirm whether frameset detection is expected.
+     *
+     * @return <code>false</code>
+     */
+    public boolean isFrameSet() {
+        return false;
+    }
+
+}
@@ -0,0 +1,88 @@
+package com.opensymphony.module.sitemesh.parser.html;
+
+import com.opensymphony.module.sitemesh.Page;
+import com.opensymphony.module.sitemesh.PageParser;
+import com.opensymphony.module.sitemesh.parser.AbstractHTMLPage;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.io.StringWriter;
+
+/**
+ * {@link PageParser} that builds an {@link HTMLPage} by running the document
+ * through {@link HTMLTagTokenizer} and reacting to the tag and text tokens it reports.
+ */
+public class HTMLPageParser implements PageParser {
+
+    /**
+     * Parses a document into a page with head text, body text and properties.
+     *
+     * <p>Properties registered: <code>title</code>, <code>meta.NAME</code>,
+     * <code>meta.http-equiv.NAME</code>, <code>body.ATTR</code>, each attribute of
+     * the <code>html</code> tag under its own name, <code>page.NAME</code> for
+     * <code>parameter</code> tags and <code>page.ID</code> for <code>content</code> blocks.</p>
+     *
+     * @param data characters of the page to parse
+     * @return the populated page
+     * @throws IOException declared by the signature; no statement in this method throws it directly
+     */
+    public Page parse(char[] data) throws IOException {
+        final HTMLPage result = new HTMLPage(data);
+        // Make sure a "title" property exists even when the document has no <title>.
+        result.addProperty("title", "");
+        HTMLTagTokenizer tokenizer = new HTMLTagTokenizer(data);
+        tokenizer.start(new TokenHandler() {
+
+            /** True between an opening and a closing title tag. */
+            private boolean inTitle;
+            /** True between an opening and a closing head tag. */
+            private boolean inHead;
+            /** Value of the "tag" attribute of the enclosing content block, or null when outside one. */
+            private String contentBlockId;
+            /** True once the first text found inside a title has been stored. */
+            private boolean titleWritten;
+            /** True once the closing body tag has been seen; nothing is appended to the body afterwards. */
+            private boolean bodyWritten;
+
+            public void tag(Tag tag) {
+                // Fixed locale: with the default locale, "TITLE" lower-cases to
+                // "t\u0131tle" on a Turkish system and would never match below.
+                String name = tag.getName().toLowerCase(java.util.Locale.ENGLISH);
+                if (name.equals("title")) {
+                    inTitle = tag.getType() == Tag.OPEN;
+                } else if (name.equals("head")) {
+                    inHead = tag.getType() == Tag.OPEN;
+                } else if (name.equals("content")) {
+                    if (tag.getType() == Tag.OPEN) {
+                        contentBlockId = tag.getAttributeValue("tag");
+                    } else {
+                        contentBlockId = null;
+                    }
+                } else if (name.equals("meta")) {
+                    String metaName = tag.getAttributeValue("name");
+                    String httpEquiv = tag.getAttributeValue("http-equiv");
+                    String metaContent = tag.getAttributeValue("content");
+                    if (metaName != null) {
+                        result.addProperty("meta." + metaName, metaContent);
+                    } else if (httpEquiv != null) {
+                        result.addProperty("meta.http-equiv." + httpEquiv, metaContent);
+                    }
+                    // A meta tag with neither attribute (e.g. a closing tag) is not
+                    // registered; previously it produced a bogus "meta.null" property.
+                } else if (name.equals("body")) {
+                    if (tag.getType() == Tag.OPEN || tag.getType() == Tag.EMPTY) {
+                        for (int i = 0; i < tag.getAttributeCount(); i++) {
+                            result.addProperty("body." + tag.getAttributeName(i), tag.getAttributeValue(i));
+                        }
+                    } else {
+                        bodyWritten = true;
+                    }
+                } else if (name.equals("html")) {
+                    for (int i = 0; i < tag.getAttributeCount(); i++) {
+                        result.addProperty(tag.getAttributeName(i), tag.getAttributeValue(i));
+                    }
+                } else if (name.equals("parameter")) {
+                    String parameterName = tag.getAttributeValue("name");
+                    // Skip nameless parameters instead of registering "page.null".
+                    if (parameterName != null) {
+                        result.addProperty("page." + parameterName, tag.getAttributeValue("value"));
+                    }
+                }
+
+                // Every tag inside the head, other than head/title themselves, is kept verbatim.
+                if (inHead && !name.equals("head") && !name.equals("title")) {
+                    result.appendToHead(tag.getCompleteTag());
+                }
+                // Outside the head and before the closing body tag, every tag except the
+                // structural and SiteMesh-specific ones is kept verbatim as body markup.
+                if (!inHead && !bodyWritten && !name.equals("body") && !name.equals("html")
+                        && !name.equals("head") && !name.equals("title") && !name.equals("parameter") && !name.equals("content")) {
+                    result.appendToBody(tag.getCompleteTag());
+                }
+            }
+
+            public void text(Text text) {
+                if (inTitle && !titleWritten) {
+                    result.addProperty("title", text.getText());
+                    titleWritten = true;
+                } else if (contentBlockId != null) {
+                    // NOTE(review): each text token replaces the previous value of this
+                    // property, and tags nested inside a content block are appended to the
+                    // body by tag() above; confirm whether whole-block capture is intended.
+                    result.addProperty("page." + contentBlockId, text.getText());
+                }
+
+                if (inHead && !inTitle) {
+                    result.appendToHead(text.getText());
+                }
+                if (!inHead && !inTitle && !bodyWritten && contentBlockId == null) {
+                    result.appendToBody(text.getText());
+                }
+            }
+        });
+        return result;
+    }
+
+
+}
@@ -0,0 +1,117 @@
+package com.opensymphony.module.sitemesh.parser.html;
+
+import java.io.CharArrayReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits a character array into tag and text tokens and reports each one to a
+ * {@link TokenHandler}.
+ *
+ * <p>The tokenizer itself is the {@link Tag} / {@link Text} object passed to the
+ * handler: its getters expose the token currently being reported, and that state
+ * is cleared as soon as the callback returns. Handlers must therefore copy any
+ * value they need rather than keep the reference.</p>
+ *
+ * <p>The <code>parsedXxx</code> methods are the callbacks through which the
+ * {@link Parser} created in {@link #start(TokenHandler)} reports what it found.</p>
+ */
+public class HTMLTagTokenizer implements Tag, Text {
+
+    /** Characters being tokenized; offsets passed to the callbacks index into this array. */
+    private final char[] input;
+
+    /** Receiver of the tokens for the parse in progress. */
+    private TokenHandler handler;
+
+    // State of the token currently being reported to the handler.
+    private int currentType;
+    private int currentStart;
+    private int currentEnd;
+    private String currentName;
+    private String currentText;
+    /** Attributes collected for the tag being built; null when the tag has none. */
+    private List currentAttributes = new ArrayList();
+
+    /**
+     * @param input characters to tokenize; stored by reference, not copied
+     */
+    public HTMLTagTokenizer(char[] input) {
+        this.input = input;
+    }
+
+    /**
+     * @param input text to tokenize
+     */
+    public HTMLTagTokenizer(String input) {
+        this(input.toCharArray());
+    }
+
+    /**
+     * Parses the whole input, reporting every token to the given handler.
+     * Synchronized, so only one parse runs on a given tokenizer at a time.
+     *
+     * @param handler receiver of the tag and text callbacks
+     */
+    public synchronized void start(TokenHandler handler) {
+        this.handler = handler;
+        Parser parser = new Parser(this, new CharArrayReader(input));
+        parser.yyparse();
+    }
+
+    /**
+     * @return the exact input characters spanned by the current tag
+     */
+    public String getCompleteTag() {
+        return new String(input, currentStart, currentEnd - currentStart);
+    }
+
+    /**
+     * @return name of the current tag, as reported by the parser
+     */
+    public String getName() {
+        return currentName;
+    }
+
+    /**
+     * @return type of the current tag, as reported by the parser
+     */
+    public int getType() {
+        return currentType;
+    }
+
+    /**
+     * @return contents of the current text token
+     */
+    public String getText() {
+        return currentText;
+    }
+
+    /**
+     * @return number of attributes on the current tag; 0 when it has none
+     */
+    public int getAttributeCount() {
+        return currentAttributes == null ? 0 : currentAttributes.size();
+    }
+
+    /**
+     * @param index position of the attribute, from 0 to {@link #getAttributeCount()} - 1
+     * @return name of the attribute at that position
+     */
+    public String getAttributeName(int index) {
+        return ((Attribute) currentAttributes.get(index)).name;
+    }
+
+    /**
+     * @param index position of the attribute, from 0 to {@link #getAttributeCount()} - 1
+     * @return value of the attribute at that position
+     */
+    public String getAttributeValue(int index) {
+        return ((Attribute) currentAttributes.get(index)).value;
+    }
+
+    /**
+     * Looks up an attribute of the current tag by name, ignoring case.
+     *
+     * @param name attribute name to look for
+     * @return value of the first matching attribute, or <code>null</code> if there is none
+     */
+    public String getAttributeValue(String name) {
+        // todo: optimize
+        for (int i = 0; i < getAttributeCount(); i++) {
+            if (getAttributeName(i).equalsIgnoreCase(name)) {
+                return getAttributeValue(i);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Parser callback: a complete tag was recognized. Any attributes reported
+     * through {@link #parsedAttribute(String, String, boolean)} since the previous
+     * tag belong to this one.
+     *
+     * @param type  tag type constant
+     * @param name  tag name
+     * @param start offset of the first character of the tag in the input
+     * @param end   offset just past the last character of the tag in the input
+     */
+    public void parsedTag(int type, String name, int start, int end) {
+        this.currentType = type;
+        this.currentName = name;
+        this.currentStart = start;
+        this.currentEnd = end;
+        try {
+            handler.tag((Tag)this);
+        } finally {
+            // Reset even when the handler throws, so no stale tag state
+            // (in particular the attribute list) leaks into the next token.
+            this.currentAttributes = null;
+            this.currentName = null;
+            this.currentType = Tag.UNKNOWN;
+            this.currentStart = 0;
+            this.currentEnd = 0;
+        }
+    }
+
+    /**
+     * Parser callback: a run of text was recognized.
+     *
+     * @param text the text
+     */
+    public void parsedText(String text) {
+        this.currentText = text;
+        try {
+            handler.text((Text)this);
+        } finally {
+            this.currentText = null;
+        }
+    }
+
+    /**
+     * Parser callback: a run of text was recognized, given as offsets into the input.
+     *
+     * @param start offset of the first character of the text
+     * @param end   offset just past the last character of the text
+     */
+    public void parsedText(int start, int end) {
+        this.currentText = new String(input, start, end - start);
+        try {
+            handler.text((Text)this);
+        } finally {
+            this.currentText = null;
+        }
+    }
+
+    /**
+     * Parser callback: an attribute of the tag currently being built was recognized.
+     *
+     * @param name   attribute name
+     * @param value  attribute value, including its surrounding quote characters when <code>quoted</code> is true
+     * @param quoted whether <code>value</code> is wrapped in quote characters that must be stripped
+     */
+    public void parsedAttribute(String name, String value, boolean quoted) {
+        // TODO: optimize this... most attributes are ignored, so only bother initializing a heavy Map when
+        // absolutely positively necessary.
+        if (currentAttributes == null) {
+            currentAttributes = new ArrayList();
+        }
+        Attribute attribute = new Attribute();
+        attribute.name = name;
+        // Strip the quotes only when both can actually be present; a null or
+        // one-character "quoted" value used to raise an exception here.
+        if (quoted && value != null && value.length() >= 2) {
+            attribute.value = value.substring(1, value.length() - 1);
+        } else {
+            attribute.value = value;
+        }
+        currentAttributes.add(attribute);
+    }
+
+    /** Name/value pair of a single parsed attribute. */
+    private static class Attribute {
+        String name;
+        String value;
+    }
+}
Oops, something went wrong.

0 comments on commit b1af2a2

Please sign in to comment.