update: 使用自行实现的元素解析方法，而不再使用 html.Parse

satori-protocol-go · Apr 12, 2024 · 63fa124 · 63fa124
1 parent 086f66e
commit 63fa124
Show file tree

Hide file tree

Showing 2 changed files with 226 additions and 2 deletions.
diff --git a/pkg/message/parser.go b/pkg/message/parser.go
@@ -1,7 +1,6 @@
 package message
 
 import (
-	"bytes"
 	"fmt"
 	"strings"
 
@@ -89,7 +88,7 @@ func parseHtmlChildrenNode(n *html.Node, callback func(e MessageElement)) error
 }
 
 func Parse(source string) ([]MessageElement, error) {
-	doc, _ := html.Parse(bytes.NewReader([]byte(source)))
+	doc := xhtmlParse(source)
 	var result []MessageElement
 	err := parseHtmlNode(doc, func(e MessageElement) {
 		if e != nil {

diff --git a/pkg/message/xhtml.go b/pkg/message/xhtml.go
@@ -0,0 +1,225 @@
+package message
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+var (
+	// tagPat matches either an HTML comment (group 1) or a tag (group 2). For a
+	// tag, group 3 is the "/" that opens a closing tag, group 4 the tag name,
+	// group 5 the raw text between the name and the end of the tag, and group 6
+	// the "/" that ends a self-closing tag.
+	tagPat  = regexp.MustCompile(`(<!--[\s\S]*?-->)|(<(/?)([^!\s>/]*)([^>]*?)\s*(/?)>)`)
+	// attrPat matches a single attribute: group 1 is the key, and the optional
+	// value is captured by group 2 (double-quoted) or group 3 (single-quoted).
+	attrPat = regexp.MustCompile(`([^\s=]+)(?:="([^"]*)"|='([^']*)')?`)
+)
+
+// escape rewrites the markup-significant characters of text as entity
+// references: "&" becomes "&amp;", "<" becomes "&lt;" and ">" becomes "&gt;".
+// When inLine is true, double quotes are additionally rewritten as "&quot;".
+func escape(text string, inLine bool) string {
+	pairs := []string{"&", "&amp;", "<", "&lt;", ">", "&gt;"}
+	if inLine {
+		pairs = append(pairs, "\"", "&quot;")
+	}
+	// Every pattern is a single character and no replacement is rescanned, so
+	// one pass gives the same result as replacing "&" first and the rest after.
+	return strings.NewReplacer(pairs...).Replace(text)
+}
+
+// Patterns used by unescape, compiled once instead of on every call.
+var (
+	decCharRefPat = regexp.MustCompile(`&#(\d+);`)
+	hexCharRefPat = regexp.MustCompile(`(?i)&#x([0-9a-f]+);`)
+	ampRefPat     = regexp.MustCompile("&(amp|#38|#x26);")
+)
+
+// unescape reverses escape and additionally decodes numeric character
+// references. The steps run in this order:
+//
+//  1. "&lt;", "&gt;" and "&quot;" become "<", ">" and "\"".
+//  2. Decimal ("&#65;") and hexadecimal ("&#x41;", "&#X41;") references become
+//     the character they name.
+//  3. "&amp;", "&#38;" and "&#x26;" become "&".
+//
+// The ampersand is decoded last, and its numeric forms are skipped in step 2,
+// so that text such as "&amp;lt;" ends up as "&lt;" rather than being
+// unescaped twice.
+func unescape(text string) string {
+	text = strings.Replace(text, "&lt;", "<", -1)
+	text = strings.Replace(text, "&gt;", ">", -1)
+	text = strings.Replace(text, "&quot;", "\"", -1)
+
+	text = decCharRefPat.ReplaceAllStringFunc(text, func(ref string) string {
+		return decodeCharRef(ref, ref[len("&#"):len(ref)-1], 10)
+	})
+	text = hexCharRefPat.ReplaceAllStringFunc(text, func(ref string) string {
+		return decodeCharRef(ref, ref[len("&#x"):len(ref)-1], 16)
+	})
+
+	return ampRefPat.ReplaceAllString(text, "&")
+}
+
+// decodeCharRef returns the character named by digits, interpreted in the
+// given base. ref is the complete reference and is returned unchanged when it
+// names the ampersand (left for the final step of unescape), when the number
+// does not fit in 32 bits, or when it is not a valid Unicode code point.
+func decodeCharRef(ref, digits string, base int) string {
+	if (base == 10 && digits == "38") || (base == 16 && digits == "26") {
+		return ref
+	}
+	code, err := strconv.ParseInt(digits, base, 32)
+	if err != nil || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF) {
+		return ref
+	}
+	return fmt.Sprintf("%c", rune(code))
+}
+
+// Token pairs an html.Token with the raw, not yet parsed attribute text of the
+// tag it was created from.
+type Token struct {
+	*html.Token
+	// extra is the text that followed the tag name inside the tag;
+	// parseAttributes converts it into html.Attribute values.
+	extra string
+}
+
+// parseAttributes converts the raw attribute text in t.extra into attributes,
+// stores them in t.Attr and returns them. It returns nil, leaving t.Attr
+// untouched, when there is no attribute text at all.
+//
+// Each attribute is handled as follows:
+//   - a non-empty quoted value is unescaped and used as is;
+//   - a key without a value (or with an empty one) that starts with "no-"
+//     yields the key without that prefix and the value "false";
+//   - any other key without a value yields the value "true".
+//
+// t.extra is only read, so calling the method again gives the same result.
+func (t *Token) parseAttributes() []html.Attribute {
+	if t.extra == "" {
+		return nil
+	}
+
+	found := attrPat.FindAllStringSubmatch(t.extra, -1)
+	attrs := make([]html.Attribute, 0, len(found))
+	for _, m := range found {
+		// m[2] is the double-quoted value, m[3] the single-quoted one.
+		key, value := m[1], m[2]
+		if value == "" {
+			value = m[3]
+		}
+
+		switch {
+		case value != "":
+			attrs = append(attrs, html.Attribute{Key: key, Val: unescape(value)})
+		case strings.HasPrefix(key, "no-"):
+			attrs = append(attrs, html.Attribute{Key: key[len("no-"):], Val: "false"})
+		default:
+			attrs = append(attrs, html.Attribute{Key: key, Val: "true"})
+		}
+	}
+
+	t.Attr = attrs
+	return attrs
+}
+
+// parseTokens builds a node tree from tokens and returns its root, a synthetic
+// document node whose Data is "body".
+//
+// Text, start-tag and self-closing tokens are appended to the innermost open
+// element; a start tag additionally becomes the new innermost element. An end
+// tag closes the innermost element only when the names match; otherwise it is
+// ignored. The synthetic root is never closed, so a stray "</body>" can
+// neither empty the stack (which would make the next end tag panic) nor cause
+// the nodes collected so far to be dropped.
+func parseTokens(tokens []Token) *html.Node {
+	root := &html.Node{
+		Type: html.DocumentNode,
+		Data: "body",
+	}
+	// stack holds the currently open nodes; stack[0] is always root.
+	stack := []*html.Node{root}
+
+	for _, token := range tokens {
+		parent := stack[len(stack)-1]
+
+		switch token.Type {
+		case html.TextToken:
+			parent.AppendChild(&html.Node{
+				Type: html.TextNode,
+				Data: token.Data,
+			})
+		case html.StartTagToken, html.SelfClosingTagToken:
+			node := &html.Node{
+				Type: html.ElementNode,
+				Data: token.Data,
+				Attr: token.parseAttributes(),
+			}
+			parent.AppendChild(node)
+			if token.Type == html.StartTagToken {
+				stack = append(stack, node)
+			}
+		case html.EndTagToken:
+			if len(stack) > 1 && token.Data == parent.Data {
+				stack = stack[:len(stack)-1]
+			}
+		}
+	}
+	return root
+}
+
+// Patterns that strip whitespace containing a line break from the start and
+// the end of a text segment, compiled once instead of per segment.
+var (
+	leadingBreakPat  = regexp.MustCompile(`^\s*\n\s*`)
+	trailingBreakPat = regexp.MustCompile(`\s*\n\s*$`)
+)
+
+// xhtmlParse splits source into text and tag tokens using tagPat and returns
+// the node tree that parseTokens builds from them.
+//
+// The text between two tags is unescaped, stripped of leading and trailing
+// whitespace that contains a line break, and emitted as a text token unless
+// nothing is left. Comments are dropped. A tag becomes an end-tag token when
+// it starts with "/", a self-closing token when it ends with "/", and a
+// start-tag token otherwise.
+func xhtmlParse(source string) *html.Node {
+	var tokens []Token
+
+	pushText := func(raw string) {
+		text := unescape(raw)
+		text = leadingBreakPat.ReplaceAllString(text, "")
+		text = trailingBreakPat.ReplaceAllString(text, "")
+		if text == "" {
+			return
+		}
+		tokens = append(tokens, Token{
+			Token: &html.Token{
+				Type: html.TextToken,
+				Data: text,
+			},
+		})
+	}
+
+	for {
+		loc := tagPat.FindStringSubmatchIndex(source)
+		if loc == nil {
+			break
+		}
+		// group returns capture group n of the current match, or "" when
+		// the group did not take part in it.
+		group := func(n int) string {
+			if loc[2*n] < 0 {
+				return ""
+			}
+			return source[loc[2*n]:loc[2*n+1]]
+		}
+
+		pushText(source[:loc[0]])
+
+		isComment := loc[2] >= 0
+		closing, name, extra, selfClosing := group(3), group(4), group(5), group(6)
+		source = source[loc[1]:]
+		if isComment {
+			continue
+		}
+
+		kind := html.StartTagToken
+		switch {
+		case closing != "":
+			kind = html.EndTagToken
+		case selfClosing != "":
+			kind = html.SelfClosingTagToken
+		}
+		tokens = append(tokens, Token{
+			Token: &html.Token{
+				Type: kind,
+				Data: name,
+			},
+			extra: extra,
+		})
+	}
+
+	// Whatever follows the last tag is plain text.
+	pushText(source)
+	return parseTokens(tokens)
+}