Skip to content

Commit

Permalink
update: 使用自行实现的元素解析方法,而不再使用 html.Parse
Browse files Browse the repository at this point in the history
  • Loading branch information
WindowsSov8forUs committed Apr 12, 2024
1 parent 086f66e commit 63fa124
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 2 deletions.
3 changes: 1 addition & 2 deletions pkg/message/parser.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package message

import (
"bytes"
"fmt"
"strings"

Expand Down Expand Up @@ -89,7 +88,7 @@ func parseHtmlChildrenNode(n *html.Node, callback func(e MessageElement)) error
}

func Parse(source string) ([]MessageElement, error) {
doc, _ := html.Parse(bytes.NewReader([]byte(source)))
doc := xhtmlParse(source)
var result []MessageElement
err := parseHtmlNode(doc, func(e MessageElement) {
if e != nil {
Expand Down
225 changes: 225 additions & 0 deletions pkg/message/xhtml.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
package message

import (
"fmt"
"regexp"
"strconv"
"strings"

"golang.org/x/net/html"
)

var (
// tagPat matches either a comment or a tag.
// Capture groups:
//  1. a whole "<!-- ... -->" comment (only set for comments)
//  2. a whole tag (only set for tags)
//  3. the optional leading "/" of an end tag
//  4. the tag name
//  5. the raw text between the tag name and the closing bracket (attributes)
//  6. the optional trailing "/" of a self-closing tag
tagPat = regexp.MustCompile(`(<!--[\s\S]*?-->)|(<(/?)([^!\s>/]*)([^>]*?)\s*(/?)>)`)
// attrPat matches one attribute inside a tag.
// Capture groups:
//  1. the attribute key
//  2. the value when it is written in double quotes
//  3. the value when it is written in single quotes
// Groups 2 and 3 are both unset for an attribute written without a value.
attrPat = regexp.MustCompile(`([^\s=]+)(?:="([^"]*)"|='([^']*)')?`)
)

// escape replaces the characters that are special in markup with their
// entity form: "&" becomes "&amp;", "<" becomes "&lt;" and ">" becomes
// "&gt;". When inLine is true, double quotes are additionally replaced with
// "&quot;" so the result can be placed inside a quoted attribute value.
func escape(text string, inLine bool) string {
	var out strings.Builder
	out.Grow(len(text))
	// All escaped characters are ASCII, so walking bytes leaves multi-byte
	// sequences untouched.
	for i := 0; i < len(text); i++ {
		switch c := text[i]; c {
		case '&':
			out.WriteString("&amp;")
		case '<':
			out.WriteString("&lt;")
		case '>':
			out.WriteString("&gt;")
		case '"':
			if inLine {
				out.WriteString("&quot;")
			} else {
				out.WriteByte(c)
			}
		default:
			out.WriteByte(c)
		}
	}
	return out.String()
}

// Patterns used by unescape, compiled once instead of on every call.
var (
	// decimalRefPat matches decimal character references such as "&#65;".
	decimalRefPat = regexp.MustCompile(`&#(\d+);`)
	// hexRefPat matches hexadecimal character references such as "&#x41;";
	// the "x" and the hex digits are matched case-insensitively.
	hexRefPat = regexp.MustCompile(`(?i)&#x([0-9a-f]+);`)
	// ampRefPat matches the spellings of an escaped ampersand.
	ampRefPat = regexp.MustCompile(`&(amp|#38|#[xX]26);`)
)

// unescape reverses escape: it decodes "&lt;", "&gt;", "&quot;", decimal
// ("&#65;") and hexadecimal ("&#x41;") character references, and finally
// "&amp;".
//
// Ampersand references are decoded last so that text such as "&amp;lt;"
// yields the literal "&lt;" rather than being decoded a second time.
func unescape(text string) string {
	text = strings.ReplaceAll(text, "&lt;", "<")
	text = strings.ReplaceAll(text, "&gt;", ">")
	text = strings.ReplaceAll(text, "&quot;", "\"")

	text = decimalRefPat.ReplaceAllStringFunc(text, func(ref string) string {
		// Strip the leading "&#" and the trailing ";".
		return decodeCharRef(ref, ref[2:len(ref)-1], 10, "38")
	})
	text = hexRefPat.ReplaceAllStringFunc(text, func(ref string) string {
		// Strip the leading "&#x" and the trailing ";".
		return decodeCharRef(ref, ref[3:len(ref)-1], 16, "26")
	})

	return ampRefPat.ReplaceAllString(text, "&")
}

// decodeCharRef converts the digits of a numeric character reference, written
// in the given base, to the character they denote. The reference is returned
// unchanged when the digits equal ampersand (so it is handled by the final
// ampersand pass) or when they cannot be parsed as a 32-bit code point.
func decodeCharRef(ref, digits string, base int, ampersand string) string {
	if digits == ampersand {
		return ref
	}
	code, err := strconv.ParseInt(digits, base, 32)
	if err != nil {
		return ref
	}
	return fmt.Sprintf("%c", code)
}

// Token pairs an html.Token with the raw attribute text captured from the
// tag it was created from.
type Token struct {
*html.Token
// extra holds the unparsed attribute portion of the tag; parseAttributes
// converts it into html.Attribute values.
extra string
}

// parseAttributes converts the raw attribute text in t.extra into a list of
// attributes, stores the list in t.Attr and returns it. It returns nil, and
// leaves t.Attr alone, when there is no attribute text at all.
//
// Each attribute is interpreted as follows:
//   - key="value" or key='value': the unescaped value is used, including an
//     explicitly empty value such as key="".
//   - a bare key starting with "no-": the prefix is removed and the value is
//     "false".
//   - any other bare key: the value is "true".
//
// t.extra is not modified, so calling the method again yields the same result.
func (t *Token) parseAttributes() []html.Attribute {
	if t.extra == "" {
		return nil
	}

	locs := attrPat.FindAllStringSubmatchIndex(t.extra, -1)
	attrs := make([]html.Attribute, 0, len(locs))
	for _, loc := range locs {
		key := t.extra[loc[2]:loc[3]]
		// A negative index means the group did not take part in the match,
		// which is how a missing value is told apart from an empty one.
		switch {
		case loc[4] >= 0: // double-quoted value
			attrs = append(attrs, html.Attribute{
				Key: key,
				Val: unescape(t.extra[loc[4]:loc[5]]),
			})
		case loc[6] >= 0: // single-quoted value
			attrs = append(attrs, html.Attribute{
				Key: key,
				Val: unescape(t.extra[loc[6]:loc[7]]),
			})
		case strings.HasPrefix(key, "no-"):
			attrs = append(attrs, html.Attribute{
				Key: key[3:],
				Val: "false",
			})
		default:
			attrs = append(attrs, html.Attribute{
				Key: key,
				Val: "true",
			})
		}
	}

	t.Attr = attrs
	return attrs
}

// parseTokens builds a node tree from a flat token list and returns its root,
// a synthetic document node whose Data is "body".
//
// Text tokens become text nodes and start/self-closing tags become element
// nodes, each appended to the innermost open element. A start tag additionally
// becomes the new innermost open element. An end tag closes the innermost open
// element only when the names match; any other end tag is ignored.
//
// The synthetic root is never closed: a stray "</body>" in the input would
// otherwise empty the stack, discarding later content and making the next end
// tag index an empty slice.
func parseTokens(tokens []Token) *html.Node {
	root := &html.Node{
		Type: html.DocumentNode,
		Data: "body",
	}
	// stack holds the chain of currently open nodes; stack[0] is always root.
	stack := []*html.Node{root}

	for _, token := range tokens {
		top := stack[len(stack)-1]
		switch token.Type {
		case html.TextToken:
			top.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: token.Data,
			})
		case html.StartTagToken, html.SelfClosingTagToken:
			node := &html.Node{
				Type: html.ElementNode,
				Data: token.Data,
				Attr: token.parseAttributes(),
			}
			top.AppendChild(node)
			// Only a start tag opens a new nesting level.
			if token.Type == html.StartTagToken {
				stack = append(stack, node)
			}
		case html.EndTagToken:
			if len(stack) > 1 && token.Data == top.Data {
				stack = stack[:len(stack)-1]
			}
		}
	}
	return root
}

// Patterns used by xhtmlParse to trim text segments, compiled once instead of
// for every segment.
var (
	// leadingLineBreakPat matches leading whitespace that contains a newline.
	leadingLineBreakPat = regexp.MustCompile(`^\s*\n\s*`)
	// trailingLineBreakPat matches trailing whitespace that contains a newline.
	trailingLineBreakPat = regexp.MustCompile(`\s*\n\s*$`)
)

// xhtmlParse tokenizes source with tagPat and returns the node tree built by
// parseTokens.
//
// Text between tags is unescaped, stripped of leading and trailing whitespace
// runs that contain a newline, and emitted as a text token unless it is empty.
// Comments are dropped. Every tag becomes a start, end or self-closing token
// carrying its name and its raw attribute text.
func xhtmlParse(source string) *html.Node {
	var tokens []Token

	// pushContent emits the text found between two tags (or at either end of
	// the input) as a text token.
	pushContent := func(raw string) {
		text := unescape(raw)
		text = leadingLineBreakPat.ReplaceAllString(text, "")
		text = trailingLineBreakPat.ReplaceAllString(text, "")
		if text == "" {
			return
		}
		tokens = append(tokens, Token{
			Token: &html.Token{
				Type: html.TextToken,
				Data: text,
			},
		})
	}

	for {
		loc := tagPat.FindStringSubmatchIndex(source)
		if loc == nil {
			break
		}

		// group returns capture group i of the current match, or "" when the
		// group did not take part in it.
		matched := source
		group := func(i int) string {
			if loc[2*i] < 0 {
				return ""
			}
			return matched[loc[2*i]:loc[2*i+1]]
		}

		pushContent(source[:loc[0]])
		source = source[loc[1]:]

		if group(1) != "" { // comment
			continue
		}

		closing, name, extra, selfClosing := group(3), group(4), group(5), group(6)

		var kind html.TokenType
		switch {
		case closing == "" && selfClosing == "": // start tag
			kind = html.StartTagToken
		case closing != "": // end tag (takes precedence over a trailing "/")
			kind = html.EndTagToken
		default: // self-closing tag
			kind = html.SelfClosingTagToken
		}

		tokens = append(tokens, Token{
			Token: &html.Token{
				Type: kind,
				Data: name,
			},
			extra: extra,
		})
	}

	// Whatever follows the last tag is plain text.
	pushContent(source)
	return parseTokens(tokens)
}

0 comments on commit 63fa124

Please sign in to comment.