diff --git a/composition/html_content_parser.go b/composition/html_content_parser.go index eae1fba..2c5ae3b 100644 --- a/composition/html_content_parser.go +++ b/composition/html_content_parser.go @@ -273,25 +273,32 @@ func ParseHeadFragment(fragment *StringFragment, headPropertyMap map[string]stri return z.Err() } break forloop + case tt == html.StartTagToken || tt == html.SelfClosingTagToken: - if string(tag) == "meta" { - if(processMetaTag(string(tag), attrs, headPropertyMap)) { - headBuff.Write(raw) - } - continue - } - if string(tag) == "title" { - if(headPropertyMap["title"] == "") { - headPropertyMap["title"] = "title" - headBuff.Write(raw) - } else if (tt != html.SelfClosingTagToken) { - skipCompleteTag(z, "title") - continue - } - } else { - headBuff.Write(raw) - } + switch { + case string(tag) == "meta": + if (processMetaTag(string(tag), attrs, headPropertyMap)) { + headBuff.Write(raw) + } + continue forloop + case string(tag) == "link": + if (processLinkTag(attrs, headPropertyMap)) { + headBuff.Write(raw) + } + continue forloop + case string(tag) == "title": + if (headPropertyMap["title"] == "") { + headPropertyMap["title"] = "title" + headBuff.Write(raw) + } else if (tt != html.SelfClosingTagToken) { + skipCompleteTag(z, "title") + } + continue forloop + default: + headBuff.Write(raw) + } + default: headBuff.Write(raw) } @@ -328,8 +335,6 @@ func skipCompleteTag(z *html.Tokenizer, tagName string) error { return nil } - - func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]string) bool { if (len(attrs) == 0) { return true @@ -337,14 +342,14 @@ func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]s key := tagName value := "" - // TODO: check explizit for attrName "http-equiv" || "name" || "charset" ? - // e.g.: + // e.g.: => key = meta_charset; val = utf-8 if (len(attrs) == 1) { key = tagName + "_" + attrs[0].Key value = attrs[0].Val } + // e.g.: => key = meta_name_content-language; val = content_de if (len(attrs) > 1) { key = tagName + "_" + attrs[0].Key + "_" + attrs[0].Val value = attrs[1].Key + "_" + attrs[1].Val @@ -353,11 +358,43 @@ func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]s if (metaMap[key] == "") { metaMap[key] = value return true - } return false } +/** +Returns true if a link tag can be processed. +Checks if a tag contains a canonical relation and avoids multiple canonical definitions. + */ +func processLinkTag(attrs []html.Attribute, metaMap map[string]string) bool { + if (len(attrs) == 0) { + return true + } + + const canonical = "canonical" + var key string + var value string + + // e.g.: => key = canonical; val = /baumarkt/suche + for _, attr := range attrs { + if (attr.Key == "rel" && attr.Val == canonical) { + key = canonical + } + if (attr.Key == "href") { + value = attr.Val + } + } + if (key == canonical && metaMap[canonical] != "") { + // if canonical is already in map then don't process this link tag + return false + } + + if (key != "" && value != "") { + metaMap[key] = value + } + return true +} + func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error { tt := z.Next() if tt != html.TextToken { diff --git a/composition/html_content_parser_test.go b/composition/html_content_parser_test.go index 93ffef8..f6eafec 100644 --- a/composition/html_content_parser_test.go +++ b/composition/html_content_parser_test.go @@ -15,15 +15,12 @@ var productUiGeneratedHtml = `