diff --git a/composition/html_content_parser.go b/composition/html_content_parser.go index eae1fba..2c5ae3b 100644 --- a/composition/html_content_parser.go +++ b/composition/html_content_parser.go @@ -273,25 +273,32 @@ func ParseHeadFragment(fragment *StringFragment, headPropertyMap map[string]stri return z.Err() } break forloop + case tt == html.StartTagToken || tt == html.SelfClosingTagToken: - if string(tag) == "meta" { - if(processMetaTag(string(tag), attrs, headPropertyMap)) { - headBuff.Write(raw) - } - continue - } - if string(tag) == "title" { - if(headPropertyMap["title"] == "") { - headPropertyMap["title"] = "title" - headBuff.Write(raw) - } else if (tt != html.SelfClosingTagToken) { - skipCompleteTag(z, "title") - continue - } - } else { - headBuff.Write(raw) - } + switch { + case string(tag) == "meta": + if (processMetaTag(string(tag), attrs, headPropertyMap)) { + headBuff.Write(raw) + } + continue forloop + case string(tag) == "link": + if (processLinkTag(attrs, headPropertyMap)) { + headBuff.Write(raw) + } + continue forloop + case string(tag) == "title": + if (headPropertyMap["title"] == "") { + headPropertyMap["title"] = "title" + headBuff.Write(raw) + } else if (tt != html.SelfClosingTagToken) { + skipCompleteTag(z, "title") + } + continue forloop + default: + headBuff.Write(raw) + } + default: headBuff.Write(raw) } @@ -328,8 +335,6 @@ func skipCompleteTag(z *html.Tokenizer, tagName string) error { return nil } - - func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]string) bool { if (len(attrs) == 0) { return true @@ -337,14 +342,14 @@ func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]s key := tagName value := "" - // TODO: check explizit for attrName "http-equiv" || "name" || "charset" ? - // e.g.: + // e.g.: => key = meta_charset; val = utf-8 if (len(attrs) == 1) { key = tagName + "_" + attrs[0].Key value = attrs[0].Val } + // e.g.: => key = meta_name_content-language; val = content_de if (len(attrs) > 1) { key = tagName + "_" + attrs[0].Key + "_" + attrs[0].Val value = attrs[1].Key + "_" + attrs[1].Val @@ -353,11 +358,43 @@ func processMetaTag(tagName string, attrs []html.Attribute, metaMap map[string]s if (metaMap[key] == "") { metaMap[key] = value return true - } return false } +/** +Returns true if a link tag can be processed. +Checks if a tag contains a canonical relation and avoids multiple canonical definitions. + */ +func processLinkTag(attrs []html.Attribute, metaMap map[string]string) bool { + if (len(attrs) == 0) { + return true + } + + const canonical = "canonical" + var key string + var value string + + // e.g.: => key = canonical; val = /baumarkt/suche + for _, attr := range attrs { + if (attr.Key == "rel" && attr.Val == canonical) { + key = canonical + } + if (attr.Key == "href") { + value = attr.Val + } + } + if (key == canonical && metaMap[canonical] != "") { + // if canonical is already in map then don't process this link tag + return false + } + + if (key != "" && value != "") { + metaMap[key] = value + } + return true +} + func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error { tt := z.Next() if tt != html.TextToken { diff --git a/composition/html_content_parser_test.go b/composition/html_content_parser_test.go index 93ffef8..f6eafec 100644 --- a/composition/html_content_parser_test.go +++ b/composition/html_content_parser_test.go @@ -15,15 +15,12 @@ var productUiGeneratedHtml = ` navigationservice - - - - + - - - - + @@ -295,7 +281,7 @@ func Test_HtmlContentParser_LoadEmptyContent(t *testing.T) { a.Nil(c.Tail()) } -func Test_HtmlContentParser_parseHead_withMultipleMetaTags_and_Titles(t *testing.T) { +func Test_HtmlContentParser_parseHead_withMultipleMetaTags_and_Titles_and_Canonicals(t *testing.T) { a := assert.New(t) parser := &HtmlContentParser{} @@ -306,8 +292,7 @@ func Test_HtmlContentParser_parseHead_withMultipleMetaTags_and_Titles(t *testing err := parser.parseHead(z, c) a.NoError(err) - //eqFragment(t, "xxxxxx", c.Head()) - //a.True(strings.Contains(string(c.Head()), "navigationservice")) + containsFragment(t, "navigationservice", c.Head()) } @@ -633,6 +618,20 @@ func eqFragment(t *testing.T, expected string, f Fragment) { } } +func containsFragment(t *testing.T, contained string, f Fragment) { + if f == nil { + t.Error("Fragment is nil, but expected:", contained) + return + } + sf := f.(StringFragment) + sfStripped := strings.Replace(string(sf), " ", "", -1) + sfStripped = strings.Replace(string(sfStripped), "\n", "", -1) + + if !strings.Contains(sfStripped, contained) { + t.Error("Fragment is not equal: \nexpected: ", contained, "\nactual: ", sf) + } +} + func Test_ParseHeadFragment_Filter_Title(t *testing.T) { a := assert.New(t) @@ -826,6 +825,125 @@ func Test_ParseHeadFragment_Filter_Meta_Tag(t *testing.T) { a.Equal(expectedParsedHead, resultString) } +func Test_ParseHeadFragment_Filter_Link_Canonical_Tag(t *testing.T) { + a := assert.New(t) + + // GIVEN + originalHeadString := ` + + + + navigationservice + + + + + + + ` + + expectedParsedHead := ` + + + + navigationservice + + + + + + + ` + + headMetaPropertyMap := make(map[string]string) + headMetaPropertyMap["canonical"] = "/baumarkt/suche" + + headFragment := StringFragment(originalHeadString) + // WHEN + ParseHeadFragment(&headFragment, headMetaPropertyMap) + + // THEN + expectedParsedHead = removeTabsAndNewLines(expectedParsedHead) + resultString := removeTabsAndNewLines(string(headFragment)) + + a.Equal(expectedParsedHead, resultString) +} + +func Test_ParseHeadFragment_Filter_Link_Canonical_Tag_without_existing_Map(t *testing.T) { + // GIVEN + a := assert.New(t) + + originalHeadString := ` + + + + + + + navigationservice + ` + + expectedParsedHead := ` + + + + + + navigationservice + ` + + headMetaPropertyMap := make(map[string]string) + + headFragment := StringFragment(originalHeadString) + // WHEN + ParseHeadFragment(&headFragment, headMetaPropertyMap) + + // THEN + expectedParsedHead = removeTabsAndNewLines(expectedParsedHead) + resultString := removeTabsAndNewLines(string(headFragment)) + + a.Equal(expectedParsedHead, resultString) +} + func removeTabsAndNewLines(stringToProcess string) string{ stringToProcess = strings.Replace(stringToProcess, "\n", "", -1) stringToProcess = strings.Replace(stringToProcess, "\t", "", -1)