forked from Qianlitp/crawlergo
/
collect_links.go
executable file
·73 lines (68 loc) · 1.99 KB
/
collect_links.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package engine
import (
"context"
"fmt"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
"github.com/shadow1ng/crawlergo/pkg/config"
"github.com/shadow1ng/crawlergo/pkg/logger"
"regexp"
"time"
)
// collectLinks launches all link collectors concurrently at the end of
// a page visit. Each collector calls collectLinkWG.Done() when finished,
// so the WaitGroup is presumably Add(3)-ed at the call site — verify
// against the caller before changing the number of collectors.
func (tab *Tab) collectLinks() {
	collectors := []func(){
		tab.collectHrefLinks,
		tab.collectObjectLinks,
		tab.collectCommentLinks,
	}
	for _, collect := range collectors {
		go collect()
	}
}
// collectHrefLinks harvests URL-bearing attribute values (src, href,
// data-url, data-href) from every matching element in the DOM and
// records each value as a GET result originating from the DOM.
func (tab *Tab) collectHrefLinks() {
	defer tab.collectLinkWG.Done()
	exec := tab.GetExecutor()
	// Attributes whose values may carry URLs.
	for _, attr := range []string{"src", "href", "data-url", "data-href"} {
		var matches []map[string]string
		queryCtx, stop := context.WithTimeout(exec, time.Second*1)
		// Best-effort query: a timeout or query error simply yields no matches.
		_ = chromedp.AttributesAll(fmt.Sprintf(`[%s]`, attr), &matches, chromedp.ByQueryAll).Do(queryCtx)
		stop()
		for _, m := range matches {
			tab.AddResultUrl(config.GET, m[attr], config.FromDOM)
		}
	}
}
// collectObjectLinks harvests the data attribute of <object> elements
// and records each value as a GET result originating from the DOM.
func (tab *Tab) collectObjectLinks() {
	defer tab.collectLinkWG.Done()
	queryCtx, stop := context.WithTimeout(tab.GetExecutor(), time.Second*1)
	defer stop()
	var matches []map[string]string
	// Best-effort query: errors are ignored, leaving matches empty.
	_ = chromedp.AttributesAll(`object[data]`, &matches, chromedp.ByQueryAll).Do(queryCtx)
	for _, m := range matches {
		tab.AddResultUrl(config.GET, m["data"], config.FromDOM)
	}
}
func (tab *Tab) collectCommentLinks() {
defer tab.collectLinkWG.Done()
ctx := tab.GetExecutor()
// 收集注释中的链接
var nodes []*cdp.Node
tCtxComment, cancel := context.WithTimeout(ctx, time.Second*1)
defer cancel()
commentErr := chromedp.Nodes(`//comment()`, &nodes, chromedp.BySearch).Do(tCtxComment)
if commentErr != nil {
logger.Logger.Debug("get comment nodes err")
logger.Logger.Debug(commentErr)
return
}
urlRegex := regexp.MustCompile(config.URLRegex)
for _, node := range nodes {
content := node.NodeValue
urlList := urlRegex.FindAllString(content, -1)
for _, url := range urlList {
tab.AddResultUrl(config.GET, url, config.FromComment)
}
}
}