-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.go
92 lines (78 loc) · 2.1 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// Package bslc provides an IP bound crawler with channel based delivery of
// crawled content.
package bslc
import (
	"net/http"
	"sync"
	"time"
)
// Crawler does the heavy lifting of the actual crawling. URLs is the URLContainer
// used for bookkeeping and must be initialized. MaxConcurrentConnections is optional;
// the default is 5 concurrent transfers.
type Crawler struct {
	// URLs is the source of URLs to fetch: the dispatcher started by
	// StartCrawling calls NextURL and Len on it, and the HTML handler passes it
	// to parseHTML.
	URLs URLContainer
	// MaxConcurrentConnections is the number of URL worker goroutines (and HTML
	// handler goroutines) that are started. A value of 0 is replaced with 5 by
	// StartCrawling.
	MaxConcurrentConnections int
	// responseHandler is embedded; it is declared elsewhere in the package.
	// NOTE(review): presumably the source of AddMimeType, sendResponse and
	// closeHandlers — confirm against its definition.
	responseHandler
	// transfersCounter is embedded; it is declared elsewhere in the package.
	// NOTE(review): presumably the source of initializeCounter, transferStart,
	// transferEnd and ActiveTransfers — confirm against its definition.
	transfersCounter
	// isCrawling is set to true by StartCrawling and to false by StopCrawling.
	// The dispatcher goroutine keeps handing out URLs only while it is true.
	isCrawling bool
}
// StartCrawling starts the crawling process and returns immediately; the work
// is carried out by background goroutines.
//
// A MaxConcurrentConnections of zero or less is replaced by the default of 5.
// That many worker goroutines process URLs handed out by a single dispatcher
// goroutine, which obtains them from c.URLs.NextURL. When NextURL returns an
// error the dispatcher waits two seconds and then stops only if
// ActiveTransfers() and c.URLs.Len() are both zero; otherwise it tries again.
// The dispatcher also stops once it observes that StopCrawling has cleared the
// crawling flag.
//
// On shutdown the dispatcher closes the work channel, waits for every worker
// to finish the URL it is currently processing, and only then calls
// closeHandlers, so that transfers in progress are completed before the
// handlers are shut down.
//
// NOTE(review): isCrawling is read by the dispatcher goroutine and written by
// StopCrawling without synchronization; consider guarding the flag.
func (c *Crawler) StartCrawling() {
	// A non-positive limit would start no workers and leave the dispatcher
	// blocked forever on its first send, so fall back to the default.
	if c.MaxConcurrentConnections <= 0 {
		c.MaxConcurrentConnections = 5
	}
	c.initializeCounter()
	c.registerHTMLHandler()
	c.isCrawling = true

	urls := make(chan string)

	// URL processors
	workerCount := c.MaxConcurrentConnections
	var workers sync.WaitGroup
	workers.Add(workerCount)
	for i := 0; i < workerCount; i++ {
		go func() {
			defer workers.Done()
			for uri := range urls {
				c.transferStart()
				c.processURL(uri)
				c.transferEnd()
			}
		}()
	}

	// URL dispatcher
	go func() {
		for c.isCrawling {
			uri, err := c.URLs.NextURL()
			if err != nil {
				// No URL could be obtained. Transfers that are still running
				// may yet change that (presumably by adding newly found links
				// to c.URLs — confirm), so pause before deciding whether the
				// crawl is finished.
				time.Sleep(2 * time.Second)
				if c.ActiveTransfers() == 0 && c.URLs.Len() == 0 {
					break
				}
				continue
			}
			urls <- uri
		}
		close(urls)
		// Let in-flight transfers finish before the handlers are closed;
		// closing them first could cut off a worker that is still delivering
		// a response.
		workers.Wait()
		c.closeHandlers()
	}()
}
// StopCrawling stops the crawling process. Transfers in progress will be completed.
//
// It only clears the isCrawling flag. The dispatcher goroutine started by
// StartCrawling checks that flag before requesting each next URL, so the stop
// takes effect the next time the dispatcher re-evaluates its loop condition
// rather than immediately.
//
// NOTE(review): the flag is written here and read by the dispatcher goroutine
// without synchronization — confirm whether it needs to be guarded.
func (c *Crawler) StopCrawling() {
	c.isCrawling = false
}
// registerHTMLHandler registers a fresh channel for the "text/html" MIME type
// via AddMimeType and starts MaxConcurrentConnections goroutines that consume
// it. Each received Content is passed to parseHTML together with c.URLs, after
// which true is sent on the content's Done channel. The goroutines exit when
// the channel is closed.
func (c *Crawler) registerHTMLHandler() {
	pages := make(chan *Content)
	c.AddMimeType("text/html", pages)

	// parsePages is the body shared by every HTML handler goroutine.
	parsePages := func() {
		for page := range pages {
			parseHTML(page, c.URLs)
			// Signal that this piece of content has been handled.
			page.Done <- true
		}
	}

	for started := 0; started < c.MaxConcurrentConnections; started++ {
		go parsePages()
	}
}
// processURL fetches uri with http.Get and hands the response to sendResponse.
// A failed request is skipped without reporting the error. The response body
// is closed when the function returns.
func (c *Crawler) processURL(uri string) {
	resp, fetchErr := http.Get(uri)
	if fetchErr != nil {
		// Best effort: a URL that cannot be fetched is simply dropped.
		return
	}

	body := resp.Body
	defer body.Close()

	c.sendResponse(resp)
}