main.go
// gofind: a simple in-memory search engine.
//
// gofind scrapes the site(s) you point it at and builds an inverted index
// in memory that can be queried over an HTTP API. It normalizes text by
// splitting on non-word runes, lowercasing, removing stop words, and then
// stemming with the Snowball algorithm for English.
//
// Configure with environment variables:
//
//	PORT               (port number, defaults to 8080)
//	START_URL          (a single URL, e.g. https://your.site/docs)
//	MAX_DEPTH          (int: 0 means unlimited, 1 means visit only the start page, ...)
//	ALLOWED_DOMAINS    (comma-separated domain names: your.site,foo.co)
//	DISALLOWED_DOMAINS (comma-separated domain names)
package main

import (
	"log"
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

const userAgent = "gofind/1.0"

// docs caches every scraped page by URL; idx is the inverted index built from
// those pages. docCacheLock serializes page processing: it is locked in
// OnResponse and released in OnScraped, which also covers the idx.add call in
// between. indexLock guards idx independently (it is not used in this file).
var docs = make(docCache)
var docCacheLock sync.Mutex
var idx = make(index)
var indexLock sync.Mutex
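
// The concrete definitions of document, docCache, and index live in this
// package's other files, which are not shown here. As a rough sketch of what
// this file assumes about them (the field names and the add method below are
// assumptions, not the repo's actual code):
//
//	// document is one scraped page.
//	type document struct {
//		ID    int
//		Title string
//		URL   string
//		Text  string
//	}
//
//	// docCache maps a page URL to its document.
//	type docCache map[string]document
//
//	// index maps a normalized token to the IDs of documents containing it.
//	type index map[string][]int
//
//	// add tokenizes doc.Text (see the analyze sketch below) and records
//	// doc.ID under each token.
//	func (idx index) add(doc document) {
//		for _, token := range analyze(doc.Text) {
//			idx[token] = append(idx[token], doc.ID)
//		}
//	}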

// addCollectorHandlers attaches callbacks to a colly Collector that process
// HTML as it is parsed asynchronously and log any errors. For ease of
// chaining, it returns the modified Collector.
func addCollectorHandlers(c *colly.Collector) *colly.Collector {
	c.OnResponse(func(r *colly.Response) {
		url := r.Request.URL.String()
		log.Println("Processing", url)
		// Lock the document cache for the whole page: colly runs this
		// response's remaining callbacks on the same goroutine, so the
		// matching Unlock happens in OnScraped below.
		docCacheLock.Lock()
	})
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Request.AbsoluteURL(e.Attr("href"))
if _, exists := docs[link]; strings.Contains(link, "http") && !exists {
e.Request.Visit(link)
}
})
c.OnHTML("title", func(e *colly.HTMLElement) {
title := e.Text
url := e.Request.URL.String()
if doc, ok := docs[url]; ok {
doc.Title = title
docs[url] = doc
} else {
docs[url] = document{ID: len(docs), Title: title, URL: url}
}
})
c.OnHTML("p,h1,h2,h3,h4,h5,h6,ul,ol,td", func(e *colly.HTMLElement) {
text := e.Text
url := e.Request.URL.String()
if doc, ok := docs[url]; ok {
doc.Text += " " + text
docs[url] = doc
} else {
docs[url] = document{ID: len(docs), Text: text, URL: url}
}
})
	c.OnScraped(func(r *colly.Response) {
		url := r.Request.URL.String()
		// Index the page only if it yielded a non-trivial amount of text.
		if url != "" && len(docs[url].Text) > 2 {
			idx.add(docs[url])
		}
		docCacheLock.Unlock()
	})
	c.OnError(func(r *colly.Response, err error) {
		// OnResponse never ran for a failed request, so the cache lock is
		// not held here.
		url := r.Request.URL.String()
		log.Printf("Error processing %v: %v\n", url, err)
	})
	return c
}
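
// The package comment above describes the text normalization pipeline: split
// on non-word runes, lowercase, drop stop words, then Snowball-stem. The real
// implementation lives in another file; this is a minimal sketch of such a
// pipeline, assuming a stopwords set populated by initStopwordsList and the
// github.com/kljensen/snowball stemmer (both assumptions, not necessarily
// what this repo uses):
//
//	func analyze(text string) []string {
//		nonWord := func(r rune) bool {
//			return !unicode.IsLetter(r) && !unicode.IsNumber(r)
//		}
//		var tokens []string
//		for _, field := range strings.FieldsFunc(text, nonWord) {
//			word := strings.ToLower(field)
//			if stopwords[word] {
//				continue
//			}
//			if stemmed, err := snowball.Stem(word, "english", false); err == nil {
//				tokens = append(tokens, stemmed)
//			}
//		}
//		return tokens
//	}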

func main() {
	initStopwordsList()

	// Initialize the web crawler.
	c := addCollectorHandlers(colly.NewCollector())
	c.Async = true
	c.UserAgent = userAgent
	c.MaxDepth = getEnvInt("MAX_DEPTH", 0)
	c.AllowedDomains = getEnvSlice("ALLOWED_DOMAINS")
	c.DisallowedDomains = getEnvSlice("DISALLOWED_DOMAINS")
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 100,
		Delay:       5 * time.Millisecond,
	}); err != nil {
		log.Fatalln("Invalid limit rule:", err)
	}

	// Start crawling, blocking until all goroutines have finished.
	startTime := time.Now()
	if err := c.Visit(getEnv("START_URL", "")); err != nil {
		log.Fatalln("Could not visit START_URL:", err)
	}
	c.Wait()
	log.Printf("Done crawling %d pages in %s, ready to process queries\n",
		len(docs), time.Since(startTime),
	)

	// Serve the query endpoint until termination.
	http.HandleFunc("/", queryHandler)
	log.Fatalln(http.ListenAndServe(":"+getEnv("PORT", "8080"), nil))
}
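
// The helpers initStopwordsList, queryHandler, getEnv, getEnvInt, and
// getEnvSlice are defined in this package's other files. A minimal sketch of
// the env helpers, assuming empty or unset variables fall back to the given
// default (the repo's actual behavior may differ):
//
//	func getEnv(key, fallback string) string {
//		if v := os.Getenv(key); v != "" {
//			return v
//		}
//		return fallback
//	}
//
//	func getEnvInt(key string, fallback int) int {
//		if n, err := strconv.Atoi(os.Getenv(key)); err == nil {
//			return n
//		}
//		return fallback
//	}
//
//	func getEnvSlice(key string) []string {
//		if v := os.Getenv(key); v != "" {
//			return strings.Split(v, ",")
//		}
//		return nil
//	}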