Skip to content

Commit

Permalink
Don't scrape websites recursively
Browse files Browse the repository at this point in the history
  • Loading branch information
raviqqe committed Apr 23, 2018
1 parent a75cc0e commit 14aa8c9
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 12 deletions.
29 changes: 17 additions & 12 deletions checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ var validSchemes = map[string]struct{}{

type checker struct {
fetcher fetcher
rootPage page
daemons daemons
hostname string
results chan result
donePages concurrentStringSet
}
Expand All @@ -32,15 +33,25 @@ func newChecker(s string, c int) (checker, error) {
return checker{}, err
}

return checker{f, *p, make(chan result, c), newConcurrentStringSet()}, nil
ch := checker{
f,
newDaemons(c),
p.URL().Hostname(),
make(chan result, c),
newConcurrentStringSet(),
}

ch.daemons.Add(func() { ch.checkPage(*p) })

return ch, nil
}

// Results returns the receive-only view of the channel on which the checker
// delivers one result per checked page. Check closes this channel when it is
// done, so callers can range over it.
func (c checker) Results() <-chan result {
return c.results
}

func (c checker) Check() {
c.checkPage(c.rootPage)
c.daemons.Run()

close(c.results)
}
Expand All @@ -58,7 +69,6 @@ func (c checker) checkPage(p page) {
})

sc, ec := make(chan string, len(ns)), make(chan string, len(ns))
v := sync.WaitGroup{}
w := sync.WaitGroup{}

for _, n := range ns {
Expand Down Expand Up @@ -87,13 +97,10 @@ func (c checker) checkPage(p page) {
if err == nil {
sc <- u.String()

if p != nil && !c.donePages.Add(p.URL().String()) && p.URL().Hostname() == c.rootPage.URL().Hostname() {
v.Add(1)

go func() {
if p != nil && !c.donePages.Add(p.URL().String()) && p.URL().Hostname() == c.hostname {
c.daemons.Add(func() {
c.checkPage(*p)
v.Done()
}()
})
}
} else {
ec <- fmt.Sprintf("%v (%v)", u, err)
Expand All @@ -104,8 +111,6 @@ func (c checker) checkPage(p page) {
w.Wait()

c.results <- newResult(p.URL().String(), stringChannelToSlice(sc), stringChannelToSlice(ec))

v.Wait()
}

func stringChannelToSlice(sc <-chan string) []string {
Expand Down
31 changes: 31 additions & 0 deletions daemons.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package main

import "sync"

// daemons runs an open-ended set of functions concurrently and lets a caller
// wait until all of them, including functions added by other running
// functions, have finished.
type daemons struct {
	// daemons queues the functions that are waiting to be started by Run.
	daemons chan func()
	// waitGroup counts functions that were added but have not finished yet.
	// It is a pointer so that copies of the struct share a single counter.
	waitGroup *sync.WaitGroup
}

// newDaemons creates a daemons value whose queue buffers up to c pending
// functions.
func newDaemons(c int) daemons {
	return daemons{make(chan func(), c), &sync.WaitGroup{}}
}

// Add registers f to be executed by Run. It may be called before Run or from
// inside a function that Run is executing. The send blocks while the queue is
// full and nothing is draining it.
func (ds daemons) Add(f func()) {
	// Count the function before queueing it so that Run cannot observe a zero
	// counter while work is still pending.
	ds.waitGroup.Add(1)

	ds.daemons <- func() {
		// Deferred so the counter is released even if f exits early via
		// runtime.Goexit.
		defer ds.waitGroup.Done()
		f()
	}
}

// Run starts every queued function in its own goroutine and blocks until all
// of them, plus any functions they add while running, have returned. The
// internal dispatcher goroutine is stopped before Run returns so that it does
// not leak.
func (ds daemons) Run() {
	done := make(chan struct{})
	stopped := make(chan struct{})

	go func() {
		defer close(stopped)

		for {
			select {
			case f := <-ds.daemons:
				// One goroutine per function: a function may block in Add,
				// so a fixed worker count could deadlock.
				go f()
			case <-done:
				return
			}
		}
	}()

	// Once the counter hits zero every added function has run, which implies
	// the queue is empty and the dispatcher has nothing left to start.
	ds.waitGroup.Wait()

	close(done)
	<-stopped
}
29 changes: 29 additions & 0 deletions daemons_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package main

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestNewDaemons is a smoke test: constructing a daemons value with a queue
// capacity of one must not panic.
func TestNewDaemons(t *testing.T) {
	_ = newDaemons(1)
}

// TestDaemonsAdd verifies that a function passed to Add is placed on the
// queue and stays there until Run is called.
func TestDaemonsAdd(t *testing.T) {
	pool := newDaemons(42)

	noop := func() {}
	pool.Add(noop)

	queued := len(pool.daemons)
	assert.Equal(t, 1, queued)
}

// TestDaemonsRun verifies that Run executes a queued function exactly once
// and leaves the queue empty when it returns.
func TestDaemonsRun(t *testing.T) {
	pool := newDaemons(42)
	calls := 0

	increment := func() { calls++ }
	pool.Add(increment)
	pool.Run()

	assert.Equal(t, 1, calls)

	remaining := len(pool.daemons)
	assert.Zero(t, remaining)
}

0 comments on commit 14aa8c9

Please sign in to comment.