spidergo


🪲 A high-performance spider (crawler) written in Go.

Features

  • Concurrent
  • Distributed
  • Supports analysing HTML pages and downloading binary files

Installation

go get github.com/sakeven/spidergo
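
To use spidergo, implement the analyser.Analyser interface and hand one or more analysers to spidergo.New. The sketch below shows about the smallest useful crawler, an analyser that only logs fetched URLs. It relies solely on the API used in the full example in the next section; the LogAnalyser name and the seed URL are placeholders, and the sketch is an illustration rather than a verified program.

package main

import (
	"log"
	"net/http"

	"github.com/sakeven/spidergo"
	"github.com/sakeven/spidergo/lib/analyser"
	"github.com/sakeven/spidergo/lib/page"
	"github.com/sakeven/spidergo/lib/result"
)

// LogAnalyser logs every fetched page and queues no further requests.
type LogAnalyser struct{}

func (a *LogAnalyser) Analyse(pg *page.Page) *result.Result {
	if pg.Err != "" {
		log.Println(pg.Err)
		return nil
	}
	log.Println("fetched", pg.Req.Req.URL.String())
	return nil
}

func main() {
	req, _ := http.NewRequest("GET", "http://example.com/", nil)
	spidergo.New([]analyser.Analyser{&LogAnalyser{}}).
		SetThreadNum(2).
		AddRequest(req).
		Run()
}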

Example

There is an example in github.com/sakeven/spidergo/example. Below is a file from that example; it crawls the problem list at acm.hdu.edu.cn, saves the images it finds, and queues each problem page it discovers.

package main

import (
	"log"
	"net/http"
	"os"
	"regexp"
	"runtime"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/sakeven/spidergo"
	"github.com/sakeven/spidergo/lib/analyser"
	"github.com/sakeven/spidergo/lib/page"
	"github.com/sakeven/spidergo/lib/result"
)

func main() {
	log.SetFlags(log.Ldate | log.Lshortfile | log.Ltime)
	runtime.GOMAXPROCS(runtime.NumCPU())
	req, _ := http.NewRequest("GET", "http://acm.hdu.edu.cn/listproblem.php?vol=1", nil)

	// Build the spider: two analyser instances, four worker threads,
	// one seed request, a crawl delay of 100, and a maximum depth of 4.
	spidergo.New([]analyser.Analyser{NewAnalyser(), NewAnalyser()}).
		SetThreadNum(4).
		AddRequest(req).
		SetDelay(uint(100)).
		SetDepth(uint(4)).
		Run()
}

type Analyser struct {
}

// Analyse implements the analyser.Analyser interface. It saves JPEG images
// to disk, queues links and images found on HTML pages, and queues the
// problem pages referenced by the page's embedded script.
func (a *Analyser) Analyse(pg *page.Page) *result.Result {
	if pg.Err != "" {
		log.Println(pg.Err)
		return nil
	}

	// log.Println(pg.Req.Req.URL.String())

	// Binary content: save JPEG images under out/, named after the last
	// path segment of the request URL.
	if pg.ContentType == "image/jpeg" {
		path := strings.Split(pg.Req.Req.URL.String(), "/")
		f, err := os.Create("out/" + path[len(path)-1])
		if err != nil {
			log.Println(err)
			return nil
		}
		defer f.Close()
		f.Write(pg.Raw)
		return nil
	}

	if pg.ContentType != "text/html" {
		return nil
	}
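	// Queue further problem-list pages reachable from anchors whose href starts with "list".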
	pg.Doc.Find("a").Each(func(i int, se *goquery.Selection) {
		href, _ := se.Attr("href")

		if strings.HasPrefix(href, "list") {
			href = "http://acm.hdu.edu.cn/" + href
			req, err := http.NewRequest("GET", href, nil)
			if err != nil {
				log.Println(err)
				return
			}
			pg.AddReq(req)
		}

	})

	// Queue every image on the page for download.
	pg.Doc.Find("img").Each(func(i int, se *goquery.Selection) {
		href, _ := se.Attr("src")
		href = "http://acm.hdu.edu.cn/" + href
		href = page.FixUri(href)
		req, err := http.NewRequest("GET", href, nil)
		if err != nil {
			log.Println("req", err)
			return
		}
		pg.AddReq(req)
	})

	// Problem entries appear as p(...) calls inside the page's script; take
	// the second comma-separated field of each call (the problem id) and
	// queue the corresponding problem page.
	text := pg.Doc.Find("script").Text()
	titlePat := `p\((.*?)\);`
	titleRx := regexp.MustCompile(titlePat)
	match := titleRx.FindAllString(text, -1)
	for _, m := range match {
		pro := strings.Split(m, ",")
		if len(pro) >= 2 {
			href := "http://acm.hdu.edu.cn/showproblem.php?pid=" + pro[1]
			req, err := http.NewRequest("GET", href, nil)
			if err != nil {
				log.Println(err)
				continue
			}
			pg.AddReq(req)
		}
	}
	return nil
}

func NewAnalyser() *Analyser {
	return &Analyser{}
}

License

Under the MIT License.
