/
page.go
112 lines (92 loc) · 1.93 KB
/
page.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package page
import (
"encoding/json"
"io"
"io/ioutil"
"log"
// "log"
// "bufio"
"bytes"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/sakeven/spidergo/lib/request"
"golang.org/x/net/html/charset"
)
// Page is the result of fetching one request: response metadata, the raw
// payload, and whichever parsed representation matches the content type.
type Page struct {
	// Req is the request that was passed to New for this page.
	Req *request.Request
	// Cookies are the cookies reported by the HTTP response.
	Cookies []*http.Cookie
	// StatusCode is the HTTP status code of the response.
	StatusCode int
	// ContentType is the value of the response's Content-type header.
	ContentType string
	// OriCharset is not assigned anywhere in the visible code.
	// NOTE(review): presumably the charset the payload had before conversion — confirm.
	OriCharset string
	// Err holds the text of a parsing error (set by ParseHtml); empty otherwise.
	Err string
	// Failed is not assigned anywhere in the visible code.
	// NOTE(review): presumably set by callers to flag a failed fetch — confirm.
	Failed bool
	// Raw is the response body as read by New, after Iconv has been applied.
	Raw []byte
	// Doc is the goquery document built by ParseHtml from Raw.
	Doc *goquery.Document
	// JsonMap is filled by ParseJson from Raw.
	JsonMap map[string]string
	// Body is Raw converted to a string by ParseText.
	Body string
	// NewReqs collects the requests added through AddReq.
	NewReqs []*http.Request
}
// New builds a Page from req and the HTTP response res obtained for it.
//
// It copies the content type, cookies and status code from res, reads the
// whole body (passing it through Iconv first) into Raw, and then dispatches on
// the content type: text/html is parsed with ParseHtml, application/json with
// ParseJson and text/plain with ParseText. Other content types only get Raw.
//
// The response body is always closed before New returns. New returns nil when
// res or its body is nil, when the body cannot be converted or read, or when a
// panic raised while building the page is recovered (the panic is logged).
func New(req *request.Request, res *http.Response) *Page {
	// Guard first: the deferred cleanup below dereferences res.Body, so a nil
	// response would otherwise panic again inside the defer, after recover.
	if res == nil || res.Body == nil {
		log.Println("page: nil response or response body")
		return nil
	}
	defer func() {
		if e := recover(); e != nil {
			log.Println(e)
		}
		res.Body.Close()
	}()

	page := &Page{
		Req:         req,
		Cookies:     res.Cookies(),
		StatusCode:  res.StatusCode,
		ContentType: res.Header.Get("Content-type"),
		NewReqs:     make([]*http.Request, 0),
	}

	body, err := page.Iconv(res.Body)
	if err != nil {
		log.Println(err)
		return nil
	}
	raw, err := ioutil.ReadAll(body)
	if err != nil {
		log.Println(err)
		return nil
	}
	page.Raw = raw

	// Pick the parser from the content type; unknown types keep only Raw.
	switch contentType := page.ContentType; {
	case contain(contentType, "text/html"):
		page.ParseHtml()
	case contain(contentType, "application/json"):
		page.ParseJson()
	case contain(contentType, "text/plain"):
		page.ParseText()
	}
	return page
}
// Iconv returns the reader New should consume for this page's body.
//
// When the page's content type contains "text", reader is wrapped with
// charset.NewReader using that content type, and the wrapper (or the error
// from creating it) is returned. For every other content type reader is
// returned unchanged with a nil error.
func (p *Page) Iconv(reader io.Reader) (io.Reader, error) {
	// Non-text payloads are passed through untouched.
	if !contain(p.ContentType, "text") {
		return reader, nil
	}
	return charset.NewReader(reader, p.ContentType)
}
// AddReq records req on the page by appending it to NewReqs.
func (p *Page) AddReq(req *http.Request) {
	queued := append(p.NewReqs, req)
	p.NewReqs = queued
}
// ParseHtml builds a goquery document from Raw and stores it in Doc.
// Doc is assigned whatever goquery returns, even on failure; on failure the
// error text is additionally stored in Err.
func (p *Page) ParseHtml() {
	doc, parseErr := goquery.NewDocumentFromReader(bytes.NewReader(p.Raw))
	p.Doc = doc
	if parseErr == nil {
		return
	}
	p.Err = parseErr.Error()
}
// ParseJson decodes Raw as JSON into JsonMap.
//
// A decoding failure is recorded in Err, mirroring ParseHtml, instead of being
// silently dropped. Because JsonMap is a map[string]string, a payload that is
// not a JSON object, or an object with non-string values, reports an error;
// encoding/json still fills in the entries it was able to decode.
func (p *Page) ParseJson() {
	if err := json.Unmarshal(p.Raw, &p.JsonMap); err != nil {
		p.Err = err.Error()
	}
}
// ParseText stores Raw, converted to a string, in Body.
func (p *Page) ParseText() {
	text := string(p.Raw)
	p.Body = text
}
// contain reports whether dst occurs anywhere within src.
// The match is case-sensitive, and an empty dst is contained in every src.
func contain(src string, dst string) bool {
	return strings.Contains(src, dst)
}