This repository has been archived by the owner on May 30, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
article.go
118 lines (102 loc) · 2.69 KB
/
article.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"strings"
)
// CSS selectors, relative to the selection handed to Article.Parse, that locate
// the pieces of markup each parse step reads.
const (
// ARTICLE_TITLE_SELECTOR matches the title link; parseTitle reads its href and text.
ARTICLE_TITLE_SELECTOR = ".gs_rt > a"
// ARTICLE_HEADER_SELECTOR matches the header whose text parseHeader passes to parseYear.
ARTICLE_HEADER_SELECTOR = ".gs_a"
// ARTICLE_FOOTER_SELECTOR matches the footer whose links parseFooter inspects.
ARTICLE_FOOTER_SELECTOR = ".gs_fl"
// ARTICLE_SIDEBAR_SELECTOR matches the side-bar link; parseSideBar reads its href and text.
ARTICLE_SIDEBAR_SELECTOR = ".gs_md_wp > a"
)
// Article holds the values scraped for one article by Parse. Every field is
// kept as a string exactly as the corresponding parse helper returned it.
type Article struct {
// Title is the text of the title link.
Title string
// Year is the result of parseYear applied to the header text.
Year string
// Authors []string
// URL is the href of the title link.
URL string
// ClusterId is extracted by parseClusterId from the "/scholar?cites" footer link's href.
ClusterId string
// NumberOfCitations is extracted from the text of the "/scholar?cites" footer link.
NumberOfCitations string
// NumberOfVersions is extracted from the text of the "/scholar?cluster" footer link.
NumberOfVersions string
// InfoId is extracted by parseInfoId from the "/scholar?q=related" footer link's href.
InfoId string
// PDFLink is the href of the side-bar link.
PDFLink string
// PDFSource is the result of parsePDFSource applied to the side-bar link's text.
PDFSource string
// Bibtex string
}
// NewArticle returns a pointer to a freshly allocated Article whose fields are
// all empty strings.
func NewArticle() *Article {
	return new(Article)
}
// Parse populates the article from s by running the title, header, footer and
// side-bar parsers, in that order.
//
// useBibTeX is accepted but has no effect at present: the only code that
// consults it is commented out below.
func (a *Article) Parse(s *goquery.Selection, useBibTeX bool) {
	steps := [...]func(*goquery.Selection){
		a.parseTitle,
		a.parseHeader,
		a.parseFooter,
		a.parseSideBar,
	}
	for _, step := range steps {
		step(s)
	}
	/*
		if useBibTeX {
			a.crawlAndParseBibTeX()
		}
	*/
}
// parseTitle stores the text of the title link found under s in Title and the
// link's href in URL.
func (a *Article) parseTitle(s *goquery.Selection) {
	link := s.Find(ARTICLE_TITLE_SELECTOR)
	a.Title = link.Text()

	// The "exists" flag is deliberately dropped: an absent href simply
	// leaves URL as the empty string.
	href, _ := link.Attr("href")
	a.URL = href
}
// parseHeader reads the text of the header element found under s and stores
// whatever parseYear extracts from it in Year.
func (a *Article) parseHeader(s *goquery.Selection) {
	headerText := s.Find(ARTICLE_HEADER_SELECTOR).Text()
	a.Year = parseYear(headerText)
}
// parseFooter walks every link in the footer found under s and, depending on
// the prefix of the link's href, fills in the matching fields:
//
//   - "/scholar?cites"     -> ClusterId (from the href) and NumberOfCitations (from the text)
//   - "/scholar?cluster"   -> NumberOfVersions (from the text)
//   - "/scholar?q=related" -> InfoId (from the href)
//
// Links with any other href are ignored.
func (a *Article) parseFooter(s *goquery.Selection) {
	links := s.Find(ARTICLE_FOOTER_SELECTOR).Find("a")
	links.Each(func(_ int, link *goquery.Selection) {
		// A link without an href yields "" and matches none of the cases.
		href, _ := link.Attr("href")
		label := link.Text()

		// The three prefixes are mutually exclusive, so at most one case applies.
		switch {
		case strings.HasPrefix(href, "/scholar?cites"):
			a.ClusterId = parseClusterId(href) // TODO: both
			a.NumberOfCitations = parseNumberOfCitations(label)
		case strings.HasPrefix(href, "/scholar?cluster"):
			a.NumberOfVersions = parseNumberOfVersions(label) // TODO: fix
		case strings.HasPrefix(href, "/scholar?q=related"):
			a.InfoId = parseInfoId(href)
		}
	})
}
// parseSideBar stores the href of the side-bar link found under s in PDFLink
// and the result of parsePDFSource on the link's text in PDFSource.
func (a *Article) parseSideBar(s *goquery.Selection) {
	link := s.Find(ARTICLE_SIDEBAR_SELECTOR)

	// The "exists" flag is deliberately dropped: an absent href simply
	// leaves PDFLink as the empty string.
	href, _ := link.Attr("href")
	a.PDFLink = href

	a.PDFSource = parsePDFSource(link.Text())
}
/*
func (a *Article) crawlAndParseBibTeX() {
popURL, err := CitePopUpQuery(a.InfoId)
if err != nil {
log.Fatal(err)
}
popDoc, err := goquery.NewDocument(popURL)
if err != nil {
log.Fatal(err)
}
bibURL, _ := popDoc.Find("#gs_citi > a:first-child").Attr("href")
bibDoc, err := goquery.NewDocument(SCHOLAR_URL + bibURL)
if err != nil {
log.Fatal(err)
}
a.Bibtex = bibDoc.Text()
}
*/
// dump writes the article's fields to standard output, one labelled line per
// field. It is a debugging aid and returns nothing.
func (a *Article) dump() {
	fmt.Println("title :", a.Title)
	fmt.Println("year :", a.Year)
	// fmt.Println("autho :", a.Author)
	fmt.Println("url: ", a.URL)
	fmt.Println("cluster_id: ", a.ClusterId)
	// Each count goes under its own label; the two fields were previously
	// printed under each other's label.
	fmt.Println("# of citations: ", a.NumberOfCitations)
	fmt.Println("# of versions: ", a.NumberOfVersions)
	fmt.Println("infor id: ", a.InfoId)
	fmt.Println("pdfLink: ", a.PDFLink)
	fmt.Println("pdfSource: ", a.PDFSource)
	// fmt.Println("BibTeX: ", a.Bibtex)
}