/
parser.clj
88 lines (73 loc) · 3.77 KB
/
parser.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
;;---------------------------------------------------------------------------
;; parser.clj
;; The code in this file is used to parse the HUGO award pages and create a
;; a record for each nonimee/winner
;;---------------------------------------------------------------------------
(assembly-load-from "..\\libs\\HtmlAgilityPack.dll")
(ns hugoclr.parser )
;; stores the information about each nominee
(defrecord Work [winner title author publisher])
;; stores the nominees and winner(s) for a category
;; in this case the Best Novel category
(defrecord Category [award books year])
(defn fetch-url [url]
"fetches the web page and converts it into a .NET object using the
HtmlAgilitPack assembly"
(println (str "fetching " url ))
(.Load (new HtmlAgilityPack.HtmlWeb) url))
(defn- get-html-elements [url xpath]
"Gets all <a> that match the xpath and returns a collection of .NET objects that
represents the <a> nodes"
(let [nodes (.SelectNodes (.DocumentNode (hugoclr.parser/fetch-url url)) xpath)]
nodes))
(defn- validate-award-link
"Filters out all non-award links so that I only retrieve pages that list nominees and
winners."
[url] (re-matches #".*hugo-history.*/.+" (.Value (first (.Attributes url)))))
(defn- get-year
"Gets the year for the category being parsed. It retrieves the year from the <h2> tag.
It traverses the DOM heirarchy to get to the h2 tag and grab the text. Then it takes the
first 4 chars which represents the year."
[p-node] (apply str (take 4 (.InnerHtml (second (.ChildNodes (.ParentNode p-node)))))))
(defn- get-work-title
"parses the books/works title from the em tags."
[li-node] (.InnerHtml (first (.ChildNodes li-node))))
(defn- get-category-heading
"parses the category's title. We could use this code to grab all of the categories. I'm
only interested in the novels."
[p-node] (.InnerHtml (first (.SelectNodes p-node "./strong"))))
(defn- check-for-winner
"checks to see if the class attribute of the li tag is set to winner. If so the work in
question was the winner in the HUGO category."
[li-node]
(if (and (not (nil? (.Attributes li-node))) (> (.Count (.Attributes li-node)) 0))
(= "winner" (.Value (first (.Attributes li-node))))
false))
(defn- create-work-record
"Simply creates a record that represents a work "
[li-node]
(Work. (check-for-winner li-node) (get-work-title li-node)
(last (re-matches #".*</em>\s*(by|,)\s+(.*)\s+[\[\(].*" (.InnerHtml li-node)))
(last (re-matches #".*[\(\[](.*)[\)\]].*" (.InnerHtml li-node)))))
(defn- create-works-seq
"Creates all of the works in a given category represented by the sequence of li tags passed
in"
[lis] (map create-work-record (seq lis)))
(defn- create-category-record
"Creates a record that represents the category represented by the passed in ul tag."
[ul]
(let [p-node (.PreviousSibling (.PreviousSibling ul))
lis (filter #(= "li" (.OriginalName %)) (rest (.ChildNodes ul)))]
(Category. (get-category-heading p-node) (create-works-seq (seq lis)) (get-year p-node))))
(defn parse-awards-page
"Gets all the ul tags in the book related sections of the web page. After the page has been parsed
the category records are created."
[award-url]
(let [top-node (get-html-elements award-url "//div[@id='content']/ul")]
(map create-category-record top-node)))
(defn get-awards
"gets all the links from the http://www.thehugoawards.org/hugo-history/ that lead to a awards page"
[url]
(let [links (get-html-elements url "//li[@class]/a[@href]")
award-links (filter #(not (nil? %)) (map validate-award-link links))]
(map parse-awards-page (take 12 award-links))))