Skip to content

Commit

Permalink
parser.clr has been cleaned and commented
Browse files Browse the repository at this point in the history
  • Loading branch information
rippinrobr committed Nov 13, 2011
1 parent 3d15b27 commit 50df294
Showing 1 changed file with 43 additions and 29 deletions.
72 changes: 43 additions & 29 deletions hugoclr/parser.clj
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
;;---------------------------------------------------------------------------
;; parser.clj
;; The code in this file is used to parse the HUGO award pages and create
;; a record for each nominee/winner
;;
;; I ran gacutil -i HtmlAgilityPack in the libs dir as administrator
;; to load the dll into the gac for simplicity.
;;---------------------------------------------------------------------------
;; Load the HtmlAgilityPack assembly by partial name before the namespace is
;; declared (the file header notes the dll was installed into the GAC).
;; NOTE(review): Assembly.LoadWithPartialName is marked obsolete in .NET --
;; confirm it is still acceptable on the target runtime.
(System.Reflection.Assembly/LoadWithPartialName "HtmlAgilityPack")

(ns hugoclr.parser )

;; struct-map definitions carrying the same keys as the Work and Category
;; records defined below
(defstruct work :winner :title :author :publisher)
(defstruct category :award :books :year)

;; stores the information about each nominee
(defrecord Work [winner title author publisher])

;; stores the nominees and winner(s) for a category
;; in this case the Best Novel category
(defrecord Category [award books year])

(defn fetch-url
  "Prints a progress message, then fetches the web page at url and returns
  the .NET object produced by the Load method of an HtmlAgilityPack HtmlWeb
  instance."
  [url]
  (println (str "fetching " url ))
  (.Load (new HtmlAgilityPack.HtmlWeb) url))

(defn my-regex
  "Runs re-matches with pattern against target-str and returns the last
  element of the result (the final capture group when the pattern has
  groups). Returns nil when the whole string does not match."
  [pattern target-str]
  (let [match (re-matches pattern target-str)]
    (last match)))

(defn get-html-elements
  "Fetches the page at url and returns the collection of .NET node objects
  that SelectNodes finds for xpath, starting from the document's root node."
  [url xpath]
  (.SelectNodes (.DocumentNode (fetch-url url)) xpath))

Expand All @@ -27,48 +35,54 @@
[url] (re-matches #".*hugo-history.*/.+" (.Value (first (.Attributes url)))))

(defn get-year
  "Returns the year of the category being parsed as a string of at most four
  characters. Starting at p-node it moves up to the parent node, takes that
  parent's second child (expected to be the <h2> heading) and keeps the first
  4 chars of its inner HTML."
  [p-node]
  (let [heading (-> p-node .ParentNode .ChildNodes second)]
    (->> (.InnerHtml heading)
         (take 4)
         (apply str))))

(defn get-work-title
  "Returns the title of the book/work: the inner HTML of the first child node
  of li-node (expected to be the <em> element)."
  [li-node]
  (-> li-node .ChildNodes first .InnerHtml))

(defn get-work-author-and-publisher
  "Returns the inner HTML holding the author/publisher text. When li-node has
  no ChildNodes collection the inner HTML of li-node itself is returned;
  otherwise the inner HTML of its second child node is returned."
  [li-node]
  (let [children (.ChildNodes li-node)
        source   (if (nil? children)
                   li-node
                   (second children))]
    (.InnerHtml source)))

(defn get-category-heading
  "Parses the category's title: the inner HTML of the first <strong> child of
  p-node. We could use this code to grab all of the categories. I'm only
  interested in the novels."
  [p-node]
  (.InnerHtml (first (.SelectNodes p-node "./strong"))))

(defn check-for-winner
  "Returns true when li-node has at least one attribute and the value of its
  first attribute is \"winner\" (the class attribute marking the winning work
  in the HUGO category); returns false otherwise."
  [li-node]
  (let [attrs (.Attributes li-node)]
    (and (not (nil? attrs))
         (pos? (.Count attrs))
         (= "winner" (.Value (first attrs))))))

(defn create-work-record
  "Creates a Work record for the work described by li-node. The winner flag
  comes from check-for-winner and the title from get-work-title. The author
  and publisher are the last capture groups of two regular expressions run
  against the inner HTML of li-node; each is nil when its pattern does not
  match."
  [li-node]
  (let [html (.InnerHtml li-node)]
    (Work. (check-for-winner li-node)
           (get-work-title li-node)
           ;; author: text after </em> and "by" or ",", up to the opening ( or [
           (last (re-matches #".*</em>\s*(by|,)\s+(.*)\s+[\[\(].*" html))
           ;; publisher: text between the brackets/parentheses
           (last (re-matches #".*[\(\[](.*)[\)\]].*" html)))))

(defn create-works-seq
  "Creates all of the works in a given category: returns a lazy sequence with
  one Work record (built by create-work-record) per li tag passed in."
  [lis]
  (map create-work-record lis))

(defn create-category-record
  "Creates a Category record for the category represented by the passed in ul
  tag. The heading and the year are read from the node two siblings before ul;
  the works are built from the children of ul whose original name is \"li\"."
  [ul]
  (let [p-node (.PreviousSibling (.PreviousSibling ul))
        ;; the first child node is skipped before filtering for li elements
        lis (filter #(= "li" (.OriginalName %)) (rest (.ChildNodes ul)))]
    (Category. (get-category-heading p-node) (create-works-seq (seq lis)) (get-year p-node))))

(defn parse-awards-page
  "Gets all the ul tags directly under the div with id 'content' on the page at
  award-url (the book related sections of the web page). After the page has
  been parsed, returns a lazy sequence with one category record per ul tag."
  [award-url]
  (let [top-node (get-html-elements award-url "//div[@id='content']/ul")]
    (map create-category-record top-node)))

(defn get-awards
"gets all the links from the http://www.thehugoawards.org/hugo-history/ that lead to a awards page"
[url]
Expand Down

0 comments on commit 50df294

Please sign in to comment.