Permalink
Browse files

parser.clj has been cleaned up and commented

  • Loading branch information...
1 parent 3d15b27 commit 50df294a3802592ddb59dc26fcd622d9a4bbb72b @rippinrobr committed Nov 13, 2011
Showing with 43 additions and 29 deletions.
  1. +43 −29 hugoclr/parser.clj
View
@@ -1,23 +1,31 @@
+;;---------------------------------------------------------------------------
+;; parser.clj
+;; The code in this file is used to parse the HUGO award pages and create
+;; a record for each nominee/winner
+;;
;; I ran gacutil -i HtmlAgilityPack in the libs dir as administrator
;; to load the dll into the gac for simplicity.
+;;---------------------------------------------------------------------------
(System.Reflection.Assembly/LoadWithPartialName "HtmlAgilityPack")
(ns hugoclr.parser )
-
-(defstruct work :winner :title :author :publisher)
-(defstruct category :award :books :year)
+
+;; stores the information about each nominee
+(defrecord Work [winner title author publisher])
+
+;; stores the nominees and winner(s) for a category
+;; in this case the Best Novel category
+(defrecord Category [award books year])
(defn fetch-url [url]
- "fetches the web page and converts it into a .NET object"
+ "fetches the web page and converts it into a .NET object using the
+ HtmlAgilityPack assembly"
(println (str "fetching " url ))
(.Load (new HtmlAgilityPack.HtmlWeb) url))
-(defn my-regex
- [pattern target-str]
- (last (re-matches pattern target-str)))
-
(defn get-html-elements [url xpath]
- "Gets all <a> that match the xpath and returns a collection of .NET objects"
+ "Gets all <a> that match the xpath and returns a collection of .NET objects that
+ represents the <a> nodes"
(let [nodes (.SelectNodes (.DocumentNode (hugoclr.parser/fetch-url url)) xpath)]
nodes))
@@ -27,48 +35,54 @@
[url] (re-matches #".*hugo-history.*/.+" (.Value (first (.Attributes url)))))
(defn get-year
+ "Gets the year for the category being parsed. It retrieves the year from the <h2> tag.
+ It traverses the DOM hierarchy to get to the h2 tag and grab the text. Then it takes the
+ first 4 chars, which represent the year."
[p-node] (apply str (take 4 (.InnerHtml (second (.ChildNodes (.ParentNode p-node)))))))
(defn get-work-title
+ "parses the book's/work's title from the em tags."
[li-node] (.InnerHtml (first (.ChildNodes li-node))))
-(defn get-work-author-and-publisher
- [li-node]
- (if (nil? (.ChildNodes li-node))
- (.InnerHtml li-node)
- (.InnerHtml (second (.ChildNodes li-node)))))
-
-(defn get-category-heading
- [p-node] (.InnerHtml (first (.SelectNodes p-node "./strong"))))
+(defn get-category-heading
+ "parses the category's title. We could use this code to grab all of the categories. I'm
+ only interested in the novels."
+ [p-node] (.InnerHtml (first (.SelectNodes p-node "./strong"))))
(defn check-for-winner
+ "checks to see if the class attribute of the li tag is set to winner. If so the work in
+ question was the winner in the HUGO category."
[li-node]
(if (and (not (nil? (.Attributes li-node))) (> (.Count (.Attributes li-node)) 0))
(= "winner" (.Value (first (.Attributes li-node))))
false))
-(defn create-work-struct
+(defn create-work-record
+ "Simply creates a record that represents a work "
[li-node]
- (struct work (check-for-winner li-node) (get-work-title li-node)
- (my-regex #".*</em>\s*(by|,)\s+(.*)\s+[\[\(].*" (.InnerHtml li-node))
- (my-regex #".*[\(\[](.*)[\)\]].*" (.InnerHtml li-node))))
+ (Work. (check-for-winner li-node) (get-work-title li-node)
+ (last (re-matches #".*</em>\s*(by|,)\s+(.*)\s+[\[\(].*" (.InnerHtml li-node)))
+ (last (re-matches #".*[\(\[](.*)[\)\]].*" (.InnerHtml li-node)))))
(defn create-works-seq
- [lis] (map create-work-struct (seq lis)))
+ "Creates all of the works in a given category represented by the sequence of li tags passed
+ in"
+ [lis] (map create-work-record (seq lis)))
-(defn create-category-struct
+(defn create-category-record
+ "Creates a record that represents the category represented by the passed in ul tag."
[ul]
(let [p-node (.PreviousSibling (.PreviousSibling ul))
lis (filter #(= "li" (.OriginalName %)) (rest (.ChildNodes ul)))]
- (struct category (get-category-heading p-node) (create-works-seq (seq lis)) (get-year p-node))))
-
+ (Category. (get-category-heading p-node) (create-works-seq (seq lis)) (get-year p-node))))
+
(defn parse-awards-page
- "Gets all the book related sections of the web page. The first 5 items are book related."
+ "Gets all the ul tags in the book related sections of the web page. After the page has been parsed
+ the category records are created."
[award-url]
(let [top-node (get-html-elements award-url "//div[@id='content']/ul")]
- (map create-category-struct top-node)))
-
-
+ (map create-category-record top-node)))
+
(defn get-awards
"gets all the links from http://www.thehugoawards.org/hugo-history/ that lead to an awards page"
[url]

0 comments on commit 50df294

Please sign in to comment.