Permalink
Browse files

Added the get-awards function, which will return all of the parsed information in a format that the caller can store. The validate-award-link function now only returns valid annual award pages.
  • Loading branch information...
1 parent a1bd480 commit 0d09e1ca67a5403acc9ee4105666e110be42882e @rippinrobr committed Oct 31, 2011
Showing with 15 additions and 8 deletions.
  1. +2 −4 hugoclr.clj
  2. +13 −4 hugoclr/parser.clj
View
@@ -9,10 +9,8 @@
(def base-url "http://www.thehugoawards.org/hugo-history/")
(defn print-href-value [anchor]
- (println (.Value (first (.Attributes anchor)))))
+ (.Value (first (.Attributes anchor))))
(defn -main [& args]
(println (str "base-url: " base-url))
- (let [nodes (hugoclr.parser/get-award-links hugoclr/base-url)]
- (println (apply str (map print-href-value nodes)))))
- ;; (println (.Value (first (.Attributes (first nodes)))))))
+ (hugoclr.parser/get-awards base-url))
View
@@ -3,14 +3,23 @@
(System.Reflection.Assembly/LoadWithPartialName "HtmlAgilityPack")
(ns hugoclr.parser )
-
+
(defstruct work :winner :title :author)
(defstruct category :award :books :year)
(defn fetch-url [url]
- (println (str "retrieving " url "..."))
+ "fetches the web page and converts it into a .NET object"
(.Load (new HtmlAgilityPack.HtmlWeb) url))
-(defn get-award-links [url]
+(defn get-links [url]
+ "Gets all <a> that match the xpath and returns a collection of .NET objects"
(.SelectNodes (.DocumentNode (hugoclr.parser/fetch-url url)) "//li[@class]/a[@href]"))
- ;; "//li[@class]/a[@href]"))
+
+(defn validate-award-link
+ "Filters out all non-award links so that I only retrieve pages that list nominees and
+ winners"
+ [url] (re-matches #".*hugo-history.*/.+" (.Value (first (.Attributes url)))))
+
+(defn get-awards [url]
+ (let [links (get-links url)]
+ (apply str (println (filter #(not (nil? %)) (map validate-award-link links))))))

0 comments on commit 0d09e1c

Please sign in to comment.