Permalink
Browse files

Parsing everything now for the 2000-to-date awards. Just have a little cleanup left to do.
  • Loading branch information...
1 parent bc04d67 commit 166498986fbe5338ae5873cdc885edf0b021d05f @rippinrobr committed Nov 12, 2011
Showing with 14 additions and 16 deletions.
  1. +2 −1 hugoclr/data/csv.clj
  2. +12 −15 hugoclr/parser.clj
View
@@ -2,12 +2,13 @@
(defn delimit
[year books]
- (map #(str year "," (:winner %) "," (:title %) "," (:publisher %) "\r\n") books))
+ (map #(str year "," (:winner %) "," (:title %) "," (:author %) "," (:publisher %) "\r\n") books))
(defn write-to-file
[categories file-name]
(let [stream (System.IO.StreamWriter. file-name)
lines (map #(delimit (:year (first %)) (:books (first %))) categories)]
+ (print lines)
(.Write stream (apply str (map #(reduce str "" %) lines)))
(.Close stream)))
View
@@ -14,11 +14,12 @@
(defn my-regex
[pattern target-str]
- (second (re-matches pattern target-str)))
+ (last (re-matches pattern target-str)))
(defn get-html-elements [url xpath]
"Gets all <a> that match the xpath and returns a collection of .NET objects"
- (.SelectNodes (.DocumentNode (hugoclr.parser/fetch-url url)) xpath))
+ (let [nodes (.SelectNodes (.DocumentNode (hugoclr.parser/fetch-url url)) xpath)]
+ nodes))
(defn validate-award-link
"Filters out all non-award links so that I only retrieve pages that list nominees and
@@ -33,7 +34,6 @@
(defn get-work-author-and-publisher
[li-node]
- ;; (println (nil? li-node))
(if (nil? (.ChildNodes li-node))
(.InnerHtml li-node)
(.InnerHtml (second (.ChildNodes li-node)))))
@@ -46,36 +46,33 @@
(if (and (not (nil? (.Attributes li-node))) (> (.Count (.Attributes li-node)) 0))
(= "winner" (.Value (first (.Attributes li-node))))
false))
+
(defn create-work-struct
[li-node]
+ (println (.InnerHtml li-node))
(struct work (check-for-winner li-node) (get-work-title li-node)
- (my-regex #".*,\s+(.*)\s+[\(\[].*" (.InnerHtml li-node))
+ (my-regex #".*</em>\s*(by|,)\s+(.*)\s+[\[\(].*" (.InnerHtml li-node))
(my-regex #".*[\(\[](.*)[\)\]].*" (.InnerHtml li-node))))
(defn create-works-seq
- [lis]
- (map create-work-struct (seq lis)))
- ;; (map #(struct work (check-for-winner %) (get-work-title %) (my-regex #"^,\s+(.*)\s+[\(\]].*" (get-work-author-and-publisher %)) "") (seq lis)))
-
+ [lis] (map create-work-struct (seq lis)))
+
(defn create-category-struct
- [p-node]
- (let [ul (.NextSibling (.NextSibling p-node))
+ [ul]
+ (let [p-node (.PreviousSibling (.PreviousSibling ul))
lis (filter #(= "li" (.OriginalName %)) (rest (.ChildNodes ul)))]
(struct category (get-category-heading p-node) (create-works-seq (seq lis)) (get-year p-node))))
(defn parse-awards-page
"Gets all the book related sections of the web page. The first 5 items are book related."
[award-url]
- ;; this will get the author and publisher of the work: (.InnerHtml (first (.ChildNodes (second (.ChildNodes ul)))))
- (let [top-node (get-html-elements award-url "//div[@id='content']/p[not(@class)][2]")]
- ;; top-node))
- (map create-category-struct top-node)))
+ (let [top-node (get-html-elements award-url "//div[@id='content']/ul")]
+ (map create-category-struct top-node)))
(defn get-awards
"gets all the links from the http://www.thehugoawards.org/hugo-history/ that lead to a awards page"
[url]
(let [links (get-html-elements url "//li[@class]/a[@href]")
award-links (filter #(not (nil? %)) (map validate-award-link links))]
- ;; award-links))
(map parse-awards-page (take 12 award-links))))

0 comments on commit 1664989

Please sign in to comment.