initial version

everypolitician-scrapers · Sep 11, 2015 · 8de8c65 · 8de8c65
1 parent d56f696
commit 8de8c65
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 48 deletions.
diff --git a/Gemfile b/Gemfile
@@ -7,4 +7,5 @@ source "https://rubygems.org"
 ruby "2.0.0"
 
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem "nokogiri"
+gem "open-uri-cached"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -10,38 +10,22 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
-      unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
-      domain_name (~> 0.5)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
-      domain_name (~> 0.5, >= 0.5.1)
-      http-cookie (~> 1.0)
-      mime-types (~> 2.0)
-      net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
-      ntlm-http (~> 0.1, >= 0.1.1)
-      webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
     mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
     nokogiri (1.6.6.2)
       mini_portile (~> 0.6.0)
-    ntlm-http (0.1.1)
+    open-uri-cached (0.0.5)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
-    unf (0.1.4)
-      unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  mechanize
+  nokogiri
+  open-uri-cached
   scraperwiki!
+
+BUNDLED WITH
+   1.10.6
diff --git a/scraper.rb b/scraper.rb
@@ -1,25 +1,62 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+# #!/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'open-uri/cached'
+require 'date'
+
+OpenURI::Cache.cache_path = '.cache'
+
+class String
+  def tidy
+    self.gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+def scrape_list(url)
+  noko = noko_for(url)
+  noko.css('div.delegate_list tr td a/@href').each do |link|
+    bio = URI.join(url, link.to_s)
+    scrape_person(bio)
+  end
+end
+
+def scrape_person(url)
+  noko = noko_for(url)
+  details = noko.css('div.optimize')
+
+  id = url.to_s.gsub(/^.*\.(\d{3})\.\d{3}.*$/, '\1')
+
+  name = details.css('h2').text.to_s.tidy
+  dob = details.xpath('//h4[contains(.,"Year of Birth")]/following-sibling::p[not(position() > 1)]/text()').to_s.tidy
+
+  start_date = details.xpath('//h4[contains(.,"Date of Verification of")]/following-sibling::p[not(position() > 1)]/text()').to_s.tidy
+  start_date = Date.parse(start_date).to_s
+
+  party = details.xpath('//h4[contains(.,"Political party")]/following-sibling::p[not(position() > 1)]/text()').to_s
+  party = party.gsub(/\(.*$/, '').tidy
+  party = '' if party == '-'
+
+  faction = details.xpath('//h4[contains(.,"Parliamentary group")]/following-sibling::p/a[not(position() > 1)]/text()').to_s
+  faction = faction.gsub('Read more ˃˃', '')
+  faction = faction.gsub('Parliamentary Group', '').tidy
+  faction = '' if faction == 'MPs not members of parliamentary groups'
+
+  data = {
+    id: id,
+    name: name,
+    faction: faction,
+    party: party,
+    start_date: start_date,
+    birth_date: dob
+  }
+
+  ScraperWiki.save_sqlite([:id], data)
+end
+
+scrape_list('http://www.parlament.gov.rs/national-assembly/composition/members-of-parliament/current-legislature.487.html')