Skip to content

Commit

Permalink
initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
struan committed Sep 11, 2015
1 parent d56f696 commit 8de8c65
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 48 deletions.
3 changes: 2 additions & 1 deletion Gemfile
Expand Up @@ -7,4 +7,5 @@ source "https://rubygems.org"
ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "nokogiri"
gem "open-uri-cached"
28 changes: 6 additions & 22 deletions Gemfile.lock
Expand Up @@ -10,38 +10,22 @@ GIT
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
domain_name (~> 0.5)
httpclient (2.6.0.1)
mechanize (2.7.3)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.5)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
ntlm-http (0.1.1)
open-uri-cached (0.0.5)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)

PLATFORMS
ruby

DEPENDENCIES
mechanize
nokogiri
open-uri-cached
scraperwiki!

BUNDLED WITH
1.10.6
87 changes: 62 additions & 25 deletions scraper.rb
@@ -1,25 +1,62 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
# #!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'open-uri/cached'
require 'date'

# Set the open-uri-cached cache location to a ".cache" directory (relative path).
# NOTE(review): presumably this makes repeated runs reuse earlier downloads — confirm.
OpenURI::Cache.cache_path = '.cache'

# Core-class extension providing the text clean-up helper used by the scraper.
class String
  # Returns a new string in which every run of whitespace (anything matched
  # by [[:space:]]) is collapsed to a single space, with leading and trailing
  # spaces removed. The receiver is not modified.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end

# Fetch +url+ and parse the response body as HTML.
#
# url - a String or URI, passed unchanged to +open+ (provided by open-uri).
#
# Returns whatever Nokogiri::HTML produces for the fetched body.
def noko_for(url)
  # Use the block form of +open+ so the handle is closed as soon as the body
  # has been read, instead of staying open until garbage collection.
  Nokogiri::HTML(open(url, &:read))
end

# Walk the listing page at +url+ and scrape every biography it links to.
#
# Each href selected by 'div.delegate_list tr td a/@href' is resolved against
# +url+ with URI.join and handed to scrape_person.
def scrape_list(url)
  hrefs = noko_for(url).css('div.delegate_list tr td a/@href')
  hrefs.each { |href| scrape_person(URI.join(url, href.to_s)) }
end

# Evaluate the shared "heading, then following sibling" XPath against
# +details+ and return the matched text nodes as one String.
#
# details - the node set to call +xpath+ on.
# heading - text that the <h4> must contain.
# tail    - XPath step applied to the heading's following siblings
#           (defaults to the first following <p>).
#
# NOTE(review): the expression begins with "//", so it looks absolute rather
# than relative to +details+ — kept exactly as the original wrote it; confirm
# whether ".//" was intended.
def sibling_text(details, heading, tail = 'p[not(position() > 1)]')
  details.xpath("//h4[contains(.,\"#{heading}\")]/following-sibling::#{tail}/text()").to_s
end

# Scrape one member's biography page and save a single row keyed on :id.
#
# url - String or URI of the biography page.
#
# The row has the keys :id, :name, :faction, :party, :start_date and
# :birth_date. Fields that are absent from the page are stored as ''.
#
# Raises ArgumentError (from Date.parse) when the verification date is
# present but cannot be parsed.
def scrape_person(url)
  noko = noko_for(url)
  details = noko.css('div.optimize')

  # Take the first of two dot-separated three-digit groups in the URL as the
  # id; when the URL has no such pair, gsub leaves the whole URL unchanged.
  id = url.to_s.gsub(/^.*\.(\d{3})\.\d{3}.*$/, '\1')

  name = details.css('h2').text.to_s.tidy
  dob = sibling_text(details, 'Year of Birth').tidy

  # Date.parse raises on an empty string, which used to abort the whole run
  # for a page without a verification date; store '' like other blank fields.
  raw_start = sibling_text(details, 'Date of Verification of').tidy
  start_date = raw_start.empty? ? '' : Date.parse(raw_start).to_s

  # Drop everything from the first "(" onwards; a lone "-" means no party.
  party = sibling_text(details, 'Political party').gsub(/\(.*$/, '').tidy
  party = '' if party == '-'

  # The group name comes from the link text, minus the boilerplate wording.
  faction = sibling_text(details, 'Parliamentary group', 'p/a[not(position() > 1)]')
  faction = faction.gsub('Read more ˃˃', '')
  faction = faction.gsub('Parliamentary Group', '').tidy
  faction = '' if faction == 'MPs not members of parliamentary groups'

  data = {
    id: id,
    name: name,
    faction: faction,
    party: party,
    start_date: start_date,
    birth_date: dob
  }

  ScraperWiki.save_sqlite([:id], data)
end

# Entry point: scrape every member linked from the "current legislature"
# listing page of the parlament.gov.rs National Assembly site.
scrape_list('http://www.parlament.gov.rs/national-assembly/composition/members-of-parliament/current-legislature.487.html')

0 comments on commit 8de8c65

Please sign in to comment.