Commit

add first morph-based version
robbi5 committed Dec 5, 2017
1 parent 159936e commit 6663dbb
Showing 3 changed files with 191 additions and 41 deletions.
Gemfile: 8 changes (5 additions, 3 deletions)
@@ -4,7 +4,9 @@

 source "https://rubygems.org"

-ruby "2.0.0"
+ruby '2.4.2'

-gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
+gem 'mechanize', '~> 2.7.5'
+gem 'addressable', '~> 2.5.2'
+gem 'activesupport', '~> 5.1.4'
Gemfile.lock: 48 changes (35 additions, 13 deletions)
@@ -10,38 +10,60 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    activesupport (5.1.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (~> 0.7)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    concurrent-ruby (1.0.5)
+    domain_name (0.5.20170404)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    i18n (0.9.1)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.7.5)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
       net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.3.0)
+    minitest (5.10.3)
+    net-http-digest_auth (1.4.1)
     net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    nokogiri (1.8.1)
+      mini_portile2 (~> 2.3.0)
     ntlm-http (0.1.1)
+    public_suffix (3.0.1)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
+    thread_safe (0.3.6)
+    tzinfo (1.2.4)
+      thread_safe (~> 0.1)
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.4)
+    webrobots (0.1.2)

 PLATFORMS
   ruby

 DEPENDENCIES
-  mechanize
+  activesupport (~> 5.1.4)
+  addressable (~> 2.5.2)
+  mechanize (~> 2.7.5)
   scraperwiki!
+
+BUNDLED WITH
+   1.16.0
scraper.rb: 176 changes (151 additions, 25 deletions)
@@ -1,25 +1,151 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'scraperwiki'
require 'mechanize'
require 'addressable/uri'
require 'date'
require 'json'
require 'digest/sha2'
require 'active_support/core_ext/hash/slice'

LIST_URL = 'https://www.bmi.bund.de/SiteGlobals/Forms/suche/gesetzgebungsverfahren-formular.html'
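# (the search form on bmi.bund.de that lists "Gesetzgebungsverfahren",
# i.e. legislative procedures of the Federal Ministry of the Interior)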

m = Mechanize.new
mp = m.get(LIST_URL)

BASE = mp.bases.first.href
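# Mechanize exposes the page's <base> elements via #bases; relative hrefs
# below are resolved against this rather than against LIST_URL.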

results = []

loop do
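  # Each results page lists hits as ".c-search-teaser.Law" teasers. Collect
  # the title and absolute overview URL of every hit, then follow the forward
  # pagination link; the loop ends on the last page, where that link is gone.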
  result_items = mp.css('.c-search-teaser.Law')
  result_items.each do |row|
    link = row.css('.c-search-teaser__h a').first
    title = link.text.strip
    path = link.attributes['href']
    url = Addressable::URI.join(BASE, path).normalize.to_s

    x = {
      title: title,
      overview: url
    }
    results << x
  end

  link = mp.at_css('.navIndex .forward a')
  break if link.nil?

  path = link.attributes['href']
  url = Addressable::URI.join(BASE, path).normalize.to_s

  mp = m.get url
end

# Phase 2: scrape detail pages
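# The detail pages group documents under German headlines:
# "Referentenentwurf" is the ministry's draft bill; the other three entries
# are variants of "statements (by associations)" on that draft.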
headline2key = {
  'Referentenentwurf' => :draft,
  'Verbandsstellungnahme' => :statement,
  'Verbändestellungnahmen' => :statement,
  'Stellungnahmen' => :statement
}

def link_object(link)
  title = link.text.strip
  path = link.attributes['href'].to_s.strip
  linkobj(title, Addressable::URI.join(BASE, path).normalize.to_s)
end

def linkobj(title, uri)
  uri = Addressable::URI.parse(uri).normalize
  url = uri.to_s

  filename = "#{Digest::SHA256.hexdigest(url)[0...8]}_#{uri.basename.to_s}"
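  # The first 8 hex chars of the URL's SHA-256 keep filenames unique even
  # when different documents share the same basename.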

  {
    title: title,
    url: url,
    filename: filename
  }
end

results.each do |row|
  mp = m.get row[:overview]

  headline = mp.at_css('.c-content-stage__headline')
  row[:title] = headline.text.strip
  row[:law] = []

  linked = mp.css('.c-more__link')
  linked.each do |link|
    path = link.attributes['href'].to_s.strip
    if path.include?('bgbl.de')
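      # Links into bgbl.de (the viewer of the Bundesgesetzblatt, i.e. the
      # Federal Law Gazette) hide the PDF behind a session: extract the
      # document id from the jumpTo= or start= query parameter, rebuild the
      # viewer URL with skin=pdf, open the original link once to obtain a
      # session, then follow the placeholder <img> to the real PDF. If any
      # step fails, the raw link is stored instead (see the rescue below).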
      begin
        title = link.text.strip
        uri = Addressable::URI.join(path).normalize
        query = uri.normalized_query
        if query.include?('&jumpTo=')
          jumpTo = query.match(/jumpTo=(.+?)(?:&|$)/)[1]
          start = "%2F%2F%2A%5B%40attr_id%3D%27#{jumpTo}%27%5D"
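          # the percent-encoded form of //*[@attr_id='<jumpTo>']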
        else
          start = query.match(/start=(.+?)(?:&|$)/)[1]
        end
        bgbluri = Addressable::URI.parse("https://www.bgbl.de/xaver/bgbl/text.xav?skin=pdf&start=#{start}").normalize.to_s
        m.get(uri) # unlock session
        bgblpage = m.get(bgbluri)
        fakeimg = bgblpage.at_css('.xaver-PDF img')
        pdfurl = fakeimg.attributes['src']
        pdf = m.get(pdfurl)

        a = linkobj(title, pdf.uri.to_s)
        a[:source] = bgbluri
        row[:law] << a
      rescue
        row[:law] << link_object(link)
      end
    else
      row[:law] << link_object(link)
    end
  end

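  # Further documents sit in a link list grouped under <h3> headlines;
  # keep only the categories mapped in headline2key.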
  container = mp.at_css('.c-content-linklist__wrapper.row')
  container.css('h3').each do |headline|
    title = headline.text.strip
    key = headline2key[title]
    next if key.nil?

    row[key] = []

    links = headline.next_element.css('a')
    links.each do |link|
      row[key] << link_object(link)
    end
  end
end

# Phase 3: cross-reference each law on buzer.de and pick up a diff link where one exists
results.each do |row|
  row[:law].each do |law|
    buzer_uri = nil
    if law[:url].include?('bgbl.de')
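      # Gazette PDF names encode part, two-digit year and first page
      # (bgbl<part><yy>s<page>_<n>.pdf); buzer.de's search finds the law
      # via "20<yy>+<page>".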
      um = law[:url].match(/\/bgbl[12](\d+)s(\d+)_(?:\d+)\.pdf$/)
      next if um.nil?
      buzer_uri = "https://www.buzer.de/s1.htm?a=&g=20#{um[1]}+#{um[2]}"
    else
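      # A title-based search is prepared here, but the `next` right after it
      # bails out, so for now only bgbl.de-backed laws get a buzer link.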
      buzer_uri = Addressable::URI.parse("https://www.buzer.de/s1.htm?a=&g=#{row[:title]}").normalize.to_s
      next
    end
    next if buzer_uri.nil?
    law[:buzer] = buzer_uri

    page = m.get(buzer_uri)
    link = page.at_css('div.g a[href$="l.htm"]')
    next if link.nil?
    law[:buzer_diff] = link.attributes['href'].to_s.strip
  end
end

results.each do |row|
  key = row[:title].downcase.gsub(/[\s.\/_]/, ' ').squeeze(' ').strip.gsub(/[^\w -]/, '').tr(' ', '-')
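  # key is a URL-safe slug of the title and ties the four tables together;
  # spaces survive the cleanup gsub so the final tr can hyphenate them.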
  ScraperWiki.save_sqlite([:key], row.slice(:title, :overview).merge({key: key}))
  ScraperWiki.save_sqlite([:key], row[:law].map { |o| o.merge({ key: key }) }, 'law') unless row[:law].nil?
  ScraperWiki.save_sqlite([:key], row[:draft].map { |o| o.merge({ key: key }) }, 'draft') unless row[:draft].nil?
  ScraperWiki.save_sqlite([:key], row[:statement].map { |o| o.merge({ key: key }) }, 'statement') unless row[:statement].nil?
end
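
# For later inspection, something along the lines of the deleted template's
# select example above would read the results back; 'some-slug' here is a
# hypothetical key value:
#   ScraperWiki.select("* from law where key='some-slug'")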
