Commit

add first morph-based version
robbi5 committed Dec 5, 2017
1 parent 159936e commit 6663dbb
Showing 3 changed files with 191 additions and 41 deletions.
Gemfile: 8 changes (5 additions, 3 deletions)
@@ -4,7 +4,9 @@

 source "https://rubygems.org"

-ruby "2.0.0"
+ruby '2.4.2'

-gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
+gem 'mechanize', '~> 2.7.5'
+gem 'addressable', '~> 2.5.2'
+gem 'activesupport', '~> 5.1.4'
Gemfile.lock: 48 changes (35 additions, 13 deletions)
@@ -10,38 +10,60 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    activesupport (5.1.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (~> 0.7)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    concurrent-ruby (1.0.5)
+    domain_name (0.5.20170404)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    i18n (0.9.1)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.7.5)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
       net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.3.0)
+    minitest (5.10.3)
+    net-http-digest_auth (1.4.1)
     net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    nokogiri (1.8.1)
+      mini_portile2 (~> 2.3.0)
     ntlm-http (0.1.1)
+    public_suffix (3.0.1)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
+    thread_safe (0.3.6)
+    tzinfo (1.2.4)
+      thread_safe (~> 0.1)
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.4)
+    webrobots (0.1.2)

 PLATFORMS
   ruby

 DEPENDENCIES
-  mechanize
+  activesupport (~> 5.1.4)
+  addressable (~> 2.5.2)
+  mechanize (~> 2.7.5)
   scraperwiki!
+
+BUNDLED WITH
+   1.16.0
scraper.rb: 176 changes (151 additions, 25 deletions)
@@ -1,25 +1,151 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'scraperwiki'
require 'mechanize'
require 'addressable/uri'
require 'date'
require 'json'
require 'digest/sha2'
require 'active_support/core_ext/hash/slice'

LIST_URL = 'https://www.bmi.bund.de/SiteGlobals/Forms/suche/gesetzgebungsverfahren-formular.html'
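# (the search form on bmi.bund.de that lists "Gesetzgebungsverfahren",
# i.e. legislative procedures of the Federal Ministry of the Interior)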

m = Mechanize.new
mp = m.get(LIST_URL)

BASE = mp.bases.first.href
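# Mechanize exposes the page's <base> elements via #bases; relative hrefs
# below are resolved against this rather than against LIST_URL.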

results = []

loop do
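  # Each results page lists hits as ".c-search-teaser.Law" teasers. Collect
  # the title and absolute overview URL of every hit, then follow the forward
  # pagination link; the loop ends on the last page, where that link is gone.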
  result_items = mp.css('.c-search-teaser.Law')
  result_items.each do |row|
    link = row.css('.c-search-teaser__h a').first
    title = link.text.strip
    path = link.attributes['href']
    url = Addressable::URI.join(BASE, path).normalize.to_s

    x = {
      title: title,
      overview: url
    }
    results << x
  end

  link = mp.at_css('.navIndex .forward a')
  break if link.nil?

  path = link.attributes['href']
  url = Addressable::URI.join(BASE, path).normalize.to_s

  mp = m.get url
end

# Phase 2: scrape detail pages
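# The detail pages group documents under German headlines:
# "Referentenentwurf" is the ministry's draft bill; the other three entries
# are variants of "statements (by associations)" on that draft.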
headline2key = {
  'Referentenentwurf' => :draft,
  'Verbandsstellungnahme' => :statement,
  'Verbändestellungnahmen' => :statement,
  'Stellungnahmen' => :statement
}

def link_object(link)
  title = link.text.strip
  path = link.attributes['href'].to_s.strip
  linkobj(title, Addressable::URI.join(BASE, path).normalize.to_s)
end

def linkobj(title, uri)
  uri = Addressable::URI.parse(uri).normalize
  url = uri.to_s

  filename = "#{Digest::SHA256.hexdigest(url)[0...8]}_#{uri.basename.to_s}"
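  # The first 8 hex chars of the URL's SHA-256 keep filenames unique even
  # when different documents share the same basename.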

  {
    title: title,
    url: url,
    filename: filename
  }
end

results.each do |row|
  mp = m.get row[:overview]

  headline = mp.at_css('.c-content-stage__headline')
  row[:title] = headline.text.strip
  row[:law] = []

  linked = mp.css('.c-more__link')
  linked.each do |link|
    path = link.attributes['href'].to_s.strip
    if path.include?('bgbl.de')
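      # Links into bgbl.de (the viewer of the Bundesgesetzblatt, i.e. the
      # Federal Law Gazette) hide the PDF behind a session: extract the
      # document id from the jumpTo= or start= query parameter, rebuild the
      # viewer URL with skin=pdf, open the original link once to obtain a
      # session, then follow the placeholder <img> to the real PDF. If any
      # step fails, the raw link is stored instead (see the rescue below).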
      begin
        title = link.text.strip
        uri = Addressable::URI.join(path).normalize
        query = uri.normalized_query
        if query.include?('&jumpTo=')
          jumpTo = query.match(/jumpTo=(.+?)(?:&|$)/)[1]
          start = "%2F%2F%2A%5B%40attr_id%3D%27#{jumpTo}%27%5D"
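          # the percent-encoded form of //*[@attr_id='<jumpTo>']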
        else
          start = query.match(/start=(.+?)(?:&|$)/)[1]
        end
        bgbluri = Addressable::URI.parse("https://www.bgbl.de/xaver/bgbl/text.xav?skin=pdf&start=#{start}").normalize.to_s
        m.get(uri) # unlock session
        bgblpage = m.get(bgbluri)
        fakeimg = bgblpage.at_css('.xaver-PDF img')
        pdfurl = fakeimg.attributes['src']
        pdf = m.get(pdfurl)

        a = linkobj(title, pdf.uri.to_s)
        a[:source] = bgbluri
        row[:law] << a
      rescue
        row[:law] << link_object(link)
      end
    else
      row[:law] << link_object(link)
    end
  end

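  # Further documents sit in a link list grouped under <h3> headlines;
  # keep only the categories mapped in headline2key.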
  container = mp.at_css('.c-content-linklist__wrapper.row')
  container.css('h3').each do |headline|
    title = headline.text.strip
    key = headline2key[title]
    next if key.nil?

    row[key] = []

    links = headline.next_element.css('a')
    links.each do |link|
      row[key] << link_object(link)
    end
  end
end

# Phase 3: cross-reference each law on buzer.de and pick up a diff link where one exists
results.each do |row|
  row[:law].each do |law|
    buzer_uri = nil
    if law[:url].include?('bgbl.de')
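      # Gazette PDF names encode part, two-digit year and first page
      # (bgbl<part><yy>s<page>_<n>.pdf); buzer.de's search finds the law
      # via "20<yy>+<page>".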
      um = law[:url].match(/\/bgbl[12](\d+)s(\d+)_(?:\d+)\.pdf$/)
      next if um.nil?
      buzer_uri = "https://www.buzer.de/s1.htm?a=&g=20#{um[1]}+#{um[2]}"
    else
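      # A title-based search is prepared here, but the `next` right after it
      # bails out, so for now only bgbl.de-backed laws get a buzer link.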
      buzer_uri = Addressable::URI.parse("https://www.buzer.de/s1.htm?a=&g=#{row[:title]}").normalize.to_s
      next
    end
    next if buzer_uri.nil?
    law[:buzer] = buzer_uri

    page = m.get(buzer_uri)
    link = page.at_css('div.g a[href$="l.htm"]')
    next if link.nil?
    law[:buzer_diff] = link.attributes['href'].to_s.strip
  end
end

results.each do |row|
  key = row[:title].downcase.gsub(/[\s.\/_]/, ' ').squeeze(' ').strip.gsub(/[^\w -]/, '').tr(' ', '-')
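  # key is a URL-safe slug of the title and ties the four tables together;
  # spaces survive the cleanup gsub so the final tr can hyphenate them.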
  ScraperWiki.save_sqlite([:key], row.slice(:title, :overview).merge({key: key}))
  ScraperWiki.save_sqlite([:key], row[:law].map { |o| o.merge({ key: key }) }, 'law') unless row[:law].nil?
  ScraperWiki.save_sqlite([:key], row[:draft].map { |o| o.merge({ key: key }) }, 'draft') unless row[:draft].nil?
  ScraperWiki.save_sqlite([:key], row[:statement].map { |o| o.merge({ key: key }) }, 'statement') unless row[:statement].nil?
end
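
# For later inspection, something along the lines of the deleted template's
# select example above would read the results back; 'some-slug' here is a
# hypothetical key value:
#   ScraperWiki.select("* from law where key='some-slug'")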
