From 6663dbb8fa2a3beb6c7c12067de417664c94d885 Mon Sep 17 00:00:00 2001
From: Maximilian Richt
Date: Tue, 5 Dec 2017 09:50:57 +0100
Subject: [PATCH] add first morph-based version

---
 Gemfile      |   8 ++-
 Gemfile.lock |  48 ++++++++++----
 scraper.rb   | 176 +++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 191 insertions(+), 41 deletions(-)

diff --git a/Gemfile b/Gemfile
index 6ab45dc..7f899bc 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,7 +4,9 @@
 
 source "https://rubygems.org"
 
-ruby "2.0.0"
+ruby '2.4.2'
 
-gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
+gem 'mechanize', '~> 2.7.5'
+gem 'addressable', '~> 2.5.2'
+gem 'activesupport', '~> 5.1.4'
diff --git a/Gemfile.lock b/Gemfile.lock
index 30fb5f3..76c7da0 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -10,38 +10,60 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    activesupport (5.1.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (~> 0.7)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    concurrent-ruby (1.0.5)
+    domain_name (0.5.20170404)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    i18n (0.9.1)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.7.5)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
       net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.3.0)
+    minitest (5.10.3)
+    net-http-digest_auth (1.4.1)
     net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    nokogiri (1.8.1)
+      mini_portile2 (~> 2.3.0)
     ntlm-http (0.1.1)
+    public_suffix (3.0.1)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
+    thread_safe (0.3.6)
+    tzinfo (1.2.4)
+      thread_safe (~> 0.1)
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.4)
+    webrobots (0.1.2)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  mechanize
+  activesupport (~> 5.1.4)
+  addressable (~> 2.5.2)
+  mechanize (~> 2.7.5)
   scraperwiki!
+
+BUNDLED WITH
+   1.16.0
diff --git a/scraper.rb b/scraper.rb
index 5799e98..1443071 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,25 +1,151 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+require 'scraperwiki'
+require 'mechanize'
+require 'addressable/uri'
+require 'date'
+require 'json'
+require 'digest/sha1'
+require 'active_support/core_ext/hash/slice'
+
+LIST_URL = 'https://www.bmi.bund.de/SiteGlobals/Forms/suche/gesetzgebungsverfahren-formular.html'
+
+m = Mechanize.new
+mp = m.get(LIST_URL)
+
+BASE = mp.bases.first.href
+
+results = []
+
+loop do
+  result_items = mp.css('.c-search-teaser.Law')
+  result_items.each do |row|
+    link = row.css('.c-search-teaser__h a').first
+    title = link.text.strip
+    path = link.attributes['href']
+    url = Addressable::URI.join(BASE, path).normalize.to_s
+
+    x = {
+      title: title,
+      overview: url
+    }
+    results << x
+  end
+
+  link = mp.at_css('.navIndex .forward a')
+  break if link.nil?
+
+  path = link.attributes['href']
+  url = Addressable::URI.join(BASE, path).normalize.to_s
+
+  mp = m.get url
+end
+
+# Phase 2: scrape detail pages
+headline2key = {
+  'Referentenentwurf' => :draft,
+  'Verbandsstellungnahme' => :statement,
+  'Verbändestellungnahmen' => :statement,
+  'Stellungnahmen' => :statement
+}
+
+def link_object(link)
+  title = link.text.strip
+  path = link.attributes['href'].to_s.strip
+  linkobj(title, Addressable::URI.join(BASE, path).normalize.to_s)
+end
+
+def linkobj(title, uri)
+  uri = Addressable::URI.parse(uri).normalize
+  url = uri.to_s
+
+  filename = "#{Digest::SHA256.hexdigest(url)[0...8]}_#{uri.basename.to_s}"
+
+  {
+    title: title,
+    url: url,
+    filename: filename
+  }
+end
+
+results.each do |row|
+  mp = m.get row[:overview]
+
+  headline = mp.at_css('.c-content-stage__headline')
+  row[:title] = headline.text.strip
+  row[:law] = []
+
+  linked = mp.css('.c-more__link')
+  linked.each do |link|
+    path = link.attributes['href'].to_s.strip
+    if path.include?('bgbl.de')
+      begin
+        title = link.text.strip
+        uri = Addressable::URI.join(path).normalize
+        query = uri.normalized_query
+        if query.include?('&jumpTo=')
+          jumpTo = query.match(/jumpTo=(.+?)(?:&|$)/)[1]
+          start = "%2F%2F%2A%5B%40attr_id%3D%27#{jumpTo}%27%5D"
+        else
+          start = query.match(/start=(.+?)(?:&|$)/)[1]
+        end
+        bgbluri = Addressable::URI.parse("https://www.bgbl.de/xaver/bgbl/text.xav?skin=pdf&start=#{start}").normalize.to_s
+        m.get(uri) # unlock session
+        bgblpage = m.get(bgbluri)
+        fakeimg = bgblpage.at_css('.xaver-PDF img')
+        pdfurl = fakeimg.attributes['src']
+        pdf = m.get(pdfurl)
+
+        a = linkobj(title, pdf.uri.to_s)
+        a[:source] = bgbluri
+        row[:law] << a
+      rescue
+        row[:law] << link_object(link)
+      end
+    else
+      row[:law] << link_object(link)
+    end
+  end
+
+  container = mp.at_css('.c-content-linklist__wrapper.row')
+  container.css('h3').each do |headline|
+    title = headline.text.strip
+    key = headline2key[title]
+    next if key.nil?
+
+    row[key] = []
+
+    links = headline.next_element.css('a')
+    links.each do |link|
+      row[key] << link_object(link)
+    end
+  end
+end
+
+# link buzer
+results.each do |row|
+  row[:law].each do |law|
+    buzer_uri = nil
+    if law[:url].include?('bgbl.de')
+      um = law[:url].match(/\/bgbl[12](\d+)s(\d+)_(?:\d+).pdf$/)
+      next if um.nil?
+      buzer_uri = "https://www.buzer.de/s1.htm?a=&g=20#{um[1]}+#{um[2]}"
+    else
+      buzer_uri = Addressable::URI.parse("https://www.buzer.de/s1.htm?a=&g=#{row[:title]}").normalize.to_s
+      next
+    end
+    next if buzer_uri.nil?
+    law[:buzer] = buzer_uri
+
+    page = m.get(buzer_uri)
+    link = page.at_css('div.g a[href$="l.htm"]')
+    next if link.nil?
+    law[:buzer_diff] = link.attributes['href'].to_s.strip
+  end
+end
+
+results.each do |row|
+  key = row[:title].downcase.gsub(/[\s.\/_]/, ' ').squeeze(' ').strip.gsub(/[^\w-]/, '').tr(' ', '-')
+  ScraperWiki.save_sqlite([:key], row.slice(:title, :overview).merge({key: key}))
+  ScraperWiki.save_sqlite([:key], row[:law].map { |o| o.merge({ key: key }) }, 'law') unless row[:law].nil?
+  ScraperWiki.save_sqlite([:key], row[:draft].map { |o| o.merge({ key: key }) }, 'draft') unless row[:draft].nil?
+  ScraperWiki.save_sqlite([:key], row[:statement].map { |o| o.merge({ key: key }) }, 'statement') unless row[:statement].nil?
+end
\ No newline at end of file
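
The scraper above writes one row per legislative procedure to the default "data"
table and the linked documents to separate "law", "draft" and "statement" tables,
all joined on "key", inside data.sqlite. As a minimal sketch (not part of the
patch itself), the output could be inspected locally after a run roughly like
this, assuming the run completed and using the sqlite3 gem that Gemfile.lock
already pulls in through scraperwiki:

  # Rough sketch: assumes the scraper has already produced data.sqlite in the
  # current working directory and saved at least one row into the "law" table.
  require 'sqlite3'

  db = SQLite3::Database.new('data.sqlite', results_as_hash: true)

  # "data" holds one row per procedure; related documents sit in the
  # "law", "draft" and "statement" tables, joined on "key".
  db.execute('SELECT key, title, overview FROM data LIMIT 5').each do |row|
    puts "#{row['key']}: #{row['title']}"
    db.execute('SELECT title, url FROM law WHERE key = ?', [row['key']]).each do |law|
      puts "  #{law['title']} -> #{law['url']}"
    end
  end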