Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Download and import XLS, ODS, XML, CSV, etc. into your ActiveRecord models.

This branch is 209 commits behind seamusabshere:master

Fetching latest commit…

Cannot retrieve the latest commit at this time

Failed to load latest commit information.
lib
test
.document
.gitignore
CHANGELOG
Gemfile
Gemfile.lock
LICENSE
README.rdoc
Rakefile
VERSION
data_miner.gemspec

README.rdoc

data_miner

Programmatically import useful data into your ActiveRecord models.

(see wiki.github.com/seamusabshere/data_miner for more examples)

Quick start

You define data_miner blocks in your ActiveRecord models. For example, in app/models/country.rb:

class Country < ActiveRecord::Base
  set_primary_key :iso_3166_code

  data_miner do
    schema do
      string   'iso_3166_code'
      string   'name'
    end

    import 'the official ISO country list',
           :url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
           :skip => 2,
           :headers => false,
           :delimiter => ';',
           :encoding => 'ISO-8859-1' do
      key   'iso_3166_code', :field_number => 1
      store 'name', :field_number => 0
    end
  end
end

Now you can run:

irb(main):001:0> Country.run_data_miner!
=> nil

Advanced usage

This is how we linked together (data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportations list of aircraft:

class Aircraft < ActiveRecord::Base
  # Tell ActiveRecord that we want to use a string primary key.
  # This makes it easier to repeatedly truncate and re-import this
  # table without breaking associations.
  set_primary_key :icao_code

  # A dictionary between BTS aircraft type codes and ICAO aircraft
  # codes that uses string similarity instead of exact matching.
  # This is preferable to typing everything out.
  def self.bts_name_dictionary
    # Sorry for documenting the LooseTightDictionary gem here, but it's useful
    @_bts_dictionary ||= LooseTightDictionary.new(
                                                  # The first argument is the source... the possible matches. Most Enumerables will do.
                                                  RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
                                                  # Tightenings optionally pull out what is important on both sides of a potential match
                                                  :tightenings  => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
                                                  # Identities optionally require a particular capture from both sides of a match to be equal
                                                  :identities   => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
                                                  # Blockings restrict comparisons to a subset where everything matches the blocking
                                                  :blockings    => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
                                                  # This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
                                                  # This is useful because we say /boeing/ and only boeings are matched against other boeings.
                                                  :blocking_only => true,
                                                  # Tell the dictionary how read things from the source.
                                                  :right_reader => lambda { |record| record['Description'] }
                                                 )
  end

  # A dictionary between what appear to be ICAO aircraft names and
  # objects of this class itself.
  # Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
  def self.icao_name_dictionary
    @_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
                                                   :tightenings  => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
                                                   :identities   => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
                                                   :blockings    => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
                                                   :right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
  end

  # This responds to the "Matcher" interface as defined by DataMiner.
  # In other words, it takes Matcher#match(*args) and returns something.
  class BtsMatcher
    attr_reader :wants
    def initialize(wants)
      @wants = wants
    end
    def match(raw_faa_icao_record)
      @_match ||= Hash.new
      return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
      faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
      bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
      retval = case wants
      when :bts_aircraft_type_code
        bts_record['Code']
      when :bts_name
        bts_record['Description']
      end if bts_record
      @_match[raw_faa_icao_record] = retval
    end
  end

  # Another class that implements the "Matcher" interface as expected by DataMiner.
  class FuelUseMatcher
    def match(raw_fuel_use_record)
      @_match ||= Hash.new
      return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
      # First try assuming we have an ICAO code
      aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
        Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
      end
      # No luck? then try a fuzzy match
      aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
        Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
      end
      if aircraft_record
        @_match[raw_fuel_use_record] = aircraft_record.icao_code
      else
        # While we're developing the dictionary, we want it to blow up until we have 100% matchability
        raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
      end
    end
  end

  # This responds to the "Responder" interface as expected by Errata.
  # Basically it lets you say "Is a DC plane" in the errata file and
  # have it map to a Ruby method.
  class Guru
    def is_a_dc_plane?(row)
      row['Designator'] =~ /^DC\d/i
    end
    def is_a_g159?(row)
      row['Designator'] =~ /^G159$/
    end
    def is_a_galx?(row)
      row['Designator'] =~ /^GALX$/
    end
    def method_missing(method_id, *args, &block)
      if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
        manufacturer_name = $1
        manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
        matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
        method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
      else
        super
      end
    end
  end

  data_miner do
    # In our app, we defined DataMiner::Run.allowed? to return false if a run
    # has taken place in the last hour (among other things).
    # By raising DataMiner::Skip, we skip this run but call it a success.
    process "Don't re-import too often" do
      raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
    end

    # Define the database schema in-line.
    # It will destructively and automatically add/remove columns.
    # This is "OK" because you can always just re-run the import script to get the data back.
    # PS. if we were using DataMapper, we wouldn't need this.
    schema :options => 'ENGINE=InnoDB default charset=utf8' do
      string   'icao_code'
      string   'manufacturer_name'
      string   'name'
      string   'bts_name'
      string   'bts_aircraft_type_code'
      string   'brighter_planet_aircraft_class_code'
      string   'fuel_use_aircraft_name'
      float    'm3'
      string   'm3_units'
      float    'm2'
      string   'm2_units'
      float    'm1'
      string   'm1_units'
      float    'endpoint_fuel'
      string   'endpoint_fuel_units'
      float    'seats'
      float    'distance'
      string   'distance_units'
      float    'load_factor'
      float    'freight_share'
      float    'payload'
      float    'weighting'
      index    'bts_aircraft_type_code'
    end

    # The FAA publishes a document to help people identify aircraft by different names.
    ('A'..'Z').each do |letter|
      import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
              # The master URL of the source file (one for every letter)
              :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
              # The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
              :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
              # If it's not UTF-8, you should say what it is so that we can iconv it!
              :encoding => 'windows-1252',
              # Nokogiri is being used to grab each row starting from the second
              :row_xpath => '//table/tr[2]/td/table/tr',
              # ditto... XPath for Nokogiri
              :column_xpath => 'td' ) do
        # The code that they use is in fact the ICAO code!
        key 'icao_code', :field_name => 'Designator'
        # We get this for free
        store 'manufacturer_name', :field_name => 'Manufacturer'
        # ditto
        store 'name', :field_name => 'Model'
        # Use the loose-tight dictionary.
        # It gets the entire input row to play with before deciding on an output.
        store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
        store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
      end
    end

    # Pull in some data that might only be important to Brighter Planet
    import "Brighter Planet's aircraft class codes",
           :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
      key   'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
      store 'brighter_planet_aircraft_class_code'
    end

    # Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
    # This data comes from the EEA.
    import "pre-calculated fuel use equation coefficients",
           :url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
           :select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
      # We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
      key   'icao_code', :matcher => Aircraft::FuelUseMatcher.new
      # Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
      store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
      store 'm3'
      store 'm2'
      store 'm1'
      store 'endpoint_fuel', :field_name => 'b'
    end

    # Use arel and the weighted_average gem to do some crazy averaging.
    # This assumes that you're dealing with the BTS T-100 flight segment data.
    # See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
    process "Derive some average flight characteristics from flight segments" do
      FlightSegment.run_data_miner!
      aircraft = Aircraft.arel_table
      segments = FlightSegment.arel_table

      conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
      update_all "seats         = (#{FlightSegment.weighted_average_relation(:seats,         :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "distance      = (#{FlightSegment.weighted_average_relation(:distance,      :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "load_factor   = (#{FlightSegment.weighted_average_relation(:load_factor,   :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "payload       = (#{FlightSegment.weighted_average_relation(:payload,       :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"

      update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
    end

    # And finally re-run the import of resources that depend on this resource.
    # Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
    # that's the right way to do dependencies. It won't get called twice in the same run.
    [ AircraftManufacturer ].each do |synthetic_resource|
      process "Synthesize #{synthetic_resource}" do
        synthetic_resource.run_data_miner!
      end
    end
  end
end

Authors

  • Seamus Abshere <seamus@abshere.net>

  • Andy Rossmeissl <andy@rossmeissl.net>

Copyright

Copyright © 2010 Brighter Planet. See LICENSE for details.

Something went wrong with that request. Please try again.