-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reworking this into a basic ETL tool.
Specific parsers are now subclasses of a basic parser. In doing this I was prompted to take a look again at ActiveWarehouse. It seems like that tool has come a long way so perhaps I'll suspend this home-grown path and look into just piggybacking on that tool.
- Loading branch information
Showing
8 changed files
with
18,392 additions
and
138 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
require 'rubygems' | ||
require 'nokogiri' | ||
require 'fastercsv' | ||
require 'open-uri' | ||
|
||
# Base class for a small extract/transform/load pipeline.
#
# A parser fetches a document from +url+ (keeping a copy in a cache
# directory), deserializes it into a list of rows, converts every row
# according to +conversion_map+, serializes the converted records and hands
# the result to +upload!+.
#
# Subclasses must implement +conversion_map+, +deserialize+, +serialize+ and
# +valid?+; the versions defined here raise NotImplementedError.
class Parser
  attr_reader :url

  # options - Hash read with the keys :verbose, :url and :cache.
  #           :cache is the cache directory; when it is missing, #cache
  #           falls back to a "cache" directory next to this file's directory.
  def initialize(options)
    @verbose = options[:verbose]
    @url     = options[:url]
    @cache   = options[:cache]
  end

  # Returns true or false, whatever truthy/falsy value :verbose was given as.
  def verbose?
    !!@verbose
  end

  # The raw document, fetched on first use and memoized afterwards.
  def document
    @document ||= fetch
  end

  # Runs the whole pipeline and passes the serialized output to #upload!.
  def process!
    upload!(to_output)
  end

  # Placeholder upload step: announces itself on STDERR and prints the data
  # to standard output.
  def upload!(data)
    STDERR.puts "Would be uploading..."
    puts data
  end

  # Serialized form of the converted input.
  def to_output
    serialize(convert(from_input))
  end

  # The fetched document after deserialization.
  def from_input
    deserialize(document)
  end

  # Fetches the document from +url+ and refreshes the cache with it. When
  # reading fails, warns on STDERR and returns the cached copy instead.
  #
  # Only StandardError triggers the fallback, so Interrupt (Ctrl-C),
  # SystemExit and similar are no longer mistaken for a failed download.
  # Errors raised while reading the cache itself are not rescued.
  def fetch
    save(read(url))
  rescue StandardError => e
    STDERR.puts "WARNING: using cached file due to error fetching [#{url}]: #{e}"
    cached
  end

  # Reads and returns the full contents of +url+.
  #
  # The block must not use +return+: that would leave the method before the
  # "Finished reading" message could be logged.
  #
  # NOTE(review): this relies on Kernel#open. On Ruby 3.0+ Kernel#open no
  # longer opens URLs even when open-uri is loaded (URI.open is the
  # replacement) -- confirm the Ruby version this is meant to run on.
  def read(url)
    STDERR.puts "Reading data from [#{url}]" if verbose?
    result = open(url) { |f| f.read }
    STDERR.puts "Finished reading [#{url}]" if verbose?
    result
  end

  # Writes +contents+ to the cache file and returns +contents+.
  #
  # Caching is best effort: a StandardError while writing is reported on
  # STDERR and otherwise ignored. Anything more severe (Interrupt,
  # SystemExit, ...) propagates to the caller instead of being swallowed.
  def save(contents)
    begin
      # Resolved inside the begin so a failure to build the path is reported
      # like any other caching problem; +path+ is nil in that case.
      path = cache_file
      STDERR.puts "Caching data in [#{path}]" if verbose?
      File.open(path, 'w') { |f| f.puts contents }
      STDERR.puts "Finished updating cache." if verbose?
    rescue StandardError => e
      STDERR.puts "WARNING: Unable to update cache file [#{path}] for contents of [#{url}]: #{e}"
    end
    contents
  end

  # Returns the contents of the cache file.
  def cached
    STDERR.puts "Reading data from cache file [#{cache_file}]" if verbose?
    result = File.read(cache_file)
    STDERR.puts "Finished reading cache." if verbose?
    result
  end

  # Cache directory: the :cache option, or "../cache" relative to this
  # file's directory when the option was not given.
  def cache
    @cache ||= File.expand_path(File.join(File.dirname(__FILE__), *%w[.. cache]))
  end

  # Base name of the path component of +url+.
  def filename
    File.basename(URI.parse(url).path)
  end

  # Full path of the cache file for +url+.
  def cache_file
    File.join(cache, filename)
  end

  # Parses +doc+ with Nokogiri and returns the parsed document.
  def from_xml(doc)
    STDERR.puts "Parsing XML..." if verbose?
    result = Nokogiri::XML.parse(doc)
    STDERR.puts "Finished parsing." if verbose?
    result
  end

  # Hash built from +conversion_map+: first element of each pair => last
  # element of that pair. Memoized.
  def mapping
    @mapping ||= conversion_map.inject({}) { |h, pair| h[pair.first] = pair.last; h }
  end

  # Flattened list of the first elements of +conversion_map+, in map order.
  # Memoized.
  def destinations
    @destinations ||= conversion_map.map { |rule| rule.first }.flatten
  end

  # Converts every row of +list+ and returns the array of converted records.
  def convert(list)
    STDERR.puts "Converting data..." if verbose?
    result = list.map { |row| convert_row(row) }
    STDERR.puts "Finished conversion." if verbose?
    result
  end

  # Builds one output record (a Hash keyed by the mapping's keys) from +row+.
  #
  # Each converter is applied exactly once per row; nesting this inside a
  # loop over +destinations+ would recompute the same values once per column.
  def convert_row(row)
    mapping.inject({}) do |record, (name, converter)|
      record[name] = change(converter, row)
      record
    end
  end

  # Applies a single conversion rule to +row+: anything responding to +call+
  # is invoked with the row, any other rule is used as a key into the row.
  def change(rule, row)
    return rule.call(row) if rule.respond_to?(:call)
    row[rule]
  end

  # Values of +record+ in +destinations+ order.
  def record_to_array(record)
    destinations.map { |name| record[name] }
  end

  # per-class method - mapping: ordered list of pairs -- first is destination name, last is conversion on source record
  # a simple string denotes a source key, proc denotes a method on source record
  def conversion_map
    raise NotImplementedError
  end

  # per-class method: turn the fetched document into a list of rows
  def deserialize(doc)
    raise NotImplementedError
  end

  # per-class method: turn the converted records into the output format
  def serialize(data)
    raise NotImplementedError
  end

  # per-class method: validity check for a parsed document
  def valid?(parsed)
    raise NotImplementedError
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
require 'rubygems' | ||
require 'nokogiri' | ||
|
||
require File.expand_path(File.join(File.dirname(__FILE__), *%w[spec_helper])) | ||
require 'parser_behavior' | ||
require 'parser' | ||
|
||
# Specs for the abstract Parser base class: every per-class hook must check
# its argument count and otherwise raise NotImplementedError until a subclass
# supplies an implementation.
#
# NotImplementedError descends from ScriptError rather than StandardError, so
# the expectations below name the class explicitly.
describe 'Parser' do
  before do
    @class  = Parser
    @parser = @class.new({})
  end

  behaves_like 'a parser'

  describe 'when looking up a conversion map from input fields to output fields' do
    it 'should not allow arguments' do
      with_argument = lambda { @parser.conversion_map(:foo) }
      with_argument.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      unimplemented = lambda { @parser.conversion_map }
      unimplemented.should.raise(NotImplementedError)
    end
  end

  describe 'when deserializing input data' do
    before do
      @document = 'Test Document'
    end

    it 'should require a document' do
      without_argument = lambda { @parser.deserialize }
      without_argument.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      unimplemented = lambda { @parser.deserialize(@document) }
      unimplemented.should.raise(NotImplementedError)
    end
  end

  describe 'when serializing output data' do
    before do
      @data = 'Test Data'
    end

    it 'should require a data set' do
      without_argument = lambda { @parser.serialize }
      without_argument.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      unimplemented = lambda { @parser.serialize(@data) }
      unimplemented.should.raise(NotImplementedError)
    end
  end

  describe 'when checking the validity of an input document' do
    before do
      @document = 'Test Document'
    end

    it 'should require a document' do
      without_argument = lambda { @parser.valid? }
      without_argument.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      unimplemented = lambda { @parser.valid?(@document) }
      unimplemented.should.raise(NotImplementedError)
    end
  end
end
Oops, something went wrong.