Skip to content

Commit

Permalink
Reworking this into a basic ETL tool.
Browse files Browse the repository at this point in the history
Specific parsers are now subclasses of a basic parser.  In doing this I was prompted
to take a look again at ActiveWarehouse.  It seems like that tool has come a long way
so perhaps I'll suspend this home-grown path and look into just piggybacking on that
tool.
  • Loading branch information
rick committed Oct 10, 2009
1 parent db1a151 commit e93e1a0
Show file tree
Hide file tree
Showing 8 changed files with 18,392 additions and 138 deletions.
4 changes: 4 additions & 0 deletions bin/process_yahoo_xml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

options = {}
OptionParser.new do |opts|
opts.on('-u URL', '--url URL', 'specify URL from which to fetch data') do |u|
options[:url] = u
end

opts.on('-v', '--verbose', 'verbosely display progress and errors') do |v|
options[:verbose] = true
end
Expand Down
17,634 changes: 17,634 additions & 0 deletions cache/catalog.xml

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions lib/parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
require 'rubygems'
require 'nokogiri'
require 'fastercsv'
require 'open-uri'

# Base class for a small extract/transform/load pipeline.
#
# A run (#process!) fetches a document from #url (falling back to a cached
# copy on failure), deserializes it into a list of source records, converts
# each record according to #conversion_map, serializes the converted records
# and passes the result to #upload!.
#
# Subclasses must implement #conversion_map, #deserialize, #serialize and
# #valid?; the versions defined here raise NotImplementedError.
class Parser
  attr_reader :url

  # options - Hash of settings:
  #   :verbose - truthy to write progress messages to STDERR
  #   :url     - location of the document to fetch
  #   :cache   - directory holding cached documents (optional, see #cache)
  def initialize(options)
    @verbose = options[:verbose]
    @url     = options[:url]
    @cache   = options[:cache]
  end

  # Returns true when progress messages should be written, false otherwise.
  def verbose?
    !!@verbose
  end

  # Returns the raw document, fetching it on first use and memoizing it.
  def document
    @document ||= fetch
  end

  # Runs the whole pipeline and hands the serialized output to #upload!.
  def process!
    upload!(to_output)
  end

  # Placeholder upload: announces itself on STDERR and prints +data+ on STDOUT.
  def upload!(data)
    STDERR.puts "Would be uploading..."
    puts data
  end

  # Returns the serialized form of the converted input records.
  def to_output
    serialize(convert(from_input))
  end

  # Returns the list of source records deserialized from #document.
  def from_input
    deserialize(document)
  end

  # Fetches the document from #url and refreshes the cache with it. When the
  # read fails, a warning is written and the cached copy is returned instead.
  #
  # Only StandardError triggers the fallback, so Interrupt, SystemExit and the
  # like still stop the program. Errors raised by #cached (for example a
  # missing cache file) propagate to the caller.
  def fetch
    save(read(url))
  rescue StandardError => e
    # The modifier rescue keeps a failing warning from preventing the fallback.
    STDERR.puts "WARNING: using cached file due to error fetching [#{url}]: #{e.to_s}" rescue nil
    cached
  end

  # Reads and returns the contents found at +url+ using Kernel#open.
  #
  # NOTE(review): Kernel#open also accepts "|command" strings, so +url+ is
  # assumed to come from a trusted source. On Ruby versions where open-uri no
  # longer extends Kernel#open this call needs to become URI.open -- confirm
  # against the Ruby version this tool targets.
  def read(url)
    STDERR.puts "Reading data from [#{url}]" if verbose?
    # No `return` inside the block: that would leave #read immediately and
    # skip the "Finished reading" message below.
    result = open(url) {|f| f.read }
    STDERR.puts "Finished reading [#{url}]" if verbose?
    result
  end

  # Writes +contents+ to #cache_file and returns +contents+.
  #
  # Caching is best effort: any StandardError (unwritable directory, a url
  # that cannot be turned into a file name, ...) is reported as a warning and
  # +contents+ is still returned. Other exceptions are not swallowed.
  # IO#puts appends a newline when +contents+ does not already end with one.
  def save(contents)
    begin
      # Resolved once up front so the warning below cannot raise a second time
      # when computing the path is what failed (path is then nil).
      path = cache_file
      STDERR.puts "Caching data in [#{path}]" if verbose?
      File.open(path, 'w') {|f| f.puts contents }
      STDERR.puts "Finished updating cache." if verbose?
    rescue StandardError => e
      STDERR.puts "WARNING: Unable to update cache file [#{path}] for contents of [#{url}]: #{e.to_s}"
    end
    contents
  end

  # Returns the contents of #cache_file; File.read errors propagate.
  def cached
    STDERR.puts "Reading data from cache file [#{cache_file}]" if verbose?
    result = File.read(cache_file)
    STDERR.puts "Finished reading cache." if verbose?
    result
  end

  # Returns the cache directory: the :cache option when given, otherwise the
  # "cache" directory next to this file's parent directory.
  def cache
    @cache ||= File.expand_path(File.join(File.dirname(__FILE__), *%w[.. cache]))
  end

  # Returns the last path component of #url, used as the cache file name.
  def filename
    File.basename(URI.parse(url).path)
  end

  # Returns the full path of the cache file for #url.
  def cache_file
    File.join(cache, filename)
  end

  # Parses +doc+ with Nokogiri and returns the parsed XML document.
  def from_xml(doc)
    STDERR.puts "Parsing XML..." if verbose?
    result = Nokogiri::XML::parse(doc)
    STDERR.puts "Finished parsing." if verbose?
    result
  end

  # Returns a memoized Hash of destination name => conversion rule built from
  # the pairs in #conversion_map.
  def mapping
    @mapping ||= conversion_map.inject({}) {|h, pair| h[pair.first] = pair.last; h }
  end

  # Returns the memoized, ordered list of destination names.
  def destinations
    @destinations ||= conversion_map.collect {|rule| rule.first }.flatten
  end

  # Converts every source record in +list+ and returns the converted records
  # in the same order.
  def convert(list)
    STDERR.puts "Converting data..." if verbose?
    result = list.inject([]) {|records, row| records << convert_row(row) }
    STDERR.puts "Finished conversion." if verbose?
    result
  end

  # Returns a Hash of destination name => converted value for one source row.
  # Each conversion rule is applied exactly once per row.
  def convert_row(row)
    result = {}
    mapping.each_pair {|name, converter| result[name] = change(converter, row) }
    result
  end

  # Applies one conversion rule to +row+: a callable rule is called with the
  # row, anything else is used as a key into the row.
  def change(rule, row)
    return rule.call(row) if rule.respond_to?(:call)
    row[rule]
  end

  # Returns the values of +record+ ordered like #destinations.
  def record_to_array(record)
    destinations.inject([]) {|l, name| l << record[name] }
  end

  # per-class method - mapping: ordered list of pairs -- first is destination name, last is conversion on source record
  # a simple string denotes a source key, proc denotes a method on source record
  #
  # NotImplementedError is a ScriptError, not a StandardError, so a bare
  # `rescue` will not catch it.
  def conversion_map
    raise NotImplementedError
  end

  # per-class method: turns the raw document into a list of source records
  def deserialize(doc)
    raise NotImplementedError
  end

  # per-class method: turns the converted records into the output format
  def serialize(data)
    raise NotImplementedError
  end

  # per-class method: tells whether a parsed document is acceptable input
  def valid?(parsed)
    raise NotImplementedError
  end
end
49 changes: 29 additions & 20 deletions lib/yahoo_xml_parser.rb
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
require 'nokogiri'
require 'parser'

class YahooXMLParser
def initialize(options)
@verbose = options[:verbose]
end

def verbose?
!!@verbose
class YahooXMLParser < Parser

# per-class method - mapping: ordered list of pairs -- first is destination name, last is conversion on source record
# a simple string denotes a source key, proc denotes a method on source record
def conversion_map
  # Built on first use and memoized; every entry is a pair of
  # [destination name, source key or callable applied to the source record].
  @conversion_map ||= begin
    # Discount is 'price' minus 'sale-price', formatted with two decimals.
    # NOTE(review): a missing field becomes 0.0 through nil.to_f, so a record
    # without 'sale-price' reports the full price as its discount -- confirm.
    discount = Proc.new do |source|
      list_price = source['price'].to_f
      sale_price = source['sale-price'].to_f
      "%0.2f" % (list_price - sale_price)
    end

    [
      [ 'Name', 'name' ],
      [ 'Discount', discount ]
    ]
  end
end

def process!
upload!(transform_xml)
# per-class method
def deserialize(doc)
  # Parses +doc+ as XML, validates it, and returns an Array with one Hash per
  # /Catalog/Item element, mapping each ItemField's TableFieldID attribute to
  # its Value attribute. Raises RuntimeError when valid? rejects the document.
  parsed = from_xml(doc)
  STDERR.puts "Validating document..." if verbose?
  # Quote the document that was actually passed in; calling #document here
  # would trigger a fetch and could report a different document.
  raise "document is not valid:\n [#{doc}]" unless valid?(parsed)
  STDERR.puts "Finished document validation." if verbose?
  parsed.xpath('/Catalog/Item').collect do |item|
    item.xpath('ItemField').inject({}) do |record, field|
      record[field['TableFieldID']] = field['Value']
      record
    end
  end
end

def transform_xml
doc = parsed_xml
raise "XML document is not valid:\n [#{doc}]" unless valid_xml?(doc)
# per-class method
# TODO: lift up to #to_csv in base class
def serialize(data)
  # Renders the converted records as CSV text: a header row made of the
  # destination names, then one row per record in destination order.
  FasterCSV.generate(:headers => destinations, :write_headers => true) do |out|
    data.each do |record|
      out << record_to_array(record)
    end
  end
end

def valid_xml?(parsed)

# per-class method
def valid?(parsed)
return false if parsed.xpath('/Catalog').empty?
return false unless parsed.xpath('/Catalog').first['StoreID']
return false unless parsed.xpath('/Catalog').first['StoreName']
Expand All @@ -39,13 +52,9 @@ def valid_xml?(parsed)

# require values for specific fields
['name', 'taxable', 'code', 'need-ship', 'condition'].each do |field|
return false if item_has_no_value_for?(items, field)
return false if items.xpath("ItemField[@TableFieldID='#{field}']").first['Value'] == ''
end

true
end

def item_has_no_value_for?(items, name)
items.xpath("ItemField[@TableFieldID='#{name}']").first['Value'] == ''
end
end
75 changes: 75 additions & 0 deletions spec/parser_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
require 'rubygems'
require 'nokogiri'

require File.expand_path(File.join(File.dirname(__FILE__), *%w[spec_helper]))
require 'parser_behavior'
require 'parser'

describe 'Parser' do
  before do
    @class = Parser
    @parser = Parser.new({})
  end

  behaves_like 'a parser'

  # The base class only declares the per-class hooks. Each group below checks
  # the hook's arity and that the base implementation refuses to do the work.
  # NotImplementedError is a ScriptError rather than a StandardError, so a
  # bare rescue does not catch it; the expectations name the class explicitly.

  describe 'when looking up a conversion map from input fields to output fields' do
    it 'should not allow arguments' do
      attempt = lambda { @parser.conversion_map(:foo) }
      attempt.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      attempt = lambda { @parser.conversion_map }
      attempt.should.raise(NotImplementedError)
    end
  end

  describe 'when deserializing input data' do
    before do
      @document = 'Test Document'
    end

    it 'should require a document' do
      attempt = lambda { @parser.deserialize }
      attempt.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      attempt = lambda { @parser.deserialize(@document) }
      attempt.should.raise(NotImplementedError)
    end
  end

  describe 'when serializing output data' do
    before do
      @data = 'Test Data'
    end

    it 'should require a data set' do
      attempt = lambda { @parser.serialize }
      attempt.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      attempt = lambda { @parser.serialize(@data) }
      attempt.should.raise(NotImplementedError)
    end
  end

  describe 'when checking the validity of an input document' do
    before do
      @document = 'Test Document'
    end

    it 'should require a document' do
      attempt = lambda { @parser.valid? }
      attempt.should.raise(ArgumentError)
    end

    it 'should require subclasses to implement this functionality' do
      attempt = lambda { @parser.valid?(@document) }
      attempt.should.raise(NotImplementedError)
    end
  end
end
Loading

0 comments on commit e93e1a0

Please sign in to comment.