Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
a lot of work on proper encoding support, tested in 1.8 and 1.9
  • Loading branch information
seamusabshere committed May 6, 2011
1 parent b95f1b2 commit 60c59e8
Show file tree
Hide file tree
Showing 28 changed files with 4,542 additions and 43 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
list-en1-semic-3* -crlf -diff -merge
5 changes: 5 additions & 0 deletions README.rdoc
Expand Up @@ -136,6 +136,11 @@ More examples:
[ 'spacer', 1 ],
[ 'header1', 10, { :type => :string } ]]

==Helpful hints

* ASCII-8BIT is the same as BINARY
* ISO-8859-1 is the same as Latin1

==Custom parsers

See the test file and also data_miner examples of custom parsers.
Expand Down
5 changes: 5 additions & 0 deletions lib/remote_table.rb
@@ -1,3 +1,8 @@
# On Ruby 1.8 only: make sure $KCODE is UTF8 (announcing the change on stderr)
# before ActiveSupport is loaded below. Skipped when $KCODE is already UTF8.
kcode_needs_fixing = ::RUBY_VERSION < '1.9' && $KCODE != 'UTF8'
if kcode_needs_fixing
  $stderr.puts "[remote_table] Ruby 1.8 detected, setting $KCODE to UTF8 so that ActiveSupport::Multibyte works properly."
  $KCODE = 'UTF8'
end

require 'active_support'
require 'active_support/version'
%w{
Expand Down
18 changes: 9 additions & 9 deletions lib/remote_table/format.rb
Expand Up @@ -24,17 +24,17 @@ def initialize(t)
@t = t
end

def utf8(str)
def recode_as_utf8(raw_str)
if ::RUBY_VERSION >= '1.9'
str.ensure_encoding 'UTF-8', :external_encoding => t.properties.encoding, :invalid_characters => :transcode
$stderr.puts "[remote_table] Raw - #{raw_str}" if ::ENV['REMOTE_TABLE_DEBUG'] == 'true'
recoded_str = raw_str.ensure_encoding 'UTF-8', :external_encoding => t.properties.encoding, :invalid_characters => :transcode
$stderr.puts "[remote_table] Recoded - #{recoded_str}" if ::ENV['REMOTE_TABLE_DEBUG'] == 'true'
recoded_str
else
return str if t.properties.encoding[0] =~ /utf.?8/i
begin
::Iconv.conv('UTF-8//TRANSLIT', t.properties.encoding[0], str.to_s + ' ')[0..-2]
rescue ::Iconv::IllegalSequence
$stderr.puts "[remote_table] Unable to transliterate #{str} into UTF-8 given #{t.properties.encoding[0]}"
str
end
$stderr.puts "[remote_table] Raw - #{raw_str}" if ::ENV['REMOTE_TABLE_DEBUG'] == 'true'
recoded_str = ::Iconv.conv('UTF-8//TRANSLIT', t.properties.encoding[0], raw_str.to_s + ' ')[0..-2]
$stderr.puts "[remote_table] Recoded - #{recoded_str}" if ::ENV['REMOTE_TABLE_DEBUG'] == 'true'
recoded_str
end
end

Expand Down
13 changes: 8 additions & 5 deletions lib/remote_table/format/delimited.rb
Expand Up @@ -17,19 +17,21 @@ class Delimited < Format
include Textual
def each(&blk)
remove_useless_characters!
fix_newlines!
skip_rows!
CSV.foreach(t.local_file.path, fastercsv_options) do |row|
if row.is_a?(CSV::Row)
output = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (key, value)|
hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (key, value)|
if key.present?
value = '' if value.nil?
memo[key] = utf8 value
memo[key] = recode_as_utf8 value
end
memo
end
yield output if t.properties.keep_blank_rows or output.any? { |k, v| v.present? }
else
yield row if t.properties.keep_blank_rows or row.any? { |v| v.present? }
yield hash if t.properties.keep_blank_rows or hash.any? { |k, v| v.present? }
elsif row.is_a?(::Array)
array = row.map { |v| recode_as_utf8 v }
yield array if t.properties.keep_blank_rows or array.any? { |v| v.present? }
end
end
ensure
Expand All @@ -39,6 +41,7 @@ def each(&blk)
private

FASTERCSV_OPTIONS = %w{
encoding
unconverted_fields
col_sep
headers
Expand Down
15 changes: 10 additions & 5 deletions lib/remote_table/format/fixed_width.rb
@@ -1,17 +1,19 @@
require 'slither'
require 'fixed_width'

class RemoteTable
class Format
class FixedWidth < Format
include Textual
def each(&blk)
fix_newlines!
remove_useless_characters!
crop_rows!
skip_rows!
cut_columns!
parser.parse[:rows].each do |row|
row.reject! { |k, v| k.blank? }
row.each do |k, v|
row[k] = utf8 v
row[k] = recode_as_utf8 v.strip
end
yield row if t.properties.keep_blank_rows or row.any? { |k, v| v.present? }
end
Expand All @@ -22,16 +24,19 @@ def each(&blk)
private

def parser
@parser ||= ::Slither::Parser.new definition, t.local_file.path
if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
raise "[remote_table] You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
end
@parser ||= ::FixedWidth::Parser.new definition, t.local_file.io
end

def definition
@definition ||= if t.properties.schema_name.is_a?(::String) or t.properties.schema_name.is_a?(::Symbol)
::Slither.send :definition, t.properties.schema_name
::FixedWidth.send :definition, t.properties.schema_name
elsif t.properties.schema.is_a?(::Array)
everything = lambda { |_| true }
srand # in case this was forked by resque
::Slither.define(rand.to_s) do |d|
::FixedWidth.define(rand.to_s) do |d|
d.rows do |row|
row.trap(&everything)
t.properties.schema.each do |name, width, options|
Expand Down
3 changes: 2 additions & 1 deletion lib/remote_table/format/mixins/processed_by_nokogiri.rb
Expand Up @@ -4,6 +4,7 @@ class RemoteTable
class Format
module ProcessedByNokogiri
def each
raise "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless t.properties.row_css or t.properties.row_xpath
remove_useless_characters!
first_row = true
keys = t.properties.headers if t.properties.headers.is_a?(::Array)
Expand Down Expand Up @@ -57,7 +58,7 @@ def zip(keys, values)

# should we be doing this in ruby?
def unescaped_xml_without_soft_hyphens
str = ::CGI.unescapeHTML utf8(::IO.read(t.local_file.path))
str = ::CGI.unescapeHTML recode_as_utf8(::IO.read(t.local_file.path))
# get rid of MS Office baddies
str.gsub! '&shy;', ''
str
Expand Down
14 changes: 5 additions & 9 deletions lib/remote_table/format/mixins/processed_by_roo.rb
Expand Up @@ -6,7 +6,7 @@ def each(&blk)
spreadsheet = roo_class.new t.local_file.path, nil, :ignore
spreadsheet.default_sheet = t.properties.sheet.is_a?(::Numeric) ? spreadsheet.sheets[t.properties.sheet] : t.properties.sheet
if t.properties.output_class == ::Array
(first_data_row..spreadsheet.last_row).each do |y|
(first_row..spreadsheet.last_row).each do |y|
output = (1..spreadsheet.last_column).map do |x|
spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
end
Expand All @@ -16,15 +16,15 @@ def each(&blk)
keys = {}
if t.properties.use_first_row_as_header?
(1..spreadsheet.last_column).each do |x|
keys[x] = spreadsheet.cell(header_row, x)
keys[x] = spreadsheet.cell(header_row - 1, x) if keys[x].blank? # look up
keys[x] = spreadsheet.cell(first_row, x)
keys[x] = spreadsheet.cell(first_row - 1, x) if keys[x].blank? # look up
end
else
(1..spreadsheet.last_column).each do |x|
keys[x] = t.properties.headers[x - 1]
end
end
(first_data_row..spreadsheet.last_row).each do |y|
(first_row+1..spreadsheet.last_row).each do |y|
output = (1..spreadsheet.last_column).inject(::ActiveSupport::OrderedHash.new) do |memo, x|
if keys[x].present?
memo[keys[x]] = spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
Expand All @@ -40,13 +40,9 @@ def each(&blk)

private

def header_row
def first_row
1 + t.properties.skip
end

def first_data_row
1 + header_row
end
end
end
end
10 changes: 8 additions & 2 deletions lib/remote_table/format/mixins/textual.rb
Expand Up @@ -6,11 +6,17 @@ module Textual
USELESS_CHARACTERS = [
'\xef\xbb\xbf', # UTF-8 byte order mark
'\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
'\xad',
# '\xa0'
]
# Rewrites the local file in place (via the RemoteTable executor's +bang+),
# deleting every sequence listed in USELESS_CHARACTERS with a perl one-liner.
# When the first configured encoding looks like Windows-1252, a second pass
# also deletes the bare \xad byte.
#
# Returns nil when no second pass is needed, otherwise whatever the second
# +bang+ call returns.
def remove_useless_characters!
  strip_listed = "perl -pe 's/#{USELESS_CHARACTERS.join('//g; s/')}//g'"
  ::RemoteTable.executor.bang(t.local_file.path, strip_listed)
  return unless t.properties.encoding[0] =~ /windows.?1252/i
  # soft hyphen again, as I have seen it appear in windows 1252
  ::RemoteTable.executor.bang(t.local_file.path, %q{perl -pe 's/\xad//g'})
end

# Rewrites the local file in place (via the RemoteTable executor's +bang+),
# using a perl one-liner that replaces \r\n, \n and \r with \n.
def fix_newlines!
  normalize_cmd = %q{perl -pe 's/\r\n|\n|\r/\n/g'}
  ::RemoteTable.executor.bang(t.local_file.path, normalize_cmd)
end

def skip_rows!
Expand Down
13 changes: 13 additions & 0 deletions lib/remote_table/local_file.rb
Expand Up @@ -15,7 +15,20 @@ def path
@path
end

# Read-only File handle on +path+, opened on first use and memoized in @io.
# On Ruby 1.9+ the handle's external encoding is set to the first element of
# t.properties.encoding; on older Rubies the file is opened without one.
def io
  return @io if @io
  @io = if ::RUBY_VERSION < '1.9'
    ::File.open(path, 'r')
  else
    ::File.open(path, 'r', :external_encoding => t.properties.encoding[0])
  end
end

# Truthy only when @io has been set, is an ::IO, and has not been closed.
def io_open?
  defined?(@io) && @io.is_a?(::IO) && !@io.closed?
end

def delete
@io.close if io_open?
::FileUtils.rm_rf staging_dir_path
@path = nil
@staging_dir_path = nil
Expand Down
2 changes: 1 addition & 1 deletion lib/remote_table/properties.rb
Expand Up @@ -186,7 +186,7 @@ def schema
t.options['schema']
end

# The name of the fixed-width schema according to Slither
# The name of the fixed-width schema according to FixedWidth
def schema_name
t.options['schema_name']
end
Expand Down
6 changes: 2 additions & 4 deletions remote_table.gemspec
Expand Up @@ -21,7 +21,7 @@ Gem::Specification.new do |s|

s.add_dependency 'activesupport', '>=2.3.4'
s.add_dependency 'roo', '~>1.9'
s.add_dependency 'slither', '>=0.99.4'
s.add_dependency 'fixed_width'
s.add_dependency 'i18n' # activesupport?
s.add_dependency 'builder' # roo?
s.add_dependency 'zip' # roo
Expand All @@ -31,9 +31,7 @@ Gem::Specification.new do |s|
s.add_dependency 'escape', '>=0.0.4'
s.add_dependency 'posix-spawn'
s.add_dependency 'ensure-encoding'
unless RUBY_VERSION >= '1.9'
s.add_dependency 'fastercsv', '>=1.5.0'
end
s.add_dependency 'fastercsv', '>=1.5.0'

s.add_development_dependency 'errata', '>=0.2.0'
s.add_development_dependency 'test-unit'
Expand Down
1 change: 0 additions & 1 deletion test/helper.rb
Expand Up @@ -4,7 +4,6 @@
require 'test/unit'
require 'shoulda'
require 'ruby-debug'
require 'tempfile'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
Expand Down
Binary file not shown.

0 comments on commit 60c59e8

Please sign in to comment.