Navigation Menu

Skip to content

Commit

Permalink
Use chupa-text gem
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Feb 18, 2014
1 parent e69c31b commit d5fa6f1
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 55 deletions.
6 changes: 4 additions & 2 deletions Gemfile
Expand Up @@ -40,9 +40,11 @@ end
# Use debugger
# gem 'debugger', group: [:development, :test]

gem 'glib2'
gem 'nokogiri'
gem 'chuparuby'
gem 'chupa-text'
gem 'chupa-text-decomposer-pdf'
gem 'chupa-text-decomposer-libreoffice'
gem 'chupa-text-decomposer-html'

base_dir = File.join(File.dirname(__FILE__), "..")
gem 'rroonga'
Expand Down
74 changes: 21 additions & 53 deletions lib/ranguba/indexer.rb
Expand Up @@ -3,7 +3,7 @@
require 'tmpdir'
require 'fileutils'
require 'time'
require 'chupatext'
require 'chupa-text'

class Ranguba::Indexer
attr_accessor :wget, :log_file, :url_prefix, :level, :accept,
Expand Down Expand Up @@ -135,6 +135,9 @@ def initialize(argv)
end

def prepare(args)
ChupaText::Decomposers.load
@extractor = ChupaText::Extractor.new
@extractor.apply_configuration(ChupaText::Configuration.default)
if @log_file and @url_prefix
raise OptionParser::InvalidOption, "--url-prefix and --from-log options are exclusive"
end
Expand Down Expand Up @@ -376,24 +379,19 @@ def decompose_file_in_sub_process(url, path, response)
def decompose_file_in_same_process(url, path, response)
data = nil
begin
input_data = Chupa::Data.new(path)
feeder = Chupa::Feeder.new
feeder.signal_connect("accepted") do |_feeder, _data|
data = _data
input_data = ChupaText::InputData.new(path)
@extractor.extract(input_data) do |extracted_data|
data = extracted_data
end
feeder.feed(input_data)
rescue Chupa::Error => e
rescue ChupaText::EncryptedError
nil
rescue ChupaText::Error => e
log(:error, "[error] #{e.class}: #{e.message}")
log(:error, "[error] path: #{path}")
case e.code
when Chupa::DecomposerErrorCode::ENCRYPTED
return nil
else
raise
end
else
return nil if data.nil?
decomposed_file = DecomposedFile.new(@resolver, url, path, response, data)
decomposed_file = DecomposedFile.new(@resolver, url, path, response,
input_data, data)
decomposed_file.attributes
end
end
Expand Down Expand Up @@ -449,30 +447,25 @@ def type_for_mime(source)
end

class DecomposedFile
include Loggable

def initialize(resolver, url, path, response, data)
def initialize(resolver, url, path, response, input_data, data)
@resolver = resolver
@url = url
@path = path
@response = response
@metadata = data.metadata
@body = data.read || ""
if @body.encoding == Encoding::ASCII_8BIT
@body.force_encoding(@metadata.encoding || Encoding::UTF_8)
end
@input_data = input_data
@data = data
end

def attributes
{
key: @url,
title: @metadata.title,
body: @body,
title: @data.attributes.title,
body: @data.body,
basename: @url.split(/\//).last,
type: normalize_type(@metadata.original_mime_type),
encoding: @response["charset"] || @metadata.original_encoding || "",
type: normalize_type(@input_data.mime_type),
encoding: @response["charset"] || @input_data.attributes.encoding.to_s,
category: category_for_url(@url) || "",
author: @metadata.author || "",
author: @data.attributes.author || "",
modified_at: modification_time,
updated_at: @response["x-update-time"],
}
Expand All @@ -481,7 +474,7 @@ def attributes
private
def modification_time
modification_time = @response["last-modified"]
modification_time ||= @metadata.modification_time
modification_time ||= @data.attributes.modified_time
if modification_time
begin
modification_time = Time.parse(modification_time)
Expand All @@ -500,30 +493,5 @@ def category_for_url(url)
# Asks @resolver to normalize +source+ and falls back to the literal
# "unknown" whenever the resolver answers nil or false.
def normalize_type(source)
  normalized = @resolver.normalize_type(source)
  normalized || "unknown"
end

# Checks every value in +attributes+ with valid_utf8?.
#
# Returns true when no value fails the check. Otherwise logs a warning
# that names the record (attributes[:key]) and the offending attribute
# keys, then returns false.
#
# NOTE(review): relies on +blank?+, which is not core Ruby -- presumably
# provided by ActiveSupport; confirm it is loaded wherever this runs.
def valid_encoding?(attributes)
  url = attributes[:key]
  invalid_encoding_keys =
    attributes.reject { |_key, value| valid_utf8?(value) }.keys
  return true if invalid_encoding_keys.blank?

  message = "[#{invalid_encoding_keys.join(', ')}]"
  log(:warn, "[encoding][invalid] key: #{url} - #{message}")
  false
end

# Reports whether +value+'s bytes form valid UTF-8.
#
# Objects that do not respond to +encode+ are treated as valid. For the
# rest, a duplicate is re-tagged as UTF-8 and checked, so the caller's
# object keeps its original encoding.
def valid_utf8?(value)
  if value.respond_to?(:encode)
    copy = value.dup
    copy.force_encoding("UTF-8")
    copy.valid_encoding?
  else
    true
  end
end

# Tags +message+ with a "[decompose]" prefix and hands it, together with
# +level+, to the inherited +log+ implementation.
def log(level, message)
  tagged_message = "[decompose]#{message}"
  super(level, tagged_message)
end
end
end

0 comments on commit d5fa6f1

Please sign in to comment.