Skip to content

Commit

Permalink
Add open-document-text decomposer
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Feb 26, 2019
1 parent a44d23b commit a3b273e
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 147 deletions.
54 changes: 12 additions & 42 deletions lib/chupa-text/decomposers/open-document-presentation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,58 +27,28 @@ def initialize(options={})
@mime_type = "application/vnd.oasis.opendocument.presentation"
end

def target?(data)
data.extension == @extension or
data.mime_type == @mime_type
private
def process_content(entry, context, &block)
context[:slides] = []
listener = SlidesListener.new(context[:slides])
parse(entry.file_data, listener)
end

def target_score(data)
if target?(data)
-1
else
nil
def finish_decompose(context, &block)
metadata = TextData.new("", source_data: context[:data])
context[:attributes].each do |name, value|
metadata[name] = value
end
end
yield(metadata)

def decompose(data)
slides = []
data.open do |input|
Archive::Zip.open(input) do |zip|
zip.each do |entry|
next unless entry.file?
case entry.zip_path
when "content.xml"
listener = SlidesListener.new(slides)
parse(entry.file_data, listener)
when "meta.xml"
attributes = {}
listener = AttributesListener.new(attributes)
parse(entry.file_data, listener)
metadata = TextData.new("", source_data: data)
attributes.each do |name, value|
metadata[name] = value
end
yield(metadata)
end
end
end
end
slides.each_with_index do |slide, i|
(context[:slides] || []).each_with_index do |slide, i|
text = slide[:text]
text_data = TextData.new(text, source_data: data)
text_data = TextData.new(text, source_data: context[:data])
text_data["index"] = i
yield(text_data)
end
end

private
def parse(io, listener)
source = REXML::Source.new(io.read)
parser = REXML::Parsers::SAX2Parser.new(source)
parser.listen(listener)
parser.parse
end

class SlidesListener
include REXML::SAX2Listener

Expand Down
54 changes: 12 additions & 42 deletions lib/chupa-text/decomposers/open-document-spreadsheet.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,60 +27,30 @@ def initialize(options={})
@mime_type = "application/vnd.oasis.opendocument.spreadsheet"
end

def target?(data)
data.extension == @extension or
data.mime_type == @mime_type
private
def process_content(entry, context, &block)
context[:sheets] = []
listener = SheetsListener.new(context[:sheets])
parse(entry.file_data, listener)
end

def target_score(data)
if target?(data)
-1
else
nil
def finish_decompose(context, &block)
metadata = TextData.new("", source_data: context[:data])
context[:attributes].each do |name, value|
metadata[name] = value
end
end
yield(metadata)

def decompose(data)
sheets = []
data.open do |input|
Archive::Zip.open(input) do |zip|
zip.each do |entry|
next unless entry.file?
case entry.zip_path
when "content.xml"
listener = SheetsListener.new(sheets)
parse(entry.file_data, listener)
when "meta.xml"
attributes = {}
listener = AttributesListener.new(attributes)
parse(entry.file_data, listener)
metadata = TextData.new("", source_data: data)
attributes.each do |name, value|
metadata[name] = value
end
yield(metadata)
end
end
end
end
sheets.each_with_index do |sheet, i|
(context[:sheets] || []).each_with_index do |sheet, i|
text = sheet[:text]
text_data = TextData.new(text, source_data: data)
text_data = TextData.new(text, source_data: context[:data])
text_data["index"] = i
name = sheet[:name]
text_data["name"] = name if name
yield(text_data)
end
end

private
def parse(io, listener)
source = REXML::Source.new(io.read)
parser = REXML::Parsers::SAX2Parser.new(source)
parser.listen(listener)
parser.parse
end

class SheetsListener
include REXML::SAX2Listener

Expand Down
89 changes: 89 additions & 0 deletions lib/chupa-text/decomposers/open-document-text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

require "chupa-text/decomposers/open-document"

module ChupaText
module Decomposers
class OpenDocumentText < OpenDocument
registry.register("open-document-text", self)

# Configures this decomposer for OpenDocument Text files by recording
# the "odt" extension and the matching MIME type in instance variables.
#
# @param options [Hash] handed unchanged to the superclass initializer.
def initialize(options={})
  super(options)
  @extension = "odt"
  @mime_type = "application/vnd.oasis.opendocument.text"
end

private
# Parses one archive entry as XML and gathers its paragraph text.
#
# A fresh string is stored under context[:text]; the very same string
# object is given to the TextListener, so whatever the listener appends
# is visible through the context afterwards.
#
# @param entry archive entry whose #file_data is passed to #parse.
# @param context [Hash] per-decompose state; :text is (re)assigned here.
# @param block unused by this method.
def process_content(entry, context, &block)
  buffer = ""
  context[:text] = buffer
  parse(entry.file_data, TextListener.new(buffer))
end

# Builds a single TextData from the collected state and yields it.
#
# The body is context[:text], or an empty string when no text was
# stored. Every pair in context[:attributes] is copied onto the
# TextData before it is yielded.
#
# @param context [Hash] reads :text, :data and :attributes.
# @yieldparam text_data [TextData] the assembled text with attributes.
def finish_decompose(context, &block)
  body = context[:text] || ""
  text_data = TextData.new(body, source_data: context[:data])
  context[:attributes].each { |name, value| text_data[name] = value }
  yield(text_data)
end

# SAX2 listener that appends the character data found inside <text:p>
# elements (OpenDocument text namespace) to a caller-supplied string,
# adding a newline each time a <text:p> element closes.
class TextListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"

  # @param output [String] mutable buffer; text is appended with <<.
  def initialize(output)
    @output = output
    # Number of currently open <text:p> elements. A counter is used
    # instead of a boolean so that closing an inline child (e.g.
    # <text:span>) or a nested paragraph does not stop collection of
    # the remaining text of the enclosing paragraph.
    @p_depth = 0
  end

  # Tracks entry into a <text:p> element; all other elements are ignored.
  def start_element(uri, local_name, qname, attributes)
    return unless uri == TEXT_URI
    case local_name
    when "p"
      @p_depth += 1
    end
  end

  # Tracks exit from a <text:p> element and terminates the paragraph
  # with a newline. Closing any other element leaves the state alone.
  def end_element(uri, local_name, qname)
    return unless uri == TEXT_URI
    case local_name
    when "p"
      # Never go negative on an unmatched end event.
      @p_depth -= 1 if @p_depth > 0
      @output << "\n"
    end
  end

  # Character data event: collected only while inside a paragraph.
  def characters(text)
    add_text(text)
  end

  # CDATA event: treated the same way as ordinary character data.
  def cdata(content)
    add_text(content)
  end

  private
  # Appends text (HTML-unescaped) when at least one <text:p> is open.
  def add_text(text)
    return unless @p_depth > 0
    @output << CGI.unescapeHTML(text)
  end
end
end
end
end
72 changes: 10 additions & 62 deletions lib/chupa-text/decomposers/open-document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,9 @@
module ChupaText
module Decomposers
class OpenDocument < Decomposer
registry.register("open-document", self)

EXTENSIONS = [
"odt",
]
MIME_TYPES = [
"application/vnd.oasis.opendocument.text",
]
def target?(data)
EXTENSIONS.include?(data.extension) or
MIME_TYPES.include?(data.mime_type)
data.extension == @extension or
data.mime_type == @mime_type
end

def target_score(data)
Expand All @@ -44,9 +36,9 @@ def target_score(data)
end
end

def decompose(data)
def decompose(data, &block)
context = {
text: "",
data: data,
attributes: {},
}
data.open do |input|
Expand All @@ -55,21 +47,14 @@ def decompose(data)
next unless entry.file?
case entry.zip_path
when "content.xml"
listener = TextListener.new(context[:text])
parse(entry.file_data, listener)
process_content(entry, context, &block)
when "meta.xml"
listener = AttributesListener.new(context[:attributes])
parse(entry.file_data, listener)
process_meta(entry, context, &block)
end
end
end
end
text = context[:text]
text_data = TextData.new(text, source_data: data)
context[:attributes].each do |name, value|
text_data[name] = value
end
yield(text_data)
finish_decompose(context, &block)
end

private
Expand All @@ -80,46 +65,9 @@ def parse(io, listener)
parser.parse
end

class TextListener
include REXML::SAX2Listener

TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
def initialize(output)
@output = output
@in_p = false
end

def start_element(uri, local_name, qname, attributes)
return unless uri == TEXT_URI
case local_name
when "p"
@in_p = true
end
end

def end_element(uri, local_name, qname)
@in_p = false

return unless uri == TEXT_URI
case local_name
when "p"
@output << "\n"
end
end

def characters(text)
add_text(text)
end

def cdata(content)
add_text(content)
end

private
def add_text(text)
return unless @in_p
@output << CGI.unescapeHTML(text)
end
def process_meta(entry, context, &block)
listener = AttributesListener.new(context[:attributes])
parse(entry.file_data, listener)
end

class AttributesListener
Expand Down
2 changes: 1 addition & 1 deletion test/decomposers/test-open-document-text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentText < Test::Unit::TestCase
include Helper

def setup
@decomposer = ChupaText::Decomposers::OpenDocument.new({})
@decomposer = ChupaText::Decomposers::OpenDocumentText.new({})
end

def decompose(path)
Expand Down

0 comments on commit a3b273e

Please sign in to comment.