Skip to content

Commit

Permalink
Add body size limitation
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Mar 2, 2019
1 parent a1ff607 commit 8b997e6
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 32 deletions.
10 changes: 8 additions & 2 deletions lib/chupa-text/command/chupa-text.rb
@@ -1,4 +1,4 @@
# Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
Expand Down Expand Up @@ -49,6 +49,7 @@ def initialize
@mime_formatter_options = {}
@need_screenshot = true
@expected_screenshot_size = [200, 200]
@max_body_size = nil
end

def run(*arguments)
Expand Down Expand Up @@ -143,6 +144,11 @@ def create_option_parser
"(default: #{@expected_screenshot_size.join("x")})") do |size|
@expected_screenshot_size = size
end
parser.on("--max-body-size=BYTE", Integer,
"The max byte of extracted body.",
"(default: no limit)") do |size|
@max_body_size = size
end

parser.separator("")
parser.separator("Log related options:")
Expand Down Expand Up @@ -190,7 +196,7 @@ def load_decomposers
end

def create_extractor
extractor = Extractor.new
extractor = Extractor.new(max_body_size: @max_body_size)
extractor.apply_configuration(@configuration)
extractor
end
Expand Down
14 changes: 11 additions & 3 deletions lib/chupa-text/data.rb
Expand Up @@ -198,12 +198,20 @@ def need_screenshot?
@need_screenshot
end

def to_utf8_body_data
b = body
def to_utf8_body_data(max_body_size: nil)
b = nil
if max_body_size
open do |input|
b = input.read(max_body_size)
end
else
b = body
end
return self if b.nil?

converter = UTF8Converter.new(b)
utf8_body = converter.convert
if b.equal?(utf8_body)
if max_body_size.nil? and b.equal?(utf8_body)
self
else
TextData.new(utf8_body, source_data: self)
Expand Down
7 changes: 4 additions & 3 deletions lib/chupa-text/extractor.rb
Expand Up @@ -21,8 +21,9 @@ module ChupaText
class Extractor
include Loggable

def initialize
def initialize(max_body_size: nil)
@decomposers = []
@max_body_size = max_body_size
end

# Sets the extractor up by the configuration. It adds decomposers
Expand Down Expand Up @@ -90,11 +91,11 @@ def extract_recursive(target, &block)
if decomposer.nil?
if target.text_plain?
debug {"#{log_tag}[extract][text-plain]"}
yield(target.to_utf8_body_data)
yield(target.to_utf8_body_data(max_body_size: @max_body_size))
else
debug {"#{log_tag}[extract][decomposer] not found"}
if target.text?
yield(target.to_utf8_body_data)
yield(target.to_utf8_body_data(max_body_size: @max_body_size))
end
end
else
Expand Down
66 changes: 42 additions & 24 deletions lib/chupa-text/utf8-converter.rb
Expand Up @@ -16,8 +16,9 @@

module ChupaText
class UTF8Converter
def initialize(string)
def initialize(string, max_size: nil)
@string = string
@max_size = max_size
end

def convert
Expand All @@ -26,44 +27,51 @@ def convert
when Encoding::UTF_8
bom_size, bom_encoding = detect_bom
if bom_size
return @string.byteslice(bom_size,
@string.bytesize - bom_size)
utf8_string = @string.byteslice(bom_size,
@string.bytesize - bom_size)
else
return @string
utf8_string = @string
end
return truncate(utf8_string)
when Encoding::ASCII_8BIT
return @string if @string.ascii_only?
return truncate(@string) if @string.ascii_only?
else
return @string.encode(Encoding::UTF_8,
invalid: :replace,
undef: :replace,
replace: "")
utf8_string = @string.encode(Encoding::UTF_8,
invalid: :replace,
undef: :replace,
replace: "")
return truncate(utf8_string)
end

bom_size, bom_encoding = detect_bom
if bom_encoding
string_without_bom = @string.byteslice(bom_size,
@string.bytesize - bom_size)
return string_without_bom.encode(Encoding::UTF_8,
bom_encoding,
invalid: :replace,
undef: :replace,
replace: "")
utf8_string = string_without_bom.encode(Encoding::UTF_8,
bom_encoding,
invalid: :replace,
undef: :replace,
replace: "")
return truncate(utf8_string)
end

guessed_encoding = guess_encoding
if guessed_encoding
@string.encode(Encoding::UTF_8,
guessed_encoding,
invalid: :replace,
undef: :replace,
replace: "")
truncate(@string.encode(Encoding::UTF_8,
guessed_encoding,
invalid: :replace,
undef: :replace,
replace: ""))
else
utf8_body = @string.dup
utf8_body.force_encoding(Encoding::UTF_8)
utf8_body.scrub!("")
utf8_body.gsub!(/\p{Control}+/, "")
utf8_body
if @max_size
utf8_string = @string.byteslice(0, @max_size)
else
utf8_string = @string.dup
end
utf8_string.force_encoding(Encoding::UTF_8)
utf8_string.scrub!("")
utf8_string.gsub!(/\p{Control}+/, "")
utf8_string
end
end

Expand Down Expand Up @@ -113,5 +121,15 @@ def guess_encoding
@string.force_encoding(original_encoding)
end
end

# Caps +string+ at @max_size bytes.
#
# Returns +string+ itself when no limit is set or when it already fits.
# Otherwise returns a new string holding the first @max_size bytes, with
# invalid byte sequences (for example a multi-byte character split by the
# cut) removed.
def truncate(string)
  limit = @max_size
  return string unless limit
  return string if string.bytesize <= limit

  string.byteslice(0, limit).scrub("")
end
end
end
10 changes: 10 additions & 0 deletions test/test-extractor.rb
Expand Up @@ -228,5 +228,15 @@ def test_euc_jp_ascii_8bit
assert_equal(["こんにちは"], extract(data))
end
end

sub_test_case("max body size") do
  # "こん" is two 3-byte UTF-8 characters, so a 5-byte limit cuts the
  # second character in half; only the first character is expected back.
  def test_last_invalid
    @extractor = ChupaText::Extractor.new(max_body_size: 5)
    text = ChupaText::Data.new
    text.mime_type = "text/plain"
    text.body = "こん"
    extracted = extract(text)
    assert_equal(["こ"], extracted)
  end
end
end
end

0 comments on commit 8b997e6

Please sign in to comment.