Add body size limitation

ranguba · Mar 2, 2019 · 8b997e6 · 8b997e6
1 parent a1ff607
commit 8b997e6
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 32 deletions.
diff --git a/lib/chupa-text/command/chupa-text.rb b/lib/chupa-text/command/chupa-text.rb
@@ -1,4 +1,4 @@
-# Copyright (C) 2013-2017  Kouhei Sutou <kou@clear-code.com>
+# Copyright (C) 2013-2019  Kouhei Sutou <kou@clear-code.com>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -49,6 +49,7 @@ def initialize
         @mime_formatter_options = {}
         @need_screenshot = true
         @expected_screenshot_size = [200, 200]
+        @max_body_size = nil
       end
 
       def run(*arguments)
@@ -143,6 +144,11 @@ def create_option_parser
                   "(default: #{@expected_screenshot_size.join("x")})") do |size|
           @expected_screenshot_size = size
         end
+        parser.on("--max-body-size=BYTE", Integer,
+                  "The max byte of extracted body.",
+                  "(default: no limit)") do |size|
+          @max_body_size = size
+        end
 
         parser.separator("")
         parser.separator("Log related options:")
@@ -190,7 +196,7 @@ def load_decomposers
       end
 
       def create_extractor
-        extractor = Extractor.new
+        extractor = Extractor.new(max_body_size: @max_body_size)
         extractor.apply_configuration(@configuration)
         extractor
       end

diff --git a/lib/chupa-text/data.rb b/lib/chupa-text/data.rb
@@ -198,12 +198,20 @@ def need_screenshot?
       @need_screenshot
     end
 
-    def to_utf8_body_data
-      b = body
+    def to_utf8_body_data(max_body_size: nil)
+      b = nil
+      if max_body_size
+        open do |input|
+          b = input.read(max_body_size)
+        end
+      else
+        b = body
+      end
       return self if b.nil?
+
       converter = UTF8Converter.new(b)
       utf8_body = converter.convert
-      if b.equal?(utf8_body)
+      if max_body_size.nil? and b.equal?(utf8_body)
         self
       else
         TextData.new(utf8_body, source_data: self)

diff --git a/lib/chupa-text/extractor.rb b/lib/chupa-text/extractor.rb
@@ -21,8 +21,9 @@ module ChupaText
   class Extractor
     include Loggable
 
-    def initialize
+    def initialize(max_body_size: nil)
       @decomposers = []
+      @max_body_size = max_body_size
     end
 
     # Sets the extractor up by the configuration. It adds decomposers
@@ -90,11 +91,11 @@ def extract_recursive(target, &block)
       if decomposer.nil?
         if target.text_plain?
           debug {"#{log_tag}[extract][text-plain]"}
-          yield(target.to_utf8_body_data)
+          yield(target.to_utf8_body_data(max_body_size: @max_body_size))
         else
           debug {"#{log_tag}[extract][decomposer] not found"}
           if target.text?
-            yield(target.to_utf8_body_data)
+            yield(target.to_utf8_body_data(max_body_size: @max_body_size))
           end
         end
       else

diff --git a/lib/chupa-text/utf8-converter.rb b/lib/chupa-text/utf8-converter.rb
@@ -16,8 +16,9 @@
 
 module ChupaText
   class UTF8Converter
-    def initialize(string)
+    def initialize(string, max_size: nil)
       @string = string
+      @max_size = max_size
     end
 
     def convert
@@ -26,44 +27,51 @@ def convert
       when Encoding::UTF_8
         bom_size, bom_encoding = detect_bom
         if bom_size
-          return @string.byteslice(bom_size,
-                                   @string.bytesize - bom_size)
+          utf8_string = @string.byteslice(bom_size,
+                                          @string.bytesize - bom_size)
         else
-          return @string
+          utf8_string = @string
         end
+        return truncate(utf8_string)
       when Encoding::ASCII_8BIT
-        return @string if @string.ascii_only?
+        return truncate(@string) if @string.ascii_only?
       else
-        return @string.encode(Encoding::UTF_8,
-                              invalid: :replace,
-                              undef: :replace,
-                              replace: "")
+        utf8_string = @string.encode(Encoding::UTF_8,
+                                     invalid: :replace,
+                                     undef: :replace,
+                                     replace: "")
+        return truncate(utf8_string)
       end
 
       bom_size, bom_encoding = detect_bom
       if bom_encoding
         string_without_bom = @string.byteslice(bom_size,
                                                @string.bytesize - bom_size)
-        return string_without_bom.encode(Encoding::UTF_8,
-                                         bom_encoding,
-                                         invalid: :replace,
-                                         undef: :replace,
-                                         replace: "")
+        utf8_string = string_without_bom.encode(Encoding::UTF_8,
+                                                bom_encoding,
+                                                invalid: :replace,
+                                                undef: :replace,
+                                                replace: "")
+        return truncate(utf8_string)
       end
 
       guessed_encoding = guess_encoding
       if guessed_encoding
-        @string.encode(Encoding::UTF_8,
-                       guessed_encoding,
-                       invalid: :replace,
-                       undef: :replace,
-                       replace: "")
+        truncate(@string.encode(Encoding::UTF_8,
+                                guessed_encoding,
+                                invalid: :replace,
+                                undef: :replace,
+                                replace: ""))
       else
-        utf8_body = @string.dup
-        utf8_body.force_encoding(Encoding::UTF_8)
-        utf8_body.scrub!("")
-        utf8_body.gsub!(/\p{Control}+/, "")
-        utf8_body
+        if @max_size
+          utf8_string = @string.byteslice(0, @max_size)
+        else
+          utf8_string = @string.dup
+        end
+        utf8_string.force_encoding(Encoding::UTF_8)
+        utf8_string.scrub!("")
+        utf8_string.gsub!(/\p{Control}+/, "")
+        utf8_string
       end
     end
 
@@ -113,5 +121,15 @@ def guess_encoding
         @string.force_encoding(original_encoding)
       end
     end
+
+    def truncate(string)
+      if @max_size and string.bytesize > @max_size
+        truncated = string.byteslice(0, @max_size)
+        truncated.scrub!("")
+        truncated
+      else
+        string
+      end
+    end
   end
 end
diff --git a/test/test-extractor.rb b/test/test-extractor.rb
@@ -228,5 +228,15 @@ def test_euc_jp_ascii_8bit
         assert_equal(["こんにちは"], extract(data))
       end
     end
+
+    sub_test_case("max body size") do
+      def test_last_invalid
+        @extractor = ChupaText::Extractor.new(max_body_size: 5)
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "こん"
+        assert_equal(["こ"], extract(data))
+      end
+    end
   end
 end