From 4167f0ce67e42b082605bca75c7bdfd01eb23804 Mon Sep 17 00:00:00 2001
From: John Lees-Miller <jdleesmiller@gmail.com>
Date: Thu, 12 Sep 2019 22:01:38 +0100
Subject: [PATCH] Validate entry sizes when extracting

---
 README.md                 | 67 +++++++++++++++++++++++++++++++--------
 lib/zip.rb                |  4 ++-
 lib/zip/entry.rb          |  7 ++++
 lib/zip/errors.rb         |  1 +
 test/file_extract_test.rb | 62 ++++++++++++++++++++++++++++++++++++
 5 files changed, 127 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 8255cd90..2ff41ed9 100644
--- a/README.md
+++ b/README.md
@@ -152,12 +152,15 @@ When modifying a zip archive the file permissions of the archive are preserved.
 ### Reading a Zip file
 
 ```ruby
+MAX_SIZE = 1024**2 # 1MiB (but of course you can increase this)
 Zip::File.open('foo.zip') do |zip_file|
   # Handle entries one by one
   zip_file.each do |entry|
-    # Extract to file/directory/symlink
     puts "Extracting #{entry.name}"
-    entry.extract(dest_file)
+    raise 'File too large when extracted' if entry.size > MAX_SIZE
+
+    # Extract to file or directory based on name in the archive
+    entry.extract
 
     # Read into memory
     content = entry.get_input_stream.read
@@ -165,6 +168,7 @@ Zip::File.open('foo.zip') do |zip_file|
 
   # Find specific entry
   entry = zip_file.glob('*.csv').first
+  raise 'File too large when extracted' if entry.size > MAX_SIZE
   puts entry.get_input_stream.read
 end
 ```
@@ -219,6 +223,8 @@ File.open(new_path, "wb") {|f| f.write(buffer.string) }
 
 ## Configuration
 
+### Existing Files
+
 By default, rubyzip will not overwrite files if they already exist inside of the extracted path. To change this behavior, you may specify a configuration option like so:
 
 ```ruby
@@ -233,18 +239,57 @@ Additionally, if you want to configure rubyzip to overwrite existing files while
 Zip.continue_on_exists_proc = true
 ```
 
+### Non-ASCII Names
+
 If you want to store non-english names and want to open them on Windows(pre 7) you need to set this option:
 
 ```ruby
 Zip.unicode_names = true
 ```
 
+Sometimes file names inside zip contain non-ASCII characters. If you can assume which encoding was used for such names and want to be able to find such entries using `find_entry` then you can force assumed encoding like so:
+
+```ruby
+Zip.force_entry_names_encoding = 'UTF-8'
+```
+
+Allowed encoding names are the same as accepted by `String#force_encoding`
+
+### Date Validation
+
 Some zip files might have an invalid date format, which will raise a warning. You can hide this warning with the following setting:
 
 ```ruby
 Zip.warn_invalid_date = false
 ```
 
+### Size Validation
+
+By default, `rubyzip`'s `extract` method checks that an entry's reported uncompressed size is not (significantly) smaller than its actual size. This is to help you protect your application against [zip bombs](https://en.wikipedia.org/wiki/Zip_bomb). Before `extract`ing an entry, you should check that its size is in the range you expect. For example, if your application supports processing up to 100 files at once, each up to 10MiB, your zip extraction code might look like:
+
+```ruby
+MAX_FILE_SIZE = 10 * 1024**2 # 10MiB
+MAX_FILES = 100
+Zip::File.open('foo.zip') do |zip_file|
+  num_files = 0
+  zip_file.each do |entry|
+    num_files += 1 if entry.file?
+    raise 'Too many extracted files' if num_files > MAX_FILES
+    raise 'File too large when extracted' if entry.size > MAX_FILE_SIZE
+    entry.extract
+  end
+end
+```
+
+If you need to extract zip files that report incorrect uncompressed sizes and you really trust them not too be too large, you can disable this setting with
+```ruby
+Zip.validate_entry_sizes = false
+```
+
+Note that if you use the lower level `Zip::InputStream` interface, `rubyzip` does *not* check the entry `size`s. In this case, the caller is responsible for making sure it does not read more data than expected from the input stream.
+
+### Default Compression
+
 You can set the default compression level like so:
 
 ```ruby
@@ -253,13 +298,17 @@ Zip.default_compression = Zlib::DEFAULT_COMPRESSION
 
 It defaults to `Zlib::DEFAULT_COMPRESSION`. Possible values are `Zlib::BEST_COMPRESSION`, `Zlib::DEFAULT_COMPRESSION` and `Zlib::NO_COMPRESSION`
 
-Sometimes file names inside zip contain non-ASCII characters. If you can assume which encoding was used for such names and want to be able to find such entries using `find_entry` then you can force assumed encoding like so:
+### Zip64 Support
+
+By default, Zip64 support is disabled for writing. To enable it do this:
 
 ```ruby
-Zip.force_entry_names_encoding = 'UTF-8'
+Zip.write_zip64_support = true
 ```
 
-Allowed encoding names are the same as accepted by `String#force_encoding`
+_NOTE_: If you will enable Zip64 writing then you will need zip extractor with Zip64 support to extract archive.
+
+### Block Form
 
 You can set multiple settings at the same time by using a block:
 
@@ -272,14 +321,6 @@ You can set multiple settings at the same time by using a block:
   end
 ```
 
-By default, Zip64 support is disabled for writing. To enable it do this:
-
-```ruby
-Zip.write_zip64_support = true
-```
-
-_NOTE_: If you will enable Zip64 writing then you will need zip extractor with Zip64 support to extract archive.
-
 ## Developing
 
 To run the test you need to do this:
diff --git a/lib/zip.rb b/lib/zip.rb
index 9145207b..c3a6ed5e 100644
--- a/lib/zip.rb
+++ b/lib/zip.rb
@@ -42,7 +42,8 @@ module Zip
                 :write_zip64_support,
                 :warn_invalid_date,
                 :case_insensitive_match,
-                :force_entry_names_encoding
+                :force_entry_names_encoding,
+                :validate_entry_sizes
 
   def reset!
     @_ran_once = false
@@ -54,6 +55,7 @@ def reset!
     @write_zip64_support = false
     @warn_invalid_date = true
     @case_insensitive_match = false
+    @validate_entry_sizes = true
   end
 
   def setup
diff --git a/lib/zip/entry.rb b/lib/zip/entry.rb
index 80160b57..bd3e4f34 100644
--- a/lib/zip/entry.rb
+++ b/lib/zip/entry.rb
@@ -603,9 +603,16 @@ def create_file(dest_path, _continue_on_exists_proc = proc { Zip.continue_on_exi
         get_input_stream do |is|
           set_extra_attributes_on_path(dest_path)
 
+          bytes_written = 0
           buf = ''.dup
           while (buf = is.sysread(::Zip::Decompressor::CHUNK_SIZE, buf))
             os << buf
+
+            next unless ::Zip.validate_entry_sizes
+            bytes_written += buf.bytesize
+            if bytes_written > size
+              raise ::Zip::EntrySizeError, "Entry #{name} should be #{size}B but is larger when inflated"
+            end
           end
         end
       end
diff --git a/lib/zip/errors.rb b/lib/zip/errors.rb
index b2bcccd2..364c6eee 100644
--- a/lib/zip/errors.rb
+++ b/lib/zip/errors.rb
@@ -4,6 +4,7 @@ class EntryExistsError < Error; end
   class DestinationFileExistsError < Error; end
   class CompressionMethodError < Error; end
   class EntryNameError < Error; end
+  class EntrySizeError < Error; end
   class InternalError < Error; end
   class GPFBit3Error < Error; end
 
diff --git a/test/file_extract_test.rb b/test/file_extract_test.rb
index 57833fcb..6103aeae 100644
--- a/test/file_extract_test.rb
+++ b/test/file_extract_test.rb
@@ -10,6 +10,10 @@ def setup
     ::File.delete(EXTRACTED_FILENAME) if ::File.exist?(EXTRACTED_FILENAME)
   end
 
+  def teardown
+    ::Zip.reset!
+  end
+
   def test_extract
     ::Zip::File.open(TEST_ZIP.zip_name) do |zf|
       zf.extract(ENTRY_TO_EXTRACT, EXTRACTED_FILENAME)
@@ -80,4 +84,62 @@ def test_extract_non_entry_2
     end
     assert(!File.exist?(outFile))
   end
+
+  def test_extract_incorrect_size
+    # The uncompressed size fields in the zip file cannot be trusted. This makes
+    # it harder for callers to validate the sizes of the files they are
+    # extracting, which can lead to denial of service. See also
+    # https://en.wikipedia.org/wiki/Zip_bomb
+    Dir.mktmpdir do |tmp|
+      real_zip = File.join(tmp, 'real.zip')
+      fake_zip = File.join(tmp, 'fake.zip')
+      file_name = 'a'
+      true_size = 500_000
+      fake_size = 1
+
+      ::Zip::File.open(real_zip, ::Zip::File::CREATE) do |zf|
+        zf.get_output_stream(file_name) do |os|
+          os.write 'a' * true_size
+        end
+      end
+
+      compressed_size = nil
+      ::Zip::File.open(real_zip) do |zf|
+        a_entry = zf.find_entry(file_name)
+        compressed_size = a_entry.compressed_size
+        assert_equal true_size, a_entry.size
+      end
+
+      true_size_bytes = [compressed_size, true_size, file_name.size].pack('LLS')
+      fake_size_bytes = [compressed_size, fake_size, file_name.size].pack('LLS')
+
+      data = File.binread(real_zip)
+      assert data.include?(true_size_bytes)
+      data.gsub! true_size_bytes, fake_size_bytes
+
+      File.open(fake_zip, 'wb') do |file|
+        file.write data
+      end
+
+      Dir.chdir tmp do
+        ::Zip::File.open(fake_zip) do |zf|
+          a_entry = zf.find_entry(file_name)
+          assert_equal fake_size, a_entry.size
+
+          ::Zip.validate_entry_sizes = false
+          a_entry.extract
+          assert_equal true_size, File.size(file_name)
+          FileUtils.rm file_name
+
+          ::Zip.validate_entry_sizes = true
+          error = assert_raises ::Zip::EntrySizeError do
+            a_entry.extract
+          end
+          assert_equal \
+            'Entry a should be 1B but is larger when inflated',
+            error.message
+        end
+      end
+    end
+  end
 end