Skip to content
This repository

Index IO and ObjectHash to prevent excessive pdf sizes #418

Merged
merged 1 commit into from over 1 year ago

2 participants

Jonathan Brad Ediger
Jonathan
Collaborator
  • use object_id for filename or streams as index keys
Jonathan Index IO and ObjectHash to prevent excessive pdf sizes
- use object_id for filename or streams as index keys
419eca8
Brad Ediger bradediger merged commit 419eca8 into from October 31, 2012
Brad Ediger bradediger closed this October 31, 2012
Brad Ediger
Collaborator

:+1: Merged.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Showing 1 unique commit by 1 author.

Oct 30, 2012
Jonathan Index IO and ObjectHash to prevent excessive pdf sizes
- use object_id for filename or streams as index keys
419eca8
This page is out of date. Refresh to see the latest.
67  lib/prawn/core/object_store.rb
@@ -143,23 +143,30 @@ def object_id_for_page(k)
143 143
       #
144 144
       def import_page(input, page_num)
145 145
         @loaded_objects = {}
146  
-        
147  
-        io = if input.respond_to?(:seek) && input.respond_to?(:read)
148  
-          input
149  
-        elsif File.file?(input.to_s)
150  
-          StringIO.new(File.binread(input.to_s))
151  
-        else
152  
-          raise ArgumentError, "input must be an IO-like object or a filename"
  146
+        if template_id = indexed_template(input, page_num)
  147
+          return template_id
153 148
         end
154 149
 
155  
-        # unless File.file?(filename)
  150
+        io = if input.respond_to?(:seek) && input.respond_to?(:read)
  151
+               input
  152
+             elsif File.file?(input.to_s)
  153
+               StringIO.new(File.binread(input.to_s))
  154
+             else
  155
+               raise ArgumentError, "input must be an IO-like object or a filename"
  156
+             end
  157
+
  158
+                # unless File.file?(filename)
156 159
         #   raise ArgumentError, "#{filename} does not exist"
157 160
         # end
158 161
 
159  
-        hash = PDF::Reader::ObjectHash.new(io)
  162
+        hash = indexed_hash(input, io)
160 163
         ref  = hash.page_references[page_num - 1]
161 164
 
162  
-        ref.nil? ? nil : load_object_graph(hash, ref).identifier
  165
+        if ref.nil?
  166
+          nil
  167
+        else
  168
+          index_template(input, page_num, load_object_graph(hash, ref).identifier)
  169
+        end
163 170
 
164 171
       rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError
165 172
         msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug."
@@ -171,6 +178,46 @@ def import_page(input, page_num)
171 178
 
172 179
       private
173 180
 
  181
+      # An index for page templates so that their loaded object graph
  182
+      # can be reused without being loaded multiple times
  183
+      def template_index
  184
+        @template_index ||= {}
  185
+      end
  186
+
  187
+      # An index for the read object hash of a pdf template so that the
  188
+      # object hash does not need to be parsed multiple times when using
  189
+      # different pages of the pdf as page templates
  190
+      def hash_index
  191
+        @hash_index ||= {}
  192
+      end
  193
+
  194
+      # returns the indexed object graph identifier for a template page if
  195
+      # it exists
  196
+      def indexed_template(input, page_number)
  197
+        key = indexing_key(input)
  198
+        template_index[key] && template_index[key][page_number]
  199
+      end
  200
+
  201
+      # indexes the identifier for a page from a template
  202
+      def index_template(input, page_number, id)
  203
+        (template_index[indexing_key(input)] ||= {})[page_number] ||= id
  204
+      end
  205
+
  206
+      # reads and indexes a new IO for a template
  207
+      # if the IO has been indexed already then the parsed object hash
  208
+      # is returned directly
  209
+      def indexed_hash(input, io)
  210
+        hash_index[indexing_key(input)] ||= PDF::Reader::ObjectHash.new(io)
  211
+      end
  212
+
  213
+      # the index key for the input.
  214
+      # uses object_id so that either a string filename or an IO stream can be
  215
+      # indexed and reused provided the same object gets used in multiple page
  216
+      # template calls.
  217
+      def indexing_key(input)
  218
+        input.object_id
  219
+      end
  220
+
174 221
       # returns a nested array of object IDs for all pages in this object store.
175 222
       #
176 223
       def get_page_objects(obj)
25  lib/prawn/document.rb
@@ -241,6 +241,9 @@ def page
241 241
     #
242 242
     #  pdf.start_new_page(:template => multipage_template.pdf, :template_page => 2)
243 243
     #
  244
+    # Note: templates are indexed by the object_id of the filename or stream that is
  245
+    # passed in, so if you reuse the same template multiple times, be sure to pass the
  246
+    # same instance for more efficient use of resources and smaller rendered pdfs.
244 247
     def start_new_page(options = {})
245 248
       if last_page = state.page
246 249
         last_page_size    = last_page.size
@@ -539,15 +542,15 @@ def @bounding_box.move_past_bottom
539 542
     # through existing pages after they are created.
540 543
     #
541 544
     # Parameters are:
542  
-    # 
543  
-    # <tt>string</tt>:: Template string for page number wording.  
  545
+    #
  546
+    # <tt>string</tt>:: Template string for page number wording.
544 547
     #                   Should include '<page>' and, optionally, '<total>'.
545 548
     # <tt>options</tt>:: A hash for page numbering and text box options.
546  
-    #     <tt>:page_filter</tt>:: A filter to specify which pages to place page numbers on.  
  549
+    #     <tt>:page_filter</tt>:: A filter to specify which pages to place page numbers on.
547 550
     #                             Refer to the method 'page_match?'
548 551
     #     <tt>:start_count_at</tt>:: The starting count to increment pages from.
549 552
     #     <tt>:total_pages</tt>:: If provided, will replace <total> with the value given.
550  
-    #                             Useful to override the total number of pages when using 
  553
+    #                             Useful to override the total number of pages when using
551 554
     #                             the start_count_at option.
552 555
     #     <tt>:color</tt>:: Text fill color.
553 556
     #
@@ -558,7 +561,7 @@ def @bounding_box.move_past_bottom
558 561
     #          five.
559 562
     #
560 563
     #   Prawn::Document.generate("page_with_numbering.pdf") do
561  
-    #     number_pages "<page> in a total of <total>", 
  564
+    #     number_pages "<page> in a total of <total>",
562 565
     #                                          {:start_count_at => 5,
563 566
     #                                           :page_filter => lambda{ |pg| pg != 1 },
564 567
     #                                           :at => [bounds.right - 50, 0],
@@ -578,7 +581,7 @@ def number_pages(string, options={})
578 581
       txtcolor = opts.delete(:color)
579 582
       # An explicit height so that we can draw page numbers in the margins
580 583
       opts[:height] = 50 unless opts.has_key?(:height)
581  
-      
  584
+
582 585
       start_count = false
583 586
       pseudopage = 0
584 587
       (1..page_count).each do |p|
@@ -589,7 +592,7 @@ def number_pages(string, options={})
589 592
                        else
590 593
                          start_count_at.to_i
591 594
                        end
592  
-        end        
  595
+        end
593 596
         if page_match?(page_filter, p)
594 597
           go_to_page(p)
595 598
           # have to use fill_color here otherwise text reverts back to default fill color
@@ -598,13 +601,13 @@ def number_pages(string, options={})
598 601
           str = string.gsub("<page>","#{pseudopage}").gsub("<total>","#{total_pages}")
599 602
           text_box str, opts
600 603
           start_count = true  # increment page count as soon as first match found
601  
-        end 
  604
+        end
602 605
         pseudopage += 1 if start_count
603 606
       end
604 607
     end
605 608
 
606 609
     # Provides a way to execute a block of code repeatedly based on a
607  
-    # page_filter.  
  610
+    # page_filter.
608 611
     #
609 612
     # Available page filters are:
610 613
     #   :all         repeats on every page
@@ -612,7 +615,7 @@ def number_pages(string, options={})
612 615
     #   :even        repeats on even pages
613 616
     #   some_array   repeats on every page listed in the array
614 617
     #   some_range   repeats on every page included in the range
615  
-    #   some_lambda  yields page number and repeats for true return values 
  618
+    #   some_lambda  yields page number and repeats for true return values
616 619
     def page_match?(page_filter, page_number)
617 620
       case page_filter
618 621
       when :all
@@ -626,7 +629,7 @@ def page_match?(page_filter, page_number)
626 629
       when Proc
627 630
         page_filter.call(page_number)
628 631
       end
629  
-    end  
  632
+    end
630 633
 
631 634
 
632 635
     # Returns true if content streams will be compressed before rendering,
49  spec/template_spec.rb
@@ -40,7 +40,7 @@
40 40
                                       :bottom => 36 }
41 41
 
42 42
 
43  
- 
  43
+
44 44
   end
45 45
 
46 46
   it "should not add an extra restore_graphics_state operator to the end of any content stream" do
@@ -57,7 +57,7 @@
57 57
       data.include?("QQ").should == false
58 58
     end
59 59
   end
60  
-    
  60
+
61 61
   it "should have a single page object if importing a single page template" do
62 62
     filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
63 63
 
@@ -161,14 +161,14 @@
161 161
     str = @pdf.render
162 162
     str[0,4].should == "%PDF"
163 163
   end
164  
-  
  164
+
165 165
   context "with the template as a stream" do
166 166
     it "should correctly import a template file from a stream" do
167 167
       filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
168  
-      io = StringIO.new(File.read(filename))      
  168
+      io = StringIO.new(File.read(filename))
169 169
       @pdf = Prawn::Document.new(:template => io)
170 170
       str = @pdf.render
171  
-      str[0,4].should == "%PDF"      
  171
+      str[0,4].should == "%PDF"
172 172
     end
173 173
   end
174 174
 
@@ -176,19 +176,19 @@
176 176
 
177 177
 describe "Document#start_new_page with :template option" do
178 178
   filename = "#{Prawn::BASEDIR}/spec/data/curves.pdf"
179  
-  
  179
+
180 180
   it "should set the imported page's parent to the document pages catalog" do
181 181
     @pdf = Prawn::Document.new()
182 182
     @pdf.start_new_page(:template => filename)
183 183
     @pdf.state.page.dictionary.data[:Parent].should == @pdf.state.store.pages
184 184
   end
185  
-  
  185
+
186 186
   it "should set start the Y cursor at the top of the page" do
187 187
     @pdf = Prawn::Document.new()
188 188
     @pdf.start_new_page(:template => filename)
189 189
     (@pdf.y == nil).should == false
190 190
   end
191  
-  
  191
+
192 192
   it "should respect margins set by Prawn" do
193 193
     @pdf = Prawn::Document.new(:margin => 0)
194 194
     @pdf.start_new_page(:template => filename)
@@ -209,7 +209,7 @@
209 209
                                       :top    => 36,
210 210
                                       :bottom => 36 }
211 211
   end
212  
-  
  212
+
213 213
   it "should not add an extra restore_graphics_state operator to the end of any content stream" do
214 214
     @pdf = Prawn::Document.new
215 215
     @pdf.start_new_page(:template => filename)
@@ -223,7 +223,7 @@
223 223
       data.include?("QQ").should == false
224 224
     end
225 225
   end
226  
-  
  226
+
227 227
   it "should have two content streams if importing a single page template" do
228 228
     filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
229 229
     @pdf = Prawn::Document.new()
@@ -234,7 +234,7 @@
234 234
     template_page = hash[pages[1]]
235 235
     template_page[:Contents].size.should == 2
236 236
   end
237  
-  
  237
+
238 238
   it "should have balance q/Q operators on all content streams" do
239 239
     filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
240 240
 
@@ -251,7 +251,7 @@
251 251
       data.scan("Q").size.should == 1
252 252
     end
253 253
   end
254  
-  
  254
+
255 255
   it "should allow text to be added to a single page template" do
256 256
 
257 257
     @pdf = Prawn::Document.new()
@@ -262,7 +262,7 @@
262 262
     text = PDF::Inspector::Text.analyze(@pdf.render)
263 263
     text.strings.first.should == "Adding some text"
264 264
   end
265  
-  
  265
+
266 266
   it "should allow PDFs with page resources behind an indirect object to be used as templates" do
267 267
     filename = "#{Prawn::DATADIR}/pdfs/resources_as_indirect_object.pdf"
268 268
 
@@ -275,7 +275,7 @@
275 275
     all_text = text.strings.join
276 276
     all_text.include?("Adding some text").should == true
277 277
   end
278  
-  
  278
+
279 279
   it "should correctly add a TTF font to a template that has existing fonts" do
280 280
     filename = "#{Prawn::DATADIR}/pdfs/contains_ttf_font.pdf"
281 281
     @pdf = Prawn::Document.new()
@@ -293,20 +293,31 @@
293 293
     fonts = resources[:Font]
294 294
     fonts.size.should == 2
295 295
   end
296  
-  
  296
+
  297
+  it "indexes template pages when used multiple times" do
  298
+    filename = "#{Prawn::DATADIR}/pdfs/multipage_template.pdf"
  299
+    @repeated_pdf = Prawn::Document.new()
  300
+    3.times { @repeated_pdf.start_new_page(:template => filename) }
  301
+    repeated_hash = PDF::Reader::ObjectHash.new(StringIO.new(@repeated_pdf.render))
  302
+    @sequential_pdf = Prawn::Document.new()
  303
+    (1..3).each { |p| @sequential_pdf.start_new_page(:template => filename, :template_page => p ) }
  304
+    sequential_hash = PDF::Reader::ObjectHash.new(StringIO.new(@sequential_pdf.render))
  305
+    (repeated_hash.size < sequential_hash.size).should == true
  306
+  end
  307
+
297 308
   context "with the template as a stream" do
298 309
     it "should correctly import a template file from a stream" do
299 310
       filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
300 311
       io = StringIO.new(File.read(filename))
301  
-      
  312
+
302 313
       @pdf = Prawn::Document.new()
303 314
       @pdf.start_new_page(:template => io)
304  
-      
  315
+
305 316
       str = @pdf.render
306  
-      str[0,4].should == "%PDF"      
  317
+      str[0,4].should == "%PDF"
307 318
     end
308 319
   end
309  
-  
  320
+
310 321
   context "using template_page option" do
311 322
     it "uses the specified page option" do
312 323
       filename = "#{Prawn::DATADIR}/pdfs/multipage_template.pdf"
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.