From a9732372f531e435a3330d8ab5bd44ce2cb57b0b Mon Sep 17 00:00:00 2001 From: Ben Woosley Date: Sat, 22 Nov 2014 02:53:05 -0800 Subject: [PATCH] In Excelx, load styles, shared strings and the workbook lazily. Leave the tmpdir open so that reading may take place after initialize. The OS will be responsible for cleaning it up. --- CHANGELOG | 3 +- lib/roo/base.rb | 9 +- lib/roo/excelx.rb | 187 ++++++++++++++++++++++------------------- lib/roo/open_office.rb | 2 +- 4 files changed, 110 insertions(+), 91 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4b852702..a1a295fa 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,7 +6,8 @@ * Extract Roo::Excel and Roo::Excel2003XML to roo-xls * Extract Roo::Google to roo-google * Accept the tmpdir_root option in Roo::Excelx - * Chang the tmpdir prefix from oo_ to roo_ + * Change the tmpdir prefix from oo_ to roo_ + * In Excelx, load styles, shared strings and the workbook lazily. Leave the tmpdir open so that reading may take place after initialize. The OS will be responsible for cleaning it up. * bugfixes * Fix that paths with spaces in them would fail with diff --git a/lib/roo/base.rb b/lib/roo/base.rb index 45823cb0..5cf3137b 100644 --- a/lib/roo/base.rb +++ b/lib/roo/base.rb @@ -605,8 +605,13 @@ def reinitialize initialize(@filename) end - def make_tmpdir(tmp_root = nil, &block) - Dir.mktmpdir(TEMP_PREFIX, tmp_root || ENV['ROO_TMP'], &block) + def make_tmpdir(prefix = nil, root = nil, &block) + if prefix + prefix = TEMP_PREFIX + prefix + else + prefix = TEMP_PREFIX + end + Dir.mktmpdir(prefix, root || ENV['ROO_TMP'], &block) end def clean_sheet(sheet) diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb index decefc99..3a8b1f0e 100644 --- a/lib/roo/excelx.rb +++ b/lib/roo/excelx.rb @@ -71,32 +71,21 @@ def initialize(filename, options = {}) file_warning = options[:file_warning] || :error file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed) - make_tmpdir(options[:tmpdir_root]) do |tmpdir| - @filename = local_filename(filename, tmpdir, packed) - @comments_files = [] - @rels_files = [] - process_zipfile(tmpdir, @filename) - @workbook_doc = load_xml(File.join(tmpdir, "roo_workbook.xml")).remove_namespaces! - @shared_table = [] - if File.exist?(File.join(tmpdir, 'roo_sharedStrings.xml')) - @sharedstring_doc = load_xml(File.join(tmpdir, 'roo_sharedStrings.xml')).remove_namespaces! - read_shared_strings(@sharedstring_doc) - end - @styles_table = [] - @style_definitions = [] # TODO: ??? { |h,k| h[k] = {} } - if File.exist?(File.join(tmpdir, 'roo_styles.xml')) - @styles_doc = load_xml(File.join(tmpdir, 'roo_styles.xml')).remove_namespaces! - read_styles(@styles_doc) - end - @sheet_doc = load_xmls(@sheet_files) - @comments_doc = load_xmls(@comments_files) - @rels_doc = load_xmls(@rels_files) - end + + @tmpdir = make_tmpdir(filename.split('/').last, options[:tmpdir_root]) + @filename = local_filename(filename, @tmpdir, packed) + @comments_files = [] + @rels_files = [] + process_zipfile(@tmpdir, @filename) + @sheet_doc = load_xmls(@sheet_files) + @comments_doc = load_xmls(@comments_files) + @rels_doc = load_xmls(@rels_files) + super(filename, options) @formula = {} @excelx_type = {} @excelx_value = {} - @style = {} # TODO: ggf. wieder entfernen nur lokal benoetigt + @style = {} @comment = {} @comments_read = {} @hyperlink = {} @@ -184,10 +173,7 @@ def font(row, col, sheet=nil) sheet ||= @default_sheet read_cells(sheet) row,col = normalize(row,col) - style = @style[sheet][[row,col]] - style ||= 0 - style = style.to_i - @style_definitions[style] + style_definitions[@style[sheet][[row,col]].to_i] end # returns the type of a cell: @@ -238,13 +224,12 @@ def excelx_format(row,col,sheet=nil) sheet ||= @default_sheet read_cells(sheet) row,col = normalize(row,col) - style = @style[sheet][[row,col]] - style_format(style).to_s + style_format(@style[sheet][[row,col]]).to_s end # returns an array of sheet names in the spreadsheet def sheets - @workbook_doc.xpath("//sheet").map do |sheet| + workbook_doc.xpath("//sheet").map do |sheet| sheet['name'] end end @@ -329,6 +314,14 @@ def comments(sheet=nil) private + def workbook_doc + @workbook_doc ||= load_xml(File.join(@tmpdir, "roo_workbook.xml")) + end + + def load_xml(path) + super.remove_namespaces! + end + def load_xmls(paths) paths.compact.map do |item| load_xml(item).remove_namespaces! @@ -428,7 +421,7 @@ def read_cell_from_xml(sheet, cell_xml) when :shared value_type = :string excelx_type = :string - @shared_table[cell.content.to_i] + shared_strings[cell.content.to_i] when :boolean (cell.content.to_i == 1 ? 'TRUE' : 'FALSE') when :date @@ -511,7 +504,7 @@ def read_comments(sheet=nil) sheet ||= @default_sheet validate_sheet!(sheet) n = self.sheets.index(sheet) - return unless @comments_doc[n] #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + return unless @comments_doc[n] @comments_doc[n].xpath("//comments/commentList/comment").each do |comment| ref = comment.attributes['ref'].to_s row,col = self.class.split_coordinate(ref) @@ -544,7 +537,7 @@ def read_hyperlinks(sheet=nil) end def read_labels - @label ||= Hash[@workbook_doc.xpath("//definedName").map do |defined_name| + @label ||= Hash[workbook_doc.xpath("//definedName").map do |defined_name| # "Sheet1!$C$5" sheet, coordinates = defined_name.text.split('!$', 2) col,row = coordinates.split('$') @@ -555,102 +548,122 @@ def read_labels # Extracts all needed files from the zip file def process_zipfile(tmpdir, zipfilename) @sheet_files = [] - Roo::ZipFile.open(zipfilename) {|zf| - zf.entries.each {|entry| + Roo::ZipFile.open(zipfilename) do |zipfile| + zipfile.entries.each do |entry| entry_name = entry.to_s.downcase path = - if entry_name.end_with?('workbook.xml') + case entry_name + when /workbook.xml$/ "#{tmpdir}/roo_workbook.xml" - elsif entry_name.end_with?('sharedstrings.xml') + when /sharedstrings.xml$/ "#{tmpdir}/roo_sharedStrings.xml" - elsif entry_name.end_with?('styles.xml') + when /styles.xml$/ "#{tmpdir}/roo_styles.xml" - elsif entry_name =~ /sheet([0-9]+)?.xml$/ - nr = $1 - path = "#{tmpdir}/roo_sheet#{nr.to_i}" - + when /sheet.xml$/ + path = "#{tmpdir}/roo_sheet" + @sheet_files.unshift path + path + when /sheet([0-9]+).xml$/ # Numbers 3.1 exports first sheet without sheet number. Such sheets # are always added to the beginning of the array which, naturally, # causes other sheets to be pushed to the next index which could # lead to sheet references getting overwritten, so we need to # handle that case specifically. - if nr - sheet_files_index = nr.to_i - 1 - sheet_files_index += 1 if @sheet_files[sheet_files_index] - @sheet_files[sheet_files_index] = path - else - @sheet_files.unshift path - path - end - elsif entry_name =~ /comments([0-9]+).xml$/ + nr = $1 + sheet_files_index = nr.to_i - 1 + sheet_files_index += 1 if @sheet_files[sheet_files_index] + @sheet_files[sheet_files_index] = "#{tmpdir}/roo_sheet#{nr.to_i}" + when /comments([0-9]+).xml$/ nr = $1 @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}" - elsif entry_name =~ /sheet([0-9]+).xml.rels$/ + when /sheet([0-9]+).xml.rels$/ nr = $1 @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}" end if path - extract_file(zf, entry, path) + File.write(path, zipfile.read(entry), mode: 'wb') end - } - } - end - - def extract_file(source_zip, entry, destination_path) - File.open(destination_path,'wb') {|f| - f << source_zip.read(entry) - } + end + end end - # read the shared strings xml document - def read_shared_strings(doc) - doc.xpath("/sst/si").each do |si| - shared_table_entry = '' - si.children.each do |elem| - if elem.name == 'r' and elem.children - elem.children.each do |r_elem| - if r_elem.name == 't' - shared_table_entry << r_elem.content + def shared_strings + @shared_strings ||= + if File.exist?(shared_strings_path) + # read the shared strings xml document + xml = load_xml(shared_strings_path) + xml.xpath("/sst/si").map do |si| + shared_string = '' + si.children.each do |elem| + case elem.name + when 'r' + elem.children.each do |r_elem| + if r_elem.name == 't' + shared_string << r_elem.content + end + end + when 't' + shared_string = elem.content end end + shared_string end - if elem.name == 't' - shared_table_entry = elem.content - end + else + [] + end + end + + def shared_strings_path + @shared_strings_path ||= File.join(@tmpdir, 'roo_sharedStrings.xml') + end + + ##### STYLES + def style_definitions + @style_definitions ||= styles_doc.xpath("//cellXfs").flat_map do |xfs| + xfs.children.map do |xf| + fonts[xf['fontId'].to_i] end - @shared_table << shared_table_entry end end - # read the styles elements of an excelx document - def read_styles(doc) - @cellXfs = [] + def num_fmt_ids + @num_fmt_ids ||= styles_doc.xpath("//cellXfs").flat_map do |xfs| + xfs.children.map do |xf| + xf['numFmtId'] + end + end + end - @numFmts = Hash[doc.xpath("//numFmt").map do |numFmt| - [numFmt['numFmtId'], numFmt['formatCode']] + def num_fmts + @num_fmts ||= Hash[styles_doc.xpath("//numFmt").map do |num_fmt| + [num_fmt['numFmtId'], num_fmt['formatCode']] end] - fonts = doc.xpath("//fonts/font").map do |font_el| + end + + def fonts + @fonts ||= styles_doc.xpath("//fonts/font").map do |font_el| Font.new.tap do |font| font.bold = !font_el.xpath('./b').empty? font.italic = !font_el.xpath('./i').empty? font.underline = !font_el.xpath('./u').empty? end end + end - doc.xpath("//cellXfs").each do |xfs| - xfs.children.each do |xf| - @cellXfs << xf['numFmtId'] - @style_definitions << fonts[xf['fontId'].to_i] + def styles_doc + @styles_doc ||= + if File.exist?(File.join(@tmpdir, 'roo_styles.xml')) + load_xml(File.join(@tmpdir, 'roo_styles.xml')) end - end end # convert internal excelx attribute to a format def style_format(style) - id = @cellXfs[style.to_i] - @numFmts[id] || Format::STANDARD_FORMATS[id.to_i] + id = num_fmt_ids[style.to_i] + num_fmts[id] || Format::STANDARD_FORMATS[id.to_i] end + ###### END STYLES def base_date @base_date ||= @@ -658,7 +671,7 @@ def base_date # Default to 1900 (minus one day due to excel quirk) but use 1904 if # it's set in the Workbook's workbookPr # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx - @workbook_doc.css("workbookPr[date1904]").each do |workbookPr| + workbook_doc.css("workbookPr[date1904]").each do |workbookPr| if workbookPr["date1904"] =~ /true|1/i return Date.new(1904,01,01) end diff --git a/lib/roo/open_office.rb b/lib/roo/open_office.rb index abf54425..0666f6c9 100644 --- a/lib/roo/open_office.rb +++ b/lib/roo/open_office.rb @@ -35,7 +35,7 @@ def initialize(filename, options={}) file_warning = options[:file_warning] || :error file_type_check(filename,'.ods','an Roo::OpenOffice', file_warning, packed) - make_tmpdir(options[:tmpdir_root]) do |tmpdir| + make_tmpdir(nil, options[:tmpdir_root]) do |tmpdir| @filename = local_filename(filename, tmpdir, packed) #TODO: @cells_read[:default] = false self.class.extract_content(tmpdir, @filename)