From f220d180b358ba9a8f88ab5bc91d60bcfd1ac15e Mon Sep 17 00:00:00 2001 From: Chris Petersen Date: Fri, 5 Sep 2025 16:41:10 -0700 Subject: [PATCH] Working on pptx parsing --- README.md | 11 +-- ext/parsekit/Cargo.toml | 1 + ext/parsekit/src/parser.rs | 140 ++++++++++++++++++++++++++++++ lib/parsekit/parser.rb | 1 + spec/parsekit/integration_spec.rb | 15 +++- 5 files changed, 154 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0ddd611..2adac91 100644 --- a/README.md +++ b/README.md @@ -121,21 +121,12 @@ excel_text = parser.parse_xlsx(excel_data) | PDF | .pdf | `parse_pdf` | Text extraction via MuPDF | | Word | .docx | `parse_docx` | Office Open XML format | | Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats | -| PowerPoint | .pptx | - | **Not yet supported** - see [implementation plan](docs/PPTX_PLAN.md) | +| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes | | Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract | | JSON | .json | `parse_json` | Pretty-printed output | | XML/HTML | .xml, .html | `parse_xml` | Extracts text content | | Text | .txt, .csv, .md | `parse_text` | With encoding detection | -### Note on PowerPoint Support - -While PPTX files are listed in our features, they are not yet fully implemented. Currently, PPTX files will return binary data instead of extracted text. We have a detailed [implementation plan](docs/PPTX_PLAN.md) for adding proper PPTX support in a future release. This will involve: -- Adding ZIP archive handling capabilities -- Implementing XML extraction from PowerPoint slide files -- Following the same Office Open XML approach used for DOCX files - -For now, if you need to extract text from PowerPoint files, we recommend converting them to PDF first. - ## Performance ParseKit is built with performance in mind: diff --git a/ext/parsekit/Cargo.toml b/ext/parsekit/Cargo.toml index c8a615a..f56b2b6 100644 --- a/ext/parsekit/Cargo.toml +++ b/ext/parsekit/Cargo.toml @@ -20,6 +20,7 @@ tesseract-rs = "0.1" # Tesseract with optional bundling image = "0.25" # Image processing library (match rusty-tesseract's version) calamine = "0.26" # Excel parsing docx-rs = "0.4" # Word document parsing +zip = "2.1" # ZIP archive handling for PPTX quick-xml = "0.36" # XML parsing serde_json = "1.0" # JSON parsing regex = "1.10" # Text parsing diff --git a/ext/parsekit/src/parser.rs b/ext/parsekit/src/parser.rs index 074f470..6048c5c 100644 --- a/ext/parsekit/src/parser.rs +++ b/ext/parsekit/src/parser.rs @@ -78,6 +78,7 @@ impl Parser { match file_type.as_str() { "pdf" => self.parse_pdf(data), "docx" => self.parse_docx(data), + "pptx" => self.parse_pptx(data), "xlsx" | "xls" => self.parse_xlsx(data), "json" => self.parse_json(data), "xml" | "html" => self.parse_xml(data), @@ -328,6 +329,143 @@ impl Parser { } } + /// Parse PPTX (PowerPoint) files - exposed to Ruby + fn parse_pptx(&self, data: Vec) -> Result { + use std::io::{Cursor, Read}; + use zip::ZipArchive; + + let cursor = Cursor::new(data); + let mut archive = match ZipArchive::new(cursor) { + Ok(archive) => archive, + Err(e) => { + return Err(Error::new( + magnus::exception::runtime_error(), + format!("Failed to open PPTX as ZIP: {}", e), + )) + } + }; + + let mut all_text = Vec::new(); + let mut slide_numbers = Vec::new(); + + // First, collect slide numbers and sort them + for i in 0..archive.len() { + let file = match archive.by_index(i) { + Ok(file) => file, + Err(_) => continue, + }; + + let name = file.name(); + // Match slide XML files (e.g., ppt/slides/slide1.xml) + if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") && !name.contains("_rels") { + // Extract slide number from filename + if let Some(num_str) = name + .strip_prefix("ppt/slides/slide") + .and_then(|s| s.strip_suffix(".xml")) + { + if let Ok(num) = num_str.parse::() { + slide_numbers.push((num, i)); + } + } + } + } + + // Sort by slide number to maintain order + slide_numbers.sort_by_key(|&(num, _)| num); + + // Now process slides in order + for (_, index) in slide_numbers { + let mut file = match archive.by_index(index) { + Ok(file) => file, + Err(_) => continue, + }; + + let mut contents = String::new(); + if file.read_to_string(&mut contents).is_ok() { + // Extract text from slide XML + let text = self.extract_text_from_slide_xml(&contents); + if !text.is_empty() { + all_text.push(text); + } + } + } + + // Also extract notes if present + for i in 0..archive.len() { + let mut file = match archive.by_index(i) { + Ok(file) => file, + Err(_) => continue, + }; + + let name = file.name(); + // Match notes slide XML files + if name.starts_with("ppt/notesSlides/notesSlide") && name.ends_with(".xml") && !name.contains("_rels") { + let mut contents = String::new(); + if file.read_to_string(&mut contents).is_ok() { + let text = self.extract_text_from_slide_xml(&contents); + if !text.is_empty() { + all_text.push(format!("[Notes: {}]", text)); + } + } + } + } + + if all_text.is_empty() { + Ok("".to_string()) + } else { + Ok(all_text.join("\n\n")) + } + } + + /// Helper method to extract text from slide XML + fn extract_text_from_slide_xml(&self, xml_content: &str) -> String { + use quick_xml::events::Event; + use quick_xml::Reader; + + let mut reader = Reader::from_str(xml_content); + + let mut text_parts = Vec::new(); + let mut buf = Vec::new(); + let mut in_text_element = false; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + // Look for text elements (a:t or t) + let name = e.name(); + let local_name_bytes = name.local_name(); + let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or(""); + if local_name == "t" { + in_text_element = true; + } + } + Ok(Event::Text(e)) => { + if in_text_element { + if let Ok(text) = e.unescape() { + let text_str = text.trim(); + if !text_str.is_empty() { + text_parts.push(text_str.to_string()); + } + } + } + } + Ok(Event::End(ref e)) => { + let name = e.name(); + let local_name_bytes = name.local_name(); + let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or(""); + if local_name == "t" { + in_text_element = false; + } + } + Ok(Event::Eof) => break, + _ => {} + } + buf.clear(); + } + + text_parts.join(" ") + } + /// Parse Excel files - exposed to Ruby fn parse_xlsx(&self, data: Vec) -> Result { use calamine::{Reader, Xlsx}; @@ -486,6 +624,7 @@ impl Parser { "htm".to_string(), // HTML files (alternative extension) "md".to_string(), // Markdown files "docx".to_string(), + "pptx".to_string(), "xlsx".to_string(), "xls".to_string(), "csv".to_string(), @@ -543,6 +682,7 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> { // Individual parser methods exposed to Ruby class.define_method("parse_pdf", method!(Parser::parse_pdf, 1))?; class.define_method("parse_docx", method!(Parser::parse_docx, 1))?; + class.define_method("parse_pptx", method!(Parser::parse_pptx, 1))?; class.define_method("parse_xlsx", method!(Parser::parse_xlsx, 1))?; class.define_method("parse_json", method!(Parser::parse_json, 1))?; class.define_method("parse_xml", method!(Parser::parse_xml, 1))?; diff --git a/lib/parsekit/parser.rb b/lib/parsekit/parser.rb index 3df230c..24b7a17 100644 --- a/lib/parsekit/parser.rb +++ b/lib/parsekit/parser.rb @@ -89,6 +89,7 @@ def detect_format(path) case ext.downcase when 'docx' then :docx + when 'pptx' then :pptx when 'xlsx', 'xls' then :xlsx when 'pdf' then :pdf when 'json' then :json diff --git a/spec/parsekit/integration_spec.rb b/spec/parsekit/integration_spec.rb index 4fa1cb8..a56edfa 100644 --- a/spec/parsekit/integration_spec.rb +++ b/spec/parsekit/integration_spec.rb @@ -102,10 +102,17 @@ result = parser.parse_file(pptx_file) expect(result).to be_a(String) - # PPTX parsing appears to be broken - returns binary data - # This needs to be fixed in the parser implementation - # For now, we just check it returns a string - # TODO: Fix PPTX parsing and add proper content assertions + expect(result).not_to be_empty + + # Check for content we know is in the sample PPTX + expect(result).to include("Microsoft Powerpoint document") + expect(result).to include("Bullet points") + expect(result).to include("Bold text") + expect(result).to include("Italic text") + expect(result).to include("Unicode") + expect(result).to include("Table example") + expect(result).to include("Column 1") + expect(result).to include("Data A") end end end