Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,21 +121,12 @@ excel_text = parser.parse_xlsx(excel_data)
| PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
| Word | .docx | `parse_docx` | Office Open XML format |
| Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
| PowerPoint | .pptx | - | **Not yet supported** - see [implementation plan](docs/PPTX_PLAN.md) |
| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
| Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
| JSON | .json | `parse_json` | Pretty-printed output |
| XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
| Text | .txt, .csv, .md | `parse_text` | With encoding detection |

### Note on PowerPoint Support

While PPTX files are listed in our features, they are not yet fully implemented. Currently, PPTX files will return binary data instead of extracted text. We have a detailed [implementation plan](docs/PPTX_PLAN.md) for adding proper PPTX support in a future release. This will involve:
- Adding ZIP archive handling capabilities
- Implementing XML extraction from PowerPoint slide files
- Following the same Office Open XML approach used for DOCX files

For now, if you need to extract text from PowerPoint files, we recommend converting them to PDF first.

## Performance

ParseKit is built with performance in mind:
Expand Down
1 change: 1 addition & 0 deletions ext/parsekit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ tesseract-rs = "0.1" # Tesseract with optional bundling
image = "0.25" # Image processing library (match rusty-tesseract's version)
calamine = "0.26" # Excel parsing
docx-rs = "0.4" # Word document parsing
zip = "2.1" # ZIP archive handling for PPTX
quick-xml = "0.36" # XML parsing
serde_json = "1.0" # JSON parsing
regex = "1.10" # Text parsing
Expand Down
140 changes: 140 additions & 0 deletions ext/parsekit/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ impl Parser {
match file_type.as_str() {
"pdf" => self.parse_pdf(data),
"docx" => self.parse_docx(data),
"pptx" => self.parse_pptx(data),
"xlsx" | "xls" => self.parse_xlsx(data),
"json" => self.parse_json(data),
"xml" | "html" => self.parse_xml(data),
Expand Down Expand Up @@ -328,6 +329,143 @@ impl Parser {
}
}

/// Parse PPTX (PowerPoint) files - exposed to Ruby
fn parse_pptx(&self, data: Vec<u8>) -> Result<String, Error> {
use std::io::{Cursor, Read};
use zip::ZipArchive;

let cursor = Cursor::new(data);
let mut archive = match ZipArchive::new(cursor) {
Ok(archive) => archive,
Err(e) => {
return Err(Error::new(
magnus::exception::runtime_error(),
format!("Failed to open PPTX as ZIP: {}", e),
))
}
};

let mut all_text = Vec::new();
let mut slide_numbers = Vec::new();

// First, collect slide numbers and sort them
for i in 0..archive.len() {
let file = match archive.by_index(i) {
Ok(file) => file,
Err(_) => continue,
};

let name = file.name();
// Match slide XML files (e.g., ppt/slides/slide1.xml)
if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") && !name.contains("_rels") {
// Extract slide number from filename
if let Some(num_str) = name
.strip_prefix("ppt/slides/slide")
.and_then(|s| s.strip_suffix(".xml"))
{
if let Ok(num) = num_str.parse::<usize>() {
slide_numbers.push((num, i));
}
}
}
}

// Sort by slide number to maintain order
slide_numbers.sort_by_key(|&(num, _)| num);

// Now process slides in order
for (_, index) in slide_numbers {
let mut file = match archive.by_index(index) {
Ok(file) => file,
Err(_) => continue,
};

let mut contents = String::new();
if file.read_to_string(&mut contents).is_ok() {
// Extract text from slide XML
let text = self.extract_text_from_slide_xml(&contents);
if !text.is_empty() {
all_text.push(text);
}
}
}

// Also extract notes if present
for i in 0..archive.len() {
let mut file = match archive.by_index(i) {
Ok(file) => file,
Err(_) => continue,
};

let name = file.name();
// Match notes slide XML files
if name.starts_with("ppt/notesSlides/notesSlide") && name.ends_with(".xml") && !name.contains("_rels") {
let mut contents = String::new();
if file.read_to_string(&mut contents).is_ok() {
let text = self.extract_text_from_slide_xml(&contents);
if !text.is_empty() {
all_text.push(format!("[Notes: {}]", text));
}
}
}
}

if all_text.is_empty() {
Ok("".to_string())
} else {
Ok(all_text.join("\n\n"))
}
}

/// Helper method to extract text from slide XML
fn extract_text_from_slide_xml(&self, xml_content: &str) -> String {
use quick_xml::events::Event;
use quick_xml::Reader;

let mut reader = Reader::from_str(xml_content);

let mut text_parts = Vec::new();
let mut buf = Vec::new();
let mut in_text_element = false;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
// Look for text elements (a:t or t)
let name = e.name();
let local_name_bytes = name.local_name();
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
if local_name == "t" {
in_text_element = true;
}
}
Ok(Event::Text(e)) => {
if in_text_element {
if let Ok(text) = e.unescape() {
let text_str = text.trim();
if !text_str.is_empty() {
text_parts.push(text_str.to_string());
}
}
}
}
Ok(Event::End(ref e)) => {
let name = e.name();
let local_name_bytes = name.local_name();
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
if local_name == "t" {
in_text_element = false;
}
}
Ok(Event::Eof) => break,
_ => {}
}
buf.clear();
}

text_parts.join(" ")
}

/// Parse Excel files - exposed to Ruby
fn parse_xlsx(&self, data: Vec<u8>) -> Result<String, Error> {
use calamine::{Reader, Xlsx};
Expand Down Expand Up @@ -486,6 +624,7 @@ impl Parser {
"htm".to_string(), // HTML files (alternative extension)
"md".to_string(), // Markdown files
"docx".to_string(),
"pptx".to_string(),
"xlsx".to_string(),
"xls".to_string(),
"csv".to_string(),
Expand Down Expand Up @@ -543,6 +682,7 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
// Individual parser methods exposed to Ruby
class.define_method("parse_pdf", method!(Parser::parse_pdf, 1))?;
class.define_method("parse_docx", method!(Parser::parse_docx, 1))?;
class.define_method("parse_pptx", method!(Parser::parse_pptx, 1))?;
class.define_method("parse_xlsx", method!(Parser::parse_xlsx, 1))?;
class.define_method("parse_json", method!(Parser::parse_json, 1))?;
class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
Expand Down
1 change: 1 addition & 0 deletions lib/parsekit/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def detect_format(path)

case ext.downcase
when 'docx' then :docx
when 'pptx' then :pptx
when 'xlsx', 'xls' then :xlsx
when 'pdf' then :pdf
when 'json' then :json
Expand Down
15 changes: 11 additions & 4 deletions spec/parsekit/integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,17 @@

result = parser.parse_file(pptx_file)
expect(result).to be_a(String)
# PPTX parsing appears to be broken - returns binary data
# This needs to be fixed in the parser implementation
# For now, we just check it returns a string
# TODO: Fix PPTX parsing and add proper content assertions
expect(result).not_to be_empty

# Check for content we know is in the sample PPTX
expect(result).to include("Microsoft Powerpoint document")
expect(result).to include("Bullet points")
expect(result).to include("Bold text")
expect(result).to include("Italic text")
expect(result).to include("Unicode")
expect(result).to include("Table example")
expect(result).to include("Column 1")
expect(result).to include("Data A")
end
end
end
Expand Down