Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/vendor/
/composer.lock
/.phpunit.cache/
/.phpunit.result.cache
29 changes: 29 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "splitbrain/doc-extract",
"description": "Lightweight PHP library to extract plain text from DOCX, XLSX, PPTX and PDF files.",
"type": "library",
"license": "MIT",
"require": {
"php": "^8.1",
"ext-dom": "*",
"ext-xmlreader": "*",
"splitbrain/php-archive": "^1.4.2",
"smalot/pdfparser": "^2.10"
},
"require-dev": {
"phpunit/phpunit": "^10.5"
},
"autoload": {
"psr-4": {
"Splitbrain\\DocExtract\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"Splitbrain\\DocExtract\\Tests\\": "tests/"
}
},
"config": {
"sort-packages": true
}
}
17 changes: 17 additions & 0 deletions phpunit.xml.dist
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
bootstrap="vendor/autoload.php"
colors="true"
cacheDirectory=".phpunit.cache">
<testsuites>
<testsuite name="doc-extract">
<directory>tests</directory>
</testsuite>
</testsuites>
<source>
<include>
<directory>src</directory>
</include>
</source>
</phpunit>
9 changes: 9 additions & 0 deletions src/Exception/ExtractionException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract\Exception;

/**
 * Base exception type for failures raised while extracting text.
 *
 * It extends \RuntimeException, so callers may catch either this class or the
 * SPL base type. The extractors wrap unexpected errors in this type and keep
 * the original error as the "previous" exception.
 */
class ExtractionException extends \RuntimeException
{
}
9 changes: 9 additions & 0 deletions src/Exception/UnsupportedFormatException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract\Exception;

/**
 * Specialised ExtractionException; catching ExtractionException also catches
 * this type.
 *
 * NOTE(review): judging by the name this is meant for files that no extractor
 * supports, but no throw site is visible in this part of the code - confirm
 * against the callers.
 */
class UnsupportedFormatException extends ExtractionException
{
}
20 changes: 20 additions & 0 deletions src/Extractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract;

/**
 * Contract implemented by every format-specific text extractor.
 *
 * Typical use is to ask supports() first and call extract() only when it
 * returns true.
 */
interface Extractor
{
/**
 * Extract plain text from the given file.
 *
 * @param string $path filesystem path of the document to read
 *
 * @return string the extracted plain text
 *
 * @throws Exception\ExtractionException on I/O or parse failure
 */
public function extract(string $path): string;

/**
 * Whether this extractor can handle the given file (based on extension).
 *
 * @param string $path filesystem path (or file name) to check
 *
 * @return bool true when this extractor accepts the file
 */
public function supports(string $path): bool;
}
160 changes: 160 additions & 0 deletions src/Extractor/AbstractOoxmlExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract\Extractor;

use FilesystemIterator;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;
use Splitbrain\DocExtract\Exception\ExtractionException;
use Splitbrain\DocExtract\Extractor;
use splitbrain\PHPArchive\Zip;
use XMLReader;

/**
 * Shared plumbing for extractors of ZIP-based Office Open XML documents.
 *
 * extract() unpacks the archive into a freshly created temporary directory,
 * delegates to the subclass's extractText() and always removes the directory
 * afterwards. Subclasses read the unpacked parts through readPart() and
 * listParts() and can turn XML into text with extractTextFromXml().
 */
abstract class AbstractOoxmlExtractor implements Extractor
{
    /** Directory holding the unpacked archive; empty string outside of extract(). */
    private string $tempDir = '';

    /**
     * File extension (without dot) handled by the concrete extractor.
     *
     * supports() compares it against the lower-cased extension of the path, so
     * it has to be returned in lower case.
     */
    abstract protected function extension(): string;

    /**
     * Build the plain-text result from the parts unpacked by extract().
     */
    abstract protected function extractText(): string;

    /**
     * Whether the path's extension (case-insensitive) equals extension().
     */
    public function supports(string $path): bool
    {
        return strtolower(pathinfo($path, PATHINFO_EXTENSION)) === $this->extension();
    }

    /**
     * Unpack the archive at $path and return the text produced by extractText().
     *
     * @throws ExtractionException when the file is missing, the temp directory
     *                             cannot be created, or unpacking/parsing fails
     */
    public function extract(string $path): string
    {
        if (!is_file($path)) {
            throw new ExtractionException("File not found: $path");
        }

        $this->tempDir = $this->makeTempDir();
        try {
            $zip = new Zip();
            $zip->open($path);
            try {
                $zip->extract($this->tempDir);
            } finally {
                // Release the archive handle even when unpacking fails.
                $zip->close();
            }

            return $this->extractText();
        } catch (ExtractionException $e) {
            // Already the library's own type: pass through unwrapped.
            throw $e;
        } catch (\Throwable $e) {
            throw new ExtractionException(
                "Failed to extract text from $path: " . $e->getMessage(),
                0,
                $e,
            );
        } finally {
            if ($this->tempDir !== '') {
                $this->cleanup($this->tempDir);
                $this->tempDir = '';
            }
        }
    }

    /**
     * Read one unpacked part.
     *
     * The resolved path must stay inside the temporary directory, so part
     * names taken from the (untrusted) document, e.g. ones containing "..",
     * cannot be used to read arbitrary files.
     *
     * @param string $internalPath path relative to the archive root
     *
     * @return string|null the part's contents, or null when no archive is
     *                     unpacked, the part does not exist, lies outside the
     *                     temp directory, or cannot be read
     */
    protected function readPart(string $internalPath): ?string
    {
        if ($this->tempDir === '') {
            // Outside of extract() the path would otherwise resolve against "/".
            return null;
        }

        $root = realpath($this->tempDir);
        $full = realpath($this->tempDir . '/' . ltrim($internalPath, '/'));
        if ($root === false || $full === false) {
            return null;
        }

        $rootPrefix = rtrim($root, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;
        if (!str_starts_with($full, $rootPrefix) || !is_file($full)) {
            return null;
        }

        $data = file_get_contents($full);
        return $data === false ? null : $data;
    }

    /**
     * List unpacked files whose archive-relative path starts with $prefix.
     *
     * @return string[] internal paths (relative to archive root, "/" separated)
     *                  matching the prefix, in natural sort order
     */
    protected function listParts(string $prefix): array
    {
        if (!is_dir($this->tempDir)) {
            return [];
        }
        $base = $this->tempDir . '/';
        $results = [];
        $it = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($this->tempDir, FilesystemIterator::SKIP_DOTS),
        );
        foreach ($it as $file) {
            if (!$file->isFile()) {
                continue;
            }
            // Normalise Windows separators so prefixes can always use "/".
            $rel = str_replace('\\', '/', substr($file->getPathname(), strlen($base)));
            if (str_starts_with($rel, $prefix)) {
                $results[] = $rel;
            }
        }
        sort($results, SORT_NATURAL);
        return $results;
    }

    /**
     * Stream-parse XML and concatenate text from elements matching $textElement.
     * Block elements emit a newline; tab elements emit a tab.
     *
     * Elements are matched by local name, i.e. without namespace prefix.
     * Subtrees of $skipElements are ignored entirely. The default covers the
     * OOXML tab-stop containers (WordprocessingML "tabs", DrawingML "tabLst"):
     * their children are also named "tab" but only define tab stops, so they
     * must not be rendered as tab characters.
     *
     * @param string   $xml           XML document to parse
     * @param string   $textElement   local name of elements whose text is collected
     * @param string[] $blockElements local names that start a new line
     * @param string[] $tabElements   local names that produce a tab character
     * @param string[] $skipElements  local names whose whole subtree is ignored;
     *                                takes precedence over the other lists
     *
     * @throws ExtractionException when the XML cannot be loaded
     */
    protected function extractTextFromXml(
        string $xml,
        string $textElement,
        array $blockElements = [],
        array $tabElements = [],
        array $skipElements = ['tabs', 'tabLst'],
    ): string {
        $reader = new XMLReader();
        if (!$reader->XML($xml, 'UTF-8', LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            throw new ExtractionException('Failed to parse XML');
        }
        try {
            $out = '';
            $blocks = array_flip($blockElements);
            $tabs = array_flip($tabElements);
            $skipped = array_flip($skipElements);
            // Depth of the element currently being skipped, or null when not skipping.
            $skipDepth = null;
            while ($reader->read()) {
                if ($skipDepth !== null) {
                    if ($reader->depth > $skipDepth) {
                        // Still inside the skipped subtree.
                        continue;
                    }
                    // Back at the skipped element's own depth (its end tag).
                    $skipDepth = null;
                }
                if ($reader->nodeType !== XMLReader::ELEMENT) {
                    continue;
                }
                $local = $reader->localName;
                if (isset($skipped[$local])) {
                    // An empty element has no subtree and no end tag to wait for.
                    if (!$reader->isEmptyElement) {
                        $skipDepth = $reader->depth;
                    }
                    continue;
                }
                if ($local === $textElement) {
                    $out .= $reader->readString();
                } elseif (isset($blocks[$local])) {
                    // Never start with a newline and never emit two in a row.
                    if ($out !== '' && !str_ends_with($out, "\n")) {
                        $out .= "\n";
                    }
                } elseif (isset($tabs[$local])) {
                    $out .= "\t";
                }
            }
            return $out;
        } finally {
            $reader->close();
        }
    }

    /**
     * Create a uniquely named, owner-only directory below the system temp dir.
     *
     * @throws ExtractionException when the directory cannot be created
     */
    private function makeTempDir(): string
    {
        $dir = sys_get_temp_dir() . '/doc-extract-' . bin2hex(random_bytes(8));
        if (!@mkdir($dir, 0700, true) && !is_dir($dir)) {
            throw new ExtractionException("Could not create temp dir: $dir");
        }
        return $dir;
    }

    /**
     * Best-effort recursive removal of $dir; failures are deliberately ignored.
     */
    private function cleanup(string $dir): void
    {
        if (!is_dir($dir)) {
            return;
        }
        // CHILD_FIRST yields directory contents before the directory itself.
        $it = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
            RecursiveIteratorIterator::CHILD_FIRST,
        );
        foreach ($it as $file) {
            if ($file->isDir()) {
                @rmdir($file->getPathname());
            } else {
                @unlink($file->getPathname());
            }
        }
        @rmdir($dir);
    }
}
56 changes: 56 additions & 0 deletions src/Extractor/DocxExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract\Extractor;

use Splitbrain\DocExtract\Exception\ExtractionException;

/**
 * Text extractor for DOCX files: main document body first, then headers,
 * then footers.
 */
final class DocxExtractor extends AbstractOoxmlExtractor
{
    protected function extension(): string
    {
        return 'docx';
    }

    /**
     * Collect the text of word/document.xml followed by every header and
     * footer part, joined by newlines; parts without text are left out.
     *
     * @throws ExtractionException when word/document.xml is missing
     */
    protected function extractText(): string
    {
        $body = $this->readPart('word/document.xml');
        if ($body === null) {
            throw new ExtractionException('Not a valid DOCX file: missing word/document.xml');
        }

        $chunks = [$this->extractDocxText($body)];

        // Headers are handled before footers; each group in listParts() order.
        foreach (['word/header', 'word/footer'] as $prefix) {
            foreach ($this->listParts($prefix) as $partPath) {
                if (!str_ends_with($partPath, '.xml')) {
                    continue;
                }
                $xml = $this->readPart($partPath);
                if ($xml === null) {
                    continue;
                }
                $chunks[] = $this->extractDocxText($xml);
            }
        }

        $nonEmpty = array_filter($chunks, fn ($p) => $p !== '');

        return trim(implode("\n", $nonEmpty));
    }

    /**
     * Run the shared XML text extraction with the DOCX element names:
     * "t" carries text, "p" and "br" break lines, "tab" becomes a tab.
     */
    private function extractDocxText(string $xml): string
    {
        return $this->extractTextFromXml(
            $xml,
            textElement: 't',
            blockElements: ['p', 'br'],
            tabElements: ['tab'],
        );
    }
}
34 changes: 34 additions & 0 deletions src/Extractor/PdfExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?php

declare(strict_types=1);

namespace Splitbrain\DocExtract\Extractor;

use Smalot\PdfParser\Parser;
use Splitbrain\DocExtract\Exception\ExtractionException;
use Splitbrain\DocExtract\Extractor;

/**
 * Text extractor for PDF files, backed by the Smalot PDF parser.
 */
final class PdfExtractor implements Extractor
{
    /**
     * True when the path's extension is "pdf", compared case-insensitively.
     */
    public function supports(string $path): bool
    {
        $extension = pathinfo($path, PATHINFO_EXTENSION);

        return strtolower($extension) === 'pdf';
    }

    /**
     * Parse the PDF at $path and return its text with surrounding whitespace
     * trimmed.
     *
     * @throws ExtractionException when the file is missing or parsing fails;
     *                             the parser's error is kept as "previous"
     */
    public function extract(string $path): string
    {
        if (!is_file($path)) {
            throw new ExtractionException("File not found: $path");
        }

        try {
            $parser = new Parser();
            $document = $parser->parseFile($path);
            $text = $document->getText();

            return trim($text);
        } catch (\Throwable $error) {
            $message = "Failed to extract text from $path: " . $error->getMessage();

            throw new ExtractionException($message, 0, $error);
        }
    }
}
Loading