splitbrain · splitbrain · May 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+/vendor/
+/composer.lock
+/.phpunit.cache/
+/.phpunit.result.cache
diff --git a/composer.json b/composer.json
@@ -0,0 +1,29 @@
+{
+    "name": "splitbrain/doc-extract",
+    "description": "Lightweight PHP library to extract plain text from DOCX, XLSX, PPTX and PDF files.",
+    "type": "library",
+    "license": "MIT",
+    "require": {
+        "php": "^8.1",
+        "ext-dom": "*",
+        "ext-xmlreader": "*",
+        "splitbrain/php-archive": "^1.4.2",
+        "smalot/pdfparser": "^2.10"
+    },
+    "require-dev": {
+        "phpunit/phpunit": "^10.5"
+    },
+    "autoload": {
+        "psr-4": {
+            "Splitbrain\\DocExtract\\": "src/"
+        }
+    },
+    "autoload-dev": {
+        "psr-4": {
+            "Splitbrain\\DocExtract\\Tests\\": "tests/"
+        }
+    },
+    "config": {
+        "sort-packages": true
+    }
+}
diff --git a/phpunit.xml.dist b/phpunit.xml.dist
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
+         bootstrap="vendor/autoload.php"
+         colors="true"
+         cacheDirectory=".phpunit.cache">
+    <testsuites>
+        <testsuite name="doc-extract">
+            <directory>tests</directory>
+        </testsuite>
+    </testsuites>
+    <source>
+        <include>
+            <directory>src</directory>
+        </include>
+    </source>
+</phpunit>
diff --git a/src/Exception/ExtractionException.php b/src/Exception/ExtractionException.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract\Exception;
+
+/**
+ * Signals that text could not be extracted from a file.
+ */
+class ExtractionException extends \RuntimeException {}
diff --git a/src/Exception/UnsupportedFormatException.php b/src/Exception/UnsupportedFormatException.php
@@ -0,0 +1,13 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract\Exception;
+
+/**
+ * Specialisation of ExtractionException for unsupported file formats.
+ *
+ * NOTE(review): the code that throws this is not part of this file — confirm
+ * the exact condition against the caller.
+ */
+class UnsupportedFormatException extends ExtractionException {}
diff --git a/src/Extractor.php b/src/Extractor.php
@@ -0,0 +1,28 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract;
+
+/**
+ * Contract for a format-specific plain-text extractor.
+ */
+interface Extractor
+{
+    /**
+     * Read the file at $path and return its content as plain text.
+     *
+     * @param string $path location of the file to read
+     * @throws Exception\ExtractionException on I/O or parse failure
+     */
+    public function extract(string $path): string;
+
+    /**
+     * Tell whether this extractor is able to process $path.
+     *
+     * The decision is based on the file extension.
+     *
+     * @param string $path location (or just the name) of the candidate file
+     */
+    public function supports(string $path): bool;
+}
diff --git a/src/Extractor/AbstractOoxmlExtractor.php b/src/Extractor/AbstractOoxmlExtractor.php
@@ -0,0 +1,229 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract\Extractor;
+
+use FilesystemIterator;
+use RecursiveDirectoryIterator;
+use RecursiveIteratorIterator;
+use Splitbrain\DocExtract\Exception\ExtractionException;
+use Splitbrain\DocExtract\Extractor;
+use splitbrain\PHPArchive\Zip;
+use XMLReader;
+
+/**
+ * Shared base for extractors of ZIP-packaged Office Open XML documents.
+ *
+ * extract() unpacks the archive into a private temporary directory, lets the
+ * subclass build the text via extractText(), and removes the directory again.
+ * Subclasses reach the unpacked parts through readPart() and listParts().
+ */
+abstract class AbstractOoxmlExtractor implements Extractor
+{
+    /** How many random directory names makeTempDir() tries before giving up. */
+    private const TEMP_DIR_ATTEMPTS = 5;
+
+    /** Directory holding the unpacked archive; empty while no extraction is running. */
+    private string $tempDir = '';
+
+    /**
+     * File extension (lower-case, without the dot) this extractor handles.
+     */
+    abstract protected function extension(): string;
+
+    /**
+     * Build the plain text from the parts unpacked by extract().
+     */
+    abstract protected function extractText(): string;
+
+    /**
+     * Whether the extension of $path equals extension(), ignoring case.
+     */
+    public function supports(string $path): bool
+    {
+        return strtolower(pathinfo($path, PATHINFO_EXTENSION)) === $this->extension();
+    }
+
+    /**
+     * Unpack the archive at $path and return the text built by extractText().
+     *
+     * The temporary directory is removed whether or not extraction succeeds.
+     *
+     * @throws ExtractionException if $path is not a file, or if unpacking or
+     *                             text extraction fails for any reason
+     */
+    public function extract(string $path): string
+    {
+        if (!is_file($path)) {
+            throw new ExtractionException("File not found: $path");
+        }
+
+        $this->tempDir = $this->makeTempDir();
+        try {
+            $zip = new Zip();
+            try {
+                $zip->open($path);
+                $zip->extract($this->tempDir);
+            } finally {
+                // Release the archive handle even when open() or extract() throws.
+                $zip->close();
+            }
+
+            return $this->extractText();
+        } catch (ExtractionException $e) {
+            // Already the documented exception type; pass it through unwrapped.
+            throw $e;
+        } catch (\Throwable $e) {
+            throw new ExtractionException(
+                "Failed to extract text from $path: " . $e->getMessage(),
+                0,
+                $e,
+            );
+        } finally {
+            if ($this->tempDir !== '') {
+                $this->cleanup($this->tempDir);
+                $this->tempDir = '';
+            }
+        }
+    }
+
+    /**
+     * Read one unpacked archive part.
+     *
+     * @param string $internalPath path relative to the archive root; leading slashes are ignored
+     * @return string|null the part's contents, or null if it is missing or unreadable
+     */
+    protected function readPart(string $internalPath): ?string
+    {
+        $full = $this->tempDir . '/' . ltrim($internalPath, '/');
+        if (!is_file($full)) {
+            return null;
+        }
+        $data = file_get_contents($full);
+        return $data === false ? null : $data;
+    }
+
+    /**
+     * List unpacked files whose archive-relative path starts with $prefix.
+     *
+     * @return string[] internal paths (relative to archive root) matching the prefix, naturally sorted
+     */
+    protected function listParts(string $prefix): array
+    {
+        if (!is_dir($this->tempDir)) {
+            return [];
+        }
+        $base = $this->tempDir . '/';
+        $results = [];
+        $it = new RecursiveIteratorIterator(
+            new RecursiveDirectoryIterator($this->tempDir, FilesystemIterator::SKIP_DOTS),
+        );
+        foreach ($it as $file) {
+            if (!$file->isFile()) {
+                continue;
+            }
+            // Normalise backslash separators so prefixes can always be written with "/".
+            $rel = str_replace('\\', '/', substr($file->getPathname(), strlen($base)));
+            if (str_starts_with($rel, $prefix)) {
+                $results[] = $rel;
+            }
+        }
+        sort($results, SORT_NATURAL);
+        return $results;
+    }
+
+    /**
+     * Stream-parse XML and concatenate text from elements matching $textElement.
+     *
+     * Elements are matched by local name, so namespace prefixes do not matter.
+     * A block element starts a new line unless the output is still empty or
+     * already ends with a newline. A tab element emits a tab only when it has
+     * no attributes: OOXML reuses the same element name, with attributes, for
+     * tab-stop definitions in paragraph properties, and those are not text.
+     *
+     * @param string   $xml           XML document to scan
+     * @param string   $textElement   local name of the element holding text
+     * @param string[] $blockElements local names that start a new line
+     * @param string[] $tabElements   local names that stand for a tab character
+     * @throws ExtractionException if the XML cannot be loaded into the reader
+     */
+    protected function extractTextFromXml(
+        string $xml,
+        string $textElement,
+        array $blockElements = [],
+        array $tabElements = [],
+    ): string {
+        $reader = new XMLReader();
+        if (!$reader->XML($xml, 'UTF-8', LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
+            throw new ExtractionException('Failed to parse XML');
+        }
+        try {
+            $out = '';
+            // Flipped so membership is an isset() lookup instead of a linear search.
+            $blocks = array_flip($blockElements);
+            $tabs = array_flip($tabElements);
+            while ($reader->read()) {
+                if ($reader->nodeType !== XMLReader::ELEMENT) {
+                    continue;
+                }
+                $local = $reader->localName;
+                if ($local === $textElement) {
+                    $out .= $reader->readString();
+                } elseif (isset($blocks[$local])) {
+                    if ($out !== '' && !str_ends_with($out, "\n")) {
+                        $out .= "\n";
+                    }
+                } elseif (isset($tabs[$local]) && !$reader->hasAttributes) {
+                    $out .= "\t";
+                }
+            }
+            return $out;
+        } finally {
+            $reader->close();
+        }
+    }
+
+    /**
+     * Create a fresh directory (requested mode 0700) under the system temp dir.
+     *
+     * @throws ExtractionException if no new directory could be created
+     */
+    private function makeTempDir(): string
+    {
+        $dir = '';
+        for ($attempt = 0; $attempt < self::TEMP_DIR_ATTEMPTS; $attempt++) {
+            $dir = sys_get_temp_dir() . '/doc-extract-' . bin2hex(random_bytes(8));
+            // Deliberately no "|| is_dir($dir)" fallback: a directory that already
+            // exists is not ours, and cleanup() would later delete its contents.
+            if (@mkdir($dir, 0700, true)) {
+                return $dir;
+            }
+        }
+        throw new ExtractionException("Could not create temp dir: $dir");
+    }
+
+    /**
+     * Best-effort recursive removal of $dir; failures are silently ignored.
+     */
+    private function cleanup(string $dir): void
+    {
+        if (!is_dir($dir)) {
+            return;
+        }
+        // CHILD_FIRST yields a directory's contents before the directory itself,
+        // so every rmdir() below sees an already emptied directory.
+        $it = new RecursiveIteratorIterator(
+            new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
+            RecursiveIteratorIterator::CHILD_FIRST,
+        );
+        foreach ($it as $file) {
+            if ($file->isDir()) {
+                @rmdir($file->getPathname());
+            } else {
+                @unlink($file->getPathname());
+            }
+        }
+        @rmdir($dir);
+    }
+}
diff --git a/src/Extractor/DocxExtractor.php b/src/Extractor/DocxExtractor.php
@@ -0,0 +1,66 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract\Extractor;
+
+use Splitbrain\DocExtract\Exception\ExtractionException;
+
+/**
+ * Extracts plain text from DOCX files: the main document body first,
+ * followed by every header part and then every footer part.
+ */
+final class DocxExtractor extends AbstractOoxmlExtractor
+{
+    protected function extension(): string
+    {
+        return 'docx';
+    }
+
+    /**
+     * Collect the text of word/document.xml plus all header and footer parts.
+     *
+     * @throws ExtractionException if word/document.xml cannot be read from the archive
+     */
+    protected function extractText(): string
+    {
+        $body = $this->readPart('word/document.xml');
+        if ($body === null) {
+            throw new ExtractionException('Not a valid DOCX file: missing word/document.xml');
+        }
+
+        $chunks = [$this->extractDocxText($body)];
+        // Headers are appended before footers, each group in listParts() order.
+        foreach (['word/header', 'word/footer'] as $prefix) {
+            foreach ($this->listParts($prefix) as $partPath) {
+                if (!str_ends_with($partPath, '.xml')) {
+                    continue;
+                }
+                $xml = $this->readPart($partPath);
+                if ($xml === null) {
+                    continue;
+                }
+                $chunks[] = $this->extractDocxText($xml);
+            }
+        }
+
+        // Drop parts that produced no text so they do not add blank lines.
+        $nonEmpty = [];
+        foreach ($chunks as $chunk) {
+            if ($chunk !== '') {
+                $nonEmpty[] = $chunk;
+            }
+        }
+
+        return trim(implode("\n", $nonEmpty));
+    }
+
+    /**
+     * Run the shared XML text scan with the element names used by DOCX parts:
+     * "t" holds text, "p" and "br" start a new line, "tab" is a tab.
+     */
+    private function extractDocxText(string $xml): string
+    {
+        return $this->extractTextFromXml($xml, 't', ['p', 'br'], ['tab']);
+    }
+}
diff --git a/src/Extractor/PdfExtractor.php b/src/Extractor/PdfExtractor.php
@@ -0,0 +1,45 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Splitbrain\DocExtract\Extractor;
+
+use Smalot\PdfParser\Parser;
+use Splitbrain\DocExtract\Exception\ExtractionException;
+use Splitbrain\DocExtract\Extractor;
+
+/**
+ * Extracts plain text from PDF files by delegating to smalot/pdfparser.
+ */
+final class PdfExtractor implements Extractor
+{
+    public function supports(string $path): bool
+    {
+        $extension = pathinfo($path, PATHINFO_EXTENSION);
+
+        return strtolower($extension) === 'pdf';
+    }
+
+    /**
+     * Parse the PDF at $path and return its text with surrounding whitespace trimmed.
+     *
+     * @throws ExtractionException if $path is not a file or parsing fails
+     */
+    public function extract(string $path): string
+    {
+        if (!is_file($path)) {
+            throw new ExtractionException("File not found: $path");
+        }
+
+        try {
+            $parser = new Parser();
+            $document = $parser->parseFile($path);
+
+            return trim($document->getText());
+        } catch (\Throwable $cause) {
+            $message = "Failed to extract text from $path: " . $cause->getMessage();
+
+            throw new ExtractionException($message, 0, $cause);
+        }
+    }
+}