Skip to content

Commit

Permalink
Strip zero-width non-joiners to prevent output issues
Browse files Browse the repository at this point in the history
Based on feature request in html2text_ruby: soundasleep/html2text_ruby#5
  • Loading branch information
soundasleep committed Feb 15, 2019
1 parent 9116a55 commit 45f1078
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Zero-width non-joiners are now stripped to prevent output issues, similar to non-breaking whitespace

### Fixed
- Fix namespace in composer [#67](https://github.com/soundasleep/html2text/pull/67)

Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
html2text [![Build Status](https://travis-ci.org/soundasleep/html2text.svg?branch=master)](https://travis-ci.org/soundasleep/html2text) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text)
=========

html2text is a very simple script that uses PHP's DOM methods to load from HTML, and then iterates over the resulting DOM to correctly output plain text. For example:
html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be
rendered by a browser - perfect for places where you need a quick text representation. For example:

```html
<html>
Expand Down
29 changes: 25 additions & 4 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ static function nbspCodes() {
);
}

/**
 * List the strings that are treated as a zero-width non-joiner (U+200C).
 *
 * @return array strings usable as the search argument of str_replace()
 */
static function zwnjCodes() {
    // UTF-8 byte encoding of U+200C.
    $utf8Bytes = "\xe2\x80\x8c";

    // NOTE(review): PHP only expands the braced form \u{...} (PHP 7+), so this
    // entry is the literal six-character text backslash-u-2-0-0-c rather than
    // the code point itself - confirm that matching the escaped spelling is
    // intended.
    $escapedText = "\u200c";

    return array($utf8Bytes, $escapedText);
}

/**
* Remove leading or trailing spaces and excess empty lines from provided multiline text
*
Expand All @@ -108,7 +115,7 @@ static function processWhitespaceNewlines($text) {
// convert non-breaking spaces to regular spaces to prevent output issues,
// do it here so they do NOT get removed with other leading spaces, as they
// are sometimes used for indentation
$text = str_replace(static::nbspCodes(), " ", $text);
$text = static::renderText($text);

// remove trailing whitespace
$text = rtrim($text);
Expand Down Expand Up @@ -178,8 +185,22 @@ static function isOfficeDocument($html) {
return strpos($html, "urn:schemas-microsoft-com:office") !== false;
}

/**
 * Swap special characters for plain-text equivalents to prevent output issues,
 * matching the goal of rendering documents as a browser would render them:
 * - every non-breaking space code becomes a regular space; and
 * - every zero-width non-joiner code is removed entirely.
 *
 * @param string $text text to clean up
 * @return string the text after both replacements
 */
static function renderText($text) {
    // Spaces are substituted first; the joiners are then stripped from that result.
    $withPlainSpaces = str_replace(static::nbspCodes(), " ", $text);

    return str_replace(static::zwnjCodes(), "", $withPlainSpaces);
}

/**
 * Tell whether the text is empty once special characters have been rendered
 * and surrounding newlines, carriage returns, tabs and spaces are trimmed.
 *
 * @param string $text text to inspect
 * @return bool true when nothing remains after trimming
 */
static function isWhitespace($text) {
    // Go through renderText() so non-breaking spaces count as spaces and
    // zero-width non-joiners are dropped before trimming. The earlier
    // nbsp-only return that preceded this one made it unreachable.
    return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
}

static function nextChildName($node) {
Expand Down Expand Up @@ -211,7 +232,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
$text = "\n" . trim(str_replace(static::nbspCodes(), " ", $node->wholeText), "\n\r\t ") . "\n";
$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";

// Remove trailing whitespace only
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
Expand All @@ -220,7 +241,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
return str_replace("\n", "\r", $text);

} else {
$text = str_replace(static::nbspCodes(), " ", $node->wholeText);
$text = static::renderText($node->wholeText);
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
Expand Down
4 changes: 4 additions & 0 deletions tests/Html2TextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ function testHugeMsoffice() {
$this->doTest("huge-msoffice");
}

/**
 * Run the "zero-width-non-joiners" fixture through doTest().
 */
function testZeroWidthNonJoiners() {
    $fixtureName = "zero-width-non-joiners";
    $this->doTest($fixtureName);
}

/**
* @expectedException PHPUnit\Framework\Error\Warning
*/
Expand Down
1 change: 1 addition & 0 deletions tests/zero-width-non-joiners.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<p>foo&zwnj;bar</p>
1 change: 1 addition & 0 deletions tests/zero-width-non-joiners.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
foobar

0 comments on commit 45f1078

Please sign in to comment.