Permalink
Browse files

ENHANCEMENT More flexible URL filtering through new URLSegmentFilter …

…API. Support for multibyte URL segments through URLSegmentFilter::$default_allow_multibyte. Abstraction from Convert::raw2url() (and SiteTree->generateURLSegment())
  • Loading branch information...
1 parent 4a2fe98 commit 9b27a4c1be23fdfcdcac84597c337b80718443ba @chillu chillu committed Nov 14, 2011
View
@@ -349,21 +349,13 @@ static function raw2mailto($data) {
/**
* Convert a string (normally a title) to a string suitable for using in
- * urls and other html attributes
+ * urls and other html attributes. Uses {@link URLSegmentFilter}.
*
* @param string
- *
* @return string
*/
public static function raw2url($title) {
- $t = (function_exists('mb_strtolower')) ? mb_strtolower($title) : strtolower($title);
- $t = Object::create('Transliterator')->toASCII($t);
- $t = str_replace('&','-and-',$t);
- $t = str_replace('&','-and-',$t);
- $t = ereg_replace('[^A-Za-z0-9]+','-',$t);
- $t = ereg_replace('-+','-',$t);
- $t = trim($t, '-');
-
- return $t;
+ $f = Object::create('URLSegmentFilter');
+ return $f->filter($title);
}
}
View
@@ -75,6 +75,20 @@ This means all formats are defined in
[ISO date format](http://framework.zend.com/manual/en/zend.date.constants.html#zend.date.constants.selfdefinedformats),
not PHP's built-in [date()](http://nz.php.net/manual/en/function.date.php).
+### i18n in URLs
+
+By default, URLs for pages in SilverStripe (the `SiteTree->URLSegment` property)
+are automatically reduced to the allowed subset of ASCII characters.
+If characters outside this subset are added, they are either removed or (if possible) "transliterated".
+This describes the process of converting from one character set to another
+while keeping characters recognizable. For example, vowels with French accents
+are replaced with their base characters, `pâté` becomes `pate`.
+
+In order to allow for so-called "multibyte" characters outside of the ASCII subset,
+relax the character filtering in the underlying class: `URLSegmentFilter::$default_allow_multibyte = true`
+
+Please refer to [W3C: Introduction to IDN and IRI](http://www.w3.org/International/articles/idn-and-iri/) for more details.
+
### i18n in Form Fields
Date- and time related form fields support i18n ([api:DateField], [api:TimeField], [api:DatetimeField]).
@@ -23,6 +23,8 @@
* FileNameFilter::$default_use_transliterator = false;
* FileNameFilter::$default_replacements = array();
* </code>
+ *
+ * See {@link URLSegmentFilter} for a more generic implementation.
*/
class FileNameFilter {
View
@@ -0,0 +1,137 @@
+<?php
+/**
+ * @package sapphire
+ * @subpackage model
+ */
+
+/**
+ * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
+ * Uses {@link Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
+ * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
+ *
+ * Caution: Should not be used on full URIs with domains or query parameters.
+ * In order to retain forward slashes in a path, each individual segment needs to be filtered individually.
+ *
+ * See {@link FileNameFilter} for similar implementation for filesystem-based URLs.
+ */
+class URLSegmentFilter {
+
+ /**
+ * Necessary to support {@link Object::create()}
+ */
+ function __construct() {}
+
+ /**
+ * Global default consulted by {@link getTransliterator()}: when true and no
+ * transliterator has been set on the instance, one is created on demand.
+ *
+ * @var Boolean
+ */
+ static $default_use_transliterator = true;
+
+ /**
+ * Default map of regular expression => replacement, applied in array order
+ * by {@link filter()} via preg_replace(). Used whenever the instance has no
+ * (or an empty) replacement map of its own.
+ *
+ * Note: these rules do not trim trailing dashes or dots.
+ *
+ * @var Array See {@link setReplacements()}.
+ */
+ static $default_replacements = array(
+ '/&amp;/u' => '-and-',
+ '/&/u' => '-and-',
+ '/\s/u' => '-', // replace each whitespace character with a dash
+ '/_/u' => '-', // underscores to dashes
+ '/[^A-Za-z0-9+.-]+/u' => '', // remove everything except ASCII alphanumerics, plus sign, dot and dash
+ '/[\-]{2,}/u' => '-', // collapse runs of two or more dashes into one
+ '/^[\.\-_]/u' => '', // remove a single leading dot, dash or underscore (no quantifier, so only the first character)
+ );
+
+ /**
+ * Doesn't try to replace or transliterate non-ASCII characters.
+ * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
+ * as well as better search engine optimization for URLs.
+ * This is the global default; it applies when no value has been set
+ * on the instance through {@link setAllowMultibyte()}.
+ * @see http://www.ietf.org/rfc/rfc3987
+ *
+ * @var boolean
+ */
+ static $default_allow_multibyte = false;
+
+ /**
+ * Instance-level replacement map. When empty, {@link getReplacements()}
+ * falls back to {@link $default_replacements}.
+ *
+ * @var Array See {@link setReplacements()}
+ */
+ public $replacements = array();
+
+ /**
+ * Filters a single URL segment in three steps:
+ * 1. Unless multibyte characters are allowed, passes the value through the
+ *    transliterator's toASCII() (if a transliterator is available).
+ * 2. Lowercases the value (mb_strtolower() when available, otherwise strtolower()).
+ * 3. Applies every regex => replacement pair from {@link getReplacements()} in order.
+ *
+ * When multibyte characters are allowed, the rule stored under the exact key
+ * '/[^A-Za-z0-9+.-]+/u' is dropped before step 3. Custom replacement maps that
+ * express the same restriction under a different pattern are not affected.
+ *
+ * Note: Depending on the applied replacement rules, this method might result in an empty string.
+ *
+ * @param String URL path (without domain or query parameters), in utf8 encoding
+ * @return String A filtered path compatible with RFC 3986
+ */
+ function filter($name) {
+ if(!$this->getAllowMultibyte()) {
+ // Only transliterate when no multibyte support is requested;
+ // getTransliterator() may return null or false, in which case this step is skipped
+ $transliterator = $this->getTransliterator();
+ if($transliterator) $name = $transliterator->toASCII($name);
+ }
+
+ $name = (function_exists('mb_strtolower')) ? mb_strtolower($name) : strtolower($name);
+ $replacements = $this->getReplacements();
+ if($this->getAllowMultibyte()) {
+ // drop the automated removal of non-ASCII characters (matched by its exact pattern key)
+ if(isset($replacements['/[^A-Za-z0-9+.-]+/u'])) unset($replacements['/[^A-Za-z0-9+.-]+/u']);
+ }
+ foreach($replacements as $regex => $replace) {
+ $name = preg_replace($regex, $replace, $name);
+ }
+
+ return $name;
+ }
+
+ /**
+ * Sets the instance-level replacement map. Passing an empty array
+ * effectively restores the defaults, see {@link getReplacements()}.
+ *
+ * @param Array Map of find/replace used for preg_replace().
+ */
+ function setReplacements($r) {
+ $this->replacements = $r;
+ }
+
+ /**
+ * Returns the instance-level replacement map if it is non-empty,
+ * otherwise {@link $default_replacements}.
+ *
+ * @return Array
+ */
+ function getReplacements() {
+ return ($this->replacements) ? $this->replacements : self::$default_replacements;
+ }
+
+ /**
+ * Transliterator used by {@link filter()}. Null means "not yet decided",
+ * any other value (including false) is returned as-is by {@link getTransliterator()}.
+ *
+ * @var Transliterator
+ */
+ protected $transliterator;
+
+ /**
+ * Lazily creates a Transliterator through {@link Object::create()} when none
+ * has been set and {@link $default_use_transliterator} is true.
+ * Returns whatever was passed to {@link setTransliterator()} otherwise,
+ * which may be false.
+ *
+ * @return Transliterator|NULL
+ */
+ function getTransliterator() {
+ if($this->transliterator === null && self::$default_use_transliterator) {
+ $this->transliterator = Object::create('Transliterator');
+ }
+ return $this->transliterator;
+ }
+
+ /**
+ * Sets the transliterator for this instance. Passing false disables
+ * transliteration, since only a null value triggers the lazy default
+ * in {@link getTransliterator()}.
+ *
+ * @param Transliterator|FALSE
+ */
+ function setTransliterator($t) {
+ $this->transliterator = $t;
+ }
+
+ /**
+ * Instance-level override for {@link $default_allow_multibyte}.
+ * Null means "use the global default".
+ *
+ * @var boolean
+ */
+ protected $allowMultibyte;
+
+ /**
+ * Overrides {@link $default_allow_multibyte} for this instance.
+ *
+ * @param boolean
+ */
+ function setAllowMultibyte($bool) {
+ $this->allowMultibyte = $bool;
+ }
+
+ /**
+ * Returns the instance-level setting if one was set (not null),
+ * otherwise {@link $default_allow_multibyte}.
+ *
+ * @return boolean
+ */
+ function getAllowMultibyte() {
+ return ($this->allowMultibyte !== null) ? $this->allowMultibyte : self::$default_allow_multibyte;
+ }
+}
@@ -104,7 +104,7 @@ function testRaw2URL() {
$this->assertEquals('foo', Convert::raw2url('foo'));
$this->assertEquals('foo-and-bar', Convert::raw2url('foo & bar'));
$this->assertEquals('foo-and-bar', Convert::raw2url('foo &amp; bar!'));
- $this->assertEquals('foo-s-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
+ $this->assertEquals('foos-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
}
}
@@ -8,6 +8,7 @@ class FileNameFilterTest extends SapphireTest {
function testFilter() {
$name = 'Brötchen für allë-mit_Unterstrich!.jpg';
$filter = new FileNameFilter();
+ $filter->setTransliterator(false);
$this->assertEquals(
'Brtchen-fr-all-mit-Unterstrich.jpg',
$filter->filter($name)
@@ -27,6 +28,7 @@ function testFilterWithTransliterator() {
function testFilterWithCustomRules() {
$name = 'Brötchen für allë-mit_Unterstrich!.jpg';
$filter = new FileNameFilter();
+ $filter->setTransliterator(false);
$filter->setReplacements(array('/[\s-]/' => '_'));
$this->assertEquals(
'Brötchen__für_allë_mit_Unterstrich!.jpg',
@@ -0,0 +1,35 @@
+<?php
+/**
+ * @package sapphire
+ * @subpackage tests
+ */
+class URLSegmentFilterTest extends SapphireTest {
+
+ /**
+ * With multibyte support disabled, asserts that an ampersand surrounded by
+ * spaces is filtered to "-and-" and that the result is lowercased.
+ */
+ function testReplacesCommonEnglishSymbols() {
+ $f = new URLSegmentFilter();
+ $f->setAllowMultibyte(false);
+ $this->assertEquals(
+ 'john-and-spencer',
+ $f->filter('John & Spencer')
+ );
+ }
+
+ /**
+ * With multibyte support disabled, asserts that the umlaut in "Brötchen"
+ * is filtered to its ASCII form "oe" rather than being dropped.
+ */
+ function testTransliteratesNonAsciiUrls() {
+ $f = new URLSegmentFilter();
+ $f->setAllowMultibyte(false);
+ $this->assertEquals(
+ 'broetchen',
+ $f->filter('Brötchen')
+ );
+ }
+
+ /**
+ * With multibyte support enabled, asserts that the umlaut is kept
+ * and only the case is changed.
+ */
+ function testRetainsNonAsciiUrlsWithAllowMultiByteOption() {
+ $f = new URLSegmentFilter();
+ $f->setAllowMultibyte(true);
+ $this->assertEquals(
+ 'brötchen',
+ $f->filter('Brötchen')
+ );
+ }
+
+}

0 comments on commit 9b27a4c

Please sign in to comment.