-
Notifications
You must be signed in to change notification settings - Fork 821
/
URLSegmentFilter.php
136 lines (117 loc) · 3.72 KB
/
URLSegmentFilter.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
<?php
/**
* @package framework
* @subpackage model
*/
/**
* Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
* Uses {@link Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
* Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
*
* Caution: Should not be used on full URIs with domains or query parameters.
* In order to retain forward slashes in a path, each individual segment needs to be filtered individually.
*
* See {@link FileNameFilter} for similar implementation for filesystem-based URLs.
*/
class URLSegmentFilter extends Object {

	/**
	 * Whether {@link getTransliterator()} lazily creates a {@link Transliterator}
	 * when none has been set on the instance.
	 *
	 * @var Boolean
	 */
	public static $default_use_transliterator = true;

	/**
	 * Default map of regular expression => replacement, applied in declaration order
	 * by {@link filter()} when no instance-specific rules are set.
	 * The order matters: later rules clean up after earlier ones
	 * (e.g. dashes produced by the whitespace rule are collapsed afterwards).
	 *
	 * @var Array See {@link setReplacements()}.
	 */
	public static $default_replacements = array(
		'/&amp;/u' => '-and-', // HTML-encoded ampersands (must run before the plain ampersand rule)
		'/&/u' => '-and-', // plain ampersands
		'/\s/u' => '-', // whitespace to dashes
		'/_/u' => '-', // underscores to dashes
		'/[^A-Za-z0-9+.-]+/u' => '', // remove non-ASCII chars, only allow alphanumeric plus "+", dash and dot
		'/[\-]{2,}/u' => '-', // remove duplicate dashes
		'/^[\.\-_]+/u' => '', // remove all leading dots, dashes or underscores
	);

	/**
	 * Doesn't try to replace or transliterate non-ASCII characters.
	 * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
	 * as well as better search engine optimization for URLs.
	 * @see http://www.ietf.org/rfc/rfc3987
	 *
	 * @var boolean
	 */
	public static $default_allow_multibyte = false;

	/**
	 * Instance-specific replacement rules. While empty, {@link getReplacements()}
	 * falls back to {@link $default_replacements}.
	 *
	 * @var Array See {@link setReplacements()}
	 */
	public $replacements = array();

	/**
	 * Explicitly assigned transliterator. Stays NULL until one is set or lazily created;
	 * FALSE (set through {@link setTransliterator()}) disables transliteration for this instance.
	 *
	 * @var Transliterator|FALSE|NULL
	 */
	protected $transliterator;

	/**
	 * Instance-specific multibyte flag. While NULL, {@link getAllowMultibyte()}
	 * falls back to {@link $default_allow_multibyte}.
	 *
	 * @var boolean|NULL
	 */
	protected $allowMultibyte;

	/**
	 * Converts a string into a URL segment by lowercasing it and applying the
	 * replacement rules from {@link getReplacements()} in order.
	 *
	 * Without multibyte support, the string is first transliterated to ASCII
	 * (if a transliterator is available). With multibyte support, the
	 * "remove non-ASCII chars" rule is skipped and the result is percent-encoded instead.
	 *
	 * Note: Depending on the applied replacement rules, this method might result in an empty string.
	 * An empty string is also returned if a replacement fails (e.g. on input that is not valid UTF-8).
	 *
	 * @param String URL path (without domain or query parameters), in utf8 encoding
	 * @return String A filtered path compatible with RFC 3986
	 */
	public function filter($name) {
		$allowMultibyte = $this->getAllowMultibyte();
		$replacements = $this->getReplacements();

		if($allowMultibyte) {
			// Keep non-ASCII characters: drop the automated removal rule, and don't transliterate
			unset($replacements['/[^A-Za-z0-9+.-]+/u']);
		} else {
			// Only transliterate when no multibyte support is requested
			$transliterator = $this->getTransliterator();
			if($transliterator) $name = $transliterator->toASCII($name);
		}

		// Input is documented as UTF-8, so don't rely on the configured internal encoding
		$name = mb_strtolower($name, 'UTF-8');

		foreach($replacements as $regex => $replace) {
			$name = preg_replace($regex, $replace, $name);
			// preg_replace() signals a PCRE error with NULL; don't feed that into further rules
			if($name === null) return '';
		}

		// Multibyte URLs require percent encoding to comply to RFC 3986.
		// Without this setting, the "remove non-ASCII chars" regex takes care of that.
		return $allowMultibyte ? rawurlencode($name) : $name;
	}

	/**
	 * Sets instance-specific replacement rules.
	 * Passing an empty array restores the use of {@link $default_replacements}.
	 *
	 * @param Array Map of find/replace used for preg_replace().
	 */
	public function setReplacements($r) {
		$this->replacements = $r;
	}

	/**
	 * Returns the instance-specific rules if any are set, otherwise the defaults.
	 *
	 * @return Array
	 */
	public function getReplacements() {
		return ($this->replacements) ? $this->replacements : self::$default_replacements;
	}

	/**
	 * Returns the transliterator for this instance, creating one on first use
	 * when {@link $default_use_transliterator} is enabled and nothing has been set.
	 *
	 * @return Transliterator|FALSE|NULL A falsy value means no transliteration is applied
	 */
	public function getTransliterator() {
		if($this->transliterator === null && self::$default_use_transliterator) {
			$this->transliterator = Transliterator::create();
		}
		return $this->transliterator;
	}

	/**
	 * @param Transliterator|FALSE Pass FALSE to disable transliteration for this instance
	 */
	public function setTransliterator($t) {
		$this->transliterator = $t;
	}

	/**
	 * @param boolean
	 */
	public function setAllowMultibyte($bool) {
		$this->allowMultibyte = $bool;
	}

	/**
	 * Returns the instance-specific flag if one has been set,
	 * otherwise {@link $default_allow_multibyte}.
	 *
	 * @return boolean
	 */
	public function getAllowMultibyte() {
		return ($this->allowMultibyte !== null) ? $this->allowMultibyte : self::$default_allow_multibyte;
	}
}