Skip to content
This repository
Browse code

Update IRI percent-normalisation code.

Fixes a few tests, see https://gist.github.com/1187856 for before/after

Backport of 4e022de for one-dot-two. Fixes #108, fixes #112, maybe fixes #58.
  • Loading branch information...
commit 76b5fd632f40c4516d68f3f1bdabcd76829117cc 1 parent bd36f83
Ryan McCue authored September 19, 2011

Showing 1 changed file with 250 additions and 42 deletions. Show diff stats Hide diff stats

  1. 292  simplepie.inc
292  simplepie.inc
@@ -11921,14 +11921,135 @@ class SimplePie_IRI
11921 11921
 	/**
11922 11922
 	 * Replace invalid character with percent encoding
11923 11923
 	 *
11924  
-	 * @access private
11925 11924
 	 * @param string $string Input string
11926 11925
 	 * @param string $valid_chars Valid characters
11927 11926
 	 * @param int $case Normalise case
11928 11927
 	 * @return string
11929 11928
 	 */
11930  
-	function replace_invalid_with_pct_encoding($string, $valid_chars, $case = SIMPLEPIE_SAME_CASE)
  11929
+	function replace_invalid_with_pct_encoding($string, $valid_chars, $case = SIMPLEPIE_SAME_CASE, $iprivate = false)
11931 11930
 	{
  11931
+		// Normalize as many pct-encoded sections as possible
  11932
+		$string = preg_replace_callback('/(?:%[A-Fa-f0-9]{2})+/', array(&$this, 'remove_iunreserved_percent_encoded'), $string);
  11933
+
  11934
+		// Replace invalid percent characters
  11935
+		$string = preg_replace('/%(?![A-Fa-f0-9]{2})/', '%25', $string);
  11936
+
  11937
+		// Add unreserved and % to $valid_chars (the latter is safe because all
  11938
+		// pct-encoded sections are now valid).
  11939
+		$valid_chars .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~%';
  11940
+
  11941
+		// Now replace any bytes that aren't allowed with their pct-encoded versions
  11942
+		$position = 0;
  11943
+		$strlen = strlen($string);
  11944
+		while (($position += strspn($string, $valid_chars, $position)) < $strlen)
  11945
+		{
  11946
+			$value = ord($string[$position]);
  11947
+
  11948
+			// Start position
  11949
+			$start = $position;
  11950
+
  11951
+			// By default we are valid
  11952
+			$valid = true;
  11953
+
  11954
+			// A single byte that reaches this point is never valid: the while condition has already skipped every allowed one.
  11955
+			// Two byte sequence:
  11956
+			if (($value & 0xE0) === 0xC0)
  11957
+			{
  11958
+				$character = ($value & 0x1F) << 6;
  11959
+				$length = 2;
  11960
+				$remaining = 1;
  11961
+			}
  11962
+			// Three byte sequence:
  11963
+			elseif (($value & 0xF0) === 0xE0)
  11964
+			{
  11965
+				$character = ($value & 0x0F) << 12;
  11966
+				$length = 3;
  11967
+				$remaining = 2;
  11968
+			}
  11969
+			// Four byte sequence:
  11970
+			elseif (($value & 0xF8) === 0xF0)
  11971
+			{
  11972
+				$character = ($value & 0x07) << 18;
  11973
+				$length = 4;
  11974
+				$remaining = 3;
  11975
+			}
  11976
+			// Invalid byte:
  11977
+			else
  11978
+			{
  11979
+				$valid = false;
  11980
+				$length = 1;
  11981
+				$remaining = 0;
  11982
+			}
  11983
+
  11984
+			if ($remaining)
  11985
+			{
  11986
+				if ($position + $length <= $strlen)
  11987
+				{
  11988
+					for ($position++; $remaining; $position++)
  11989
+					{
  11990
+						$value = ord($string[$position]);
  11991
+
  11992
+						// Check that the byte is valid, then add it to the character:
  11993
+						if (($value & 0xC0) === 0x80)
  11994
+						{
  11995
+							$character |= ($value & 0x3F) << (--$remaining * 6);
  11996
+						}
  11997
+						// If it is invalid, count the sequence as invalid and reprocess the current byte:
  11998
+						else
  11999
+						{
  12000
+							$valid = false;
  12001
+							$position--;
  12002
+							break;
  12003
+						}
  12004
+					}
  12005
+				}
  12006
+				else
  12007
+				{
  12008
+					$position = $strlen - 1;
  12009
+					$valid = false;
  12010
+				}
  12011
+			}
  12012
+
  12013
+			// Percent encode anything invalid or not in ucschar
  12014
+			if (
  12015
+				// Invalid sequences
  12016
+				!$valid
  12017
+				// Non-shortest form sequences are invalid
  12018
+				|| $length > 1 && $character <= 0x7F
  12019
+				|| $length > 2 && $character <= 0x7FF
  12020
+				|| $length > 3 && $character <= 0xFFFF
  12021
+				// Outside of range of ucschar codepoints
  12022
+				// Noncharacters
  12023
+				|| ($character & 0xFFFE) === 0xFFFE
  12024
+				|| $character >= 0xFDD0 && $character <= 0xFDEF
  12025
+				|| (
  12026
+					// Everything else not in ucschar
  12027
+					   $character > 0xD7FF && $character < 0xF900
  12028
+					|| $character < 0xA0
  12029
+					|| $character > 0xEFFFD
  12030
+				)
  12031
+				&& (
  12032
+					// Everything not in iprivate, if it applies
  12033
+					   !$iprivate
  12034
+					|| $character < 0xE000
  12035
+					|| $character > 0x10FFFD
  12036
+				)
  12037
+			)
  12038
+			{
  12039
+				// If we were a character, pretend we weren't, but rather an error.
  12040
+				if ($valid)
  12041
+					$position--;
  12042
+
  12043
+				for ($j = $start; $j <= $position; $j++)
  12044
+				{
  12045
+					$string = substr_replace($string, sprintf('%%%02X', ord($string[$j])), $j, 1);
  12046
+					$j += 2;
  12047
+					$position += 2;
  12048
+					$strlen += 2;
  12049
+				}
  12050
+			}
  12051
+		}
  12052
+
11932 12053
 		// Normalise case
11933 12054
 		if ($case & SIMPLEPIE_LOWERCASE)
11934 12055
 		{
@@ -11939,61 +12060,148 @@ class SimplePie_IRI
11939 12060
 			$string = strtoupper($string);
11940 12061
 		}
11941 12062
 
11942  
-		// Store position and string length (to avoid constantly recalculating this)
11943  
-		$position = 0;
11944  
-		$strlen = strlen($string);
  12063
+		return $string;
  12064
+	}
11945 12065
 
11946  
-		// Loop as long as we have invalid characters, advancing the position to the next invalid character
11947  
-		while (($position += strspn($string, $valid_chars, $position)) < $strlen)
  12066
+	/**
  12067
+	 * Callback function for preg_replace_callback.
  12068
+	 *
  12069
+	 * Removes sequences of percent encoded bytes that represent UTF-8
  12070
+	 * encoded characters in iunreserved
  12071
+	 *
  12072
+	 * @access private
  12073
+	 * @param array $match PCRE match
  12074
+	 * @return string Replacement
  12075
+	 */
  12076
+	function remove_iunreserved_percent_encoded($match)
  12077
+	{
  12078
+		// As we just have valid percent encoded sequences we can just explode
  12079
+		// and ignore the first member of the returned array (an empty string).
  12080
+		$bytes = explode('%', $match[0]);
  12081
+
  12082
+		// Initialize the new string (this is what will be returned) and that
  12083
+		// there are no bytes remaining in the current sequence (unsurprising
  12084
+		// at the first byte!).
  12085
+		$string = '';
  12086
+		$remaining = 0;
  12087
+
  12088
+		// Loop over each and every byte, and set $value to its value
  12089
+		for ($i = 1, $len = count($bytes); $i < $len; $i++)
11948 12090
 		{
11949  
-			// If we have a % character
11950  
-			if ($string[$position] === '%')
  12091
+			$value = hexdec($bytes[$i]);
  12092
+
  12093
+			// If we're the first byte of sequence:
  12094
+			if (!$remaining)
11951 12095
 			{
11952  
-				// If we have a pct-encoded section
11953  
-				if ($position + 2 < $strlen && strspn($string, '0123456789ABCDEFabcdef', $position + 1, 2) === 2)
11954  
-				{
11955  
-					// Get the represented character
11956  
-					$chr = chr(hexdec(substr($string, $position + 1, 2)));
  12096
+				// Start position
  12097
+				$start = $i;
11957 12098
 
11958  
-					// If the character is valid, replace the pct-encoded with the actual character while normalising case
11959  
-					if (strpos($valid_chars, $chr) !== false)
11960  
-					{
11961  
-						if ($case & SIMPLEPIE_LOWERCASE)
11962  
-						{
11963  
-							$chr = strtolower($chr);
11964  
-						}
11965  
-						elseif ($case & SIMPLEPIE_UPPERCASE)
11966  
-						{
11967  
-							$chr = strtoupper($chr);
11968  
-						}
11969  
-						$string = substr_replace($string, $chr, $position, 3);
11970  
-						$strlen -= 2;
11971  
-						$position++;
11972  
-					}
  12099
+				// By default we are valid
  12100
+				$valid = true;
11973 12101
 
11974  
-					// Otherwise just normalise the pct-encoded to uppercase
11975  
-					else
  12102
+				// One byte sequence:
  12103
+				if ($value <= 0x7F)
  12104
+				{
  12105
+					$character = $value;
  12106
+					$length = 1;
  12107
+				}
  12108
+				// Two byte sequence:
  12109
+				elseif (($value & 0xE0) === 0xC0)
  12110
+				{
  12111
+					$character = ($value & 0x1F) << 6;
  12112
+					$length = 2;
  12113
+					$remaining = 1;
  12114
+				}
  12115
+				// Three byte sequence:
  12116
+				elseif (($value & 0xF0) === 0xE0)
  12117
+				{
  12118
+					$character = ($value & 0x0F) << 12;
  12119
+					$length = 3;
  12120
+					$remaining = 2;
  12121
+				}
  12122
+				// Four byte sequence:
  12123
+				elseif (($value & 0xF8) === 0xF0)
  12124
+				{
  12125
+					$character = ($value & 0x07) << 18;
  12126
+					$length = 4;
  12127
+					$remaining = 3;
  12128
+				}
  12129
+				// Invalid byte:
  12130
+				else
  12131
+				{
  12132
+					$valid = false;
  12133
+					$remaining = 0;
  12134
+				}
  12135
+			}
  12136
+			// Continuation byte:
  12137
+			else
  12138
+			{
  12139
+				// Check that the byte is valid, then add it to the character:
  12140
+				if (($value & 0xC0) === 0x80)
  12141
+				{
  12142
+					$remaining--;
  12143
+					$character |= ($value & 0x3F) << ($remaining * 6);
  12144
+				}
  12145
+				// If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
  12146
+				else
  12147
+				{
  12148
+					$valid = false;
  12149
+					$remaining = 0;
  12150
+					$i--;
  12151
+				}
  12152
+			}
  12153
+
  12154
+			// If we've reached the end of the current byte sequence, append it to $string
  12155
+			if (!$remaining)
  12156
+			{
  12157
+				// Percent encode anything invalid or not in iunreserved
  12158
+				if (
  12159
+					// Invalid sequences
  12160
+					!$valid
  12161
+					// Non-shortest form sequences are invalid
  12162
+					|| $length > 1 && $character <= 0x7F
  12163
+					|| $length > 2 && $character <= 0x7FF
  12164
+					|| $length > 3 && $character <= 0xFFFF
  12165
+					// Outside of range of iunreserved codepoints
  12166
+					|| $character < 0x2D
  12167
+					|| $character > 0xEFFFD
  12168
+					// Noncharacters
  12169
+					|| ($character & 0xFFFE) === 0xFFFE
  12170
+					|| $character >= 0xFDD0 && $character <= 0xFDEF
  12171
+					// Everything else not in iunreserved (this is all BMP)
  12172
+					|| $character === 0x2F
  12173
+					|| $character > 0x39 && $character < 0x41
  12174
+					|| $character > 0x5A && $character < 0x61
  12175
+					|| $character > 0x7A && $character < 0x7E
  12176
+					|| $character > 0x7E && $character < 0xA0
  12177
+					|| $character > 0xD7FF && $character < 0xF900
  12178
+				)
  12179
+				{
  12180
+					for ($j = $start; $j <= $i; $j++)
11976 12181
 					{
11977  
-						$string = substr_replace($string, strtoupper(substr($string, $position + 1, 2)), $position + 1, 2);
11978  
-						$position += 3;
  12182
+						$string .= '%' . strtoupper($bytes[$j]);
11979 12183
 					}
11980 12184
 				}
11981  
-				// If we don't have a pct-encoded section, just replace the % with its own escaped form
11982 12185
 				else
11983 12186
 				{
11984  
-					$string = substr_replace($string, '%25', $position, 1);
11985  
-					$strlen += 2;
11986  
-					$position += 3;
  12187
+					for ($j = $start; $j <= $i; $j++)
  12188
+					{
  12189
+						$string .= chr(hexdec($bytes[$j]));
  12190
+					}
11987 12191
 				}
11988 12192
 			}
11989  
-			// If we have an invalid character, change into its pct-encoded form
11990  
-			else
  12193
+		}
  12194
+
  12195
+		// If we have any bytes left over they are invalid (i.e., we are
  12196
+		// mid-way through a multi-byte sequence)
  12197
+		if ($remaining)
  12198
+		{
  12199
+			for ($j = $start; $j < $len; $j++)
11991 12200
 			{
11992  
-				$replacement = sprintf("%%%02X", ord($string[$position]));
11993  
-				$string = str_replace($string[$position], $replacement, $string);
11994  
-				$strlen = strlen($string);
  12201
+				$string .= '%' . strtoupper($bytes[$j]);
11995 12202
 			}
11996 12203
 		}
  12204
+
11997 12205
 		return $string;
11998 12206
 	}
11999 12207
 

0 notes on commit 76b5fd6

Please sign in to comment.
Something went wrong with that request. Please try again.