Skip to content
This repository
Browse code

Update IRI percent-normalisation code.

Fixes a few tests, see https://gist.github.com/1187856 for before/after
  • Loading branch information...
commit 4e022de1271dda53c3c5bcec8537554b0eb5240b 1 parent 039a8c3
Ryan McCue authored September 02, 2011

Showing 1 changed file with 243 additions and 36 deletions. Show diff stats Hide diff stats

  1. 279  SimplePie/IRI.php
279  SimplePie/IRI.php
@@ -43,7 +43,6 @@
43 43
  * @todo phpDoc comments
44 44
  */
45 45
 
46  
-
47 46
 /**
48 47
  * IRI parser/serialiser
49 48
  *
@@ -309,13 +308,13 @@ public function remove_dot_segments($input)
309 308
 	/**
310 309
 	 * Replace invalid character with percent encoding
311 310
 	 *
312  
-	 * @access private
313 311
 	 * @param string $string Input string
314  
-	 * @param string $valid_chars Valid characters
  312
+	 * @param string $valid_chars Valid characters not in iunreserved or iprivate (this is ASCII-only)
315 313
 	 * @param int $case Normalise case
  314
+	 * @param bool $iprivate Allow iprivate
316 315
 	 * @return string
317 316
 	 */
318  
-	public function replace_invalid_with_pct_encoding($string, $valid_chars, $case = SIMPLEPIE_SAME_CASE)
  317
+	protected function replace_invalid_with_pct_encoding($string, $valid_chars, $case = SIMPLEPIE_SAME_CASE, $iprivate = false)
319 318
 	{
320 319
 		// Normalise case
321 320
 		if ($case & SIMPLEPIE_LOWERCASE)
@@ -327,61 +326,269 @@ public function replace_invalid_with_pct_encoding($string, $valid_chars, $case =
327 326
 			$string = strtoupper($string);
328 327
 		}
329 328
 
330  
-		// Store position and string length (to avoid constantly recalculating this)
  329
+		// Normalize as many pct-encoded sections as possible
  330
+		$string = preg_replace_callback('/(?:%[A-Fa-f0-9]{2})+/', array(&$this, 'remove_iunreserved_percent_encoded'), $string);
  331
+
  332
+		// Replace invalid percent characters
  333
+		$string = preg_replace('/%(?![A-Fa-f0-9]{2})/', '%25', $string);
  334
+
  335
+		// Add unreserved and % to $valid_chars (the latter is safe because all
  336
+		// pct-encoded sections are now valid).
  337
+		$valid_chars .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~%';
  338
+
  339
+		// Now replace any bytes that aren't allowed with their pct-encoded versions
331 340
 		$position = 0;
332 341
 		$strlen = strlen($string);
333  
-
334  
-		// Loop as long as we have invalid characters, advancing the position to the next invalid character
335 342
 		while (($position += strspn($string, $valid_chars, $position)) < $strlen)
336 343
 		{
337  
-			// If we have a % character
338  
-			if ($string[$position] === '%')
  344
+			$value = ord($string[$position]);
  345
+
  346
+			// Start position
  347
+			$start = $position;
  348
+
  349
+			// By default we are valid
  350
+			$valid = true;
  351
+
  352
+			// A one-byte sequence is never valid here: the while condition has already skipped every allowed single byte.
  353
+			// Two byte sequence:
  354
+			if (($value & 0xE0) === 0xC0)
339 355
 			{
340  
-				// If we have a pct-encoded section
341  
-				if ($position + 2 < $strlen && strspn($string, '0123456789ABCDEFabcdef', $position + 1, 2) === 2)
342  
-				{
343  
-					// Get the represented character
344  
-					$chr = chr(hexdec(substr($string, $position + 1, 2)));
  356
+				$character = ($value & 0x1F) << 6;
  357
+				$length = 2;
  358
+				$remaining = 1;
  359
+			}
  360
+			// Three byte sequence:
  361
+			elseif (($value & 0xF0) === 0xE0)
  362
+			{
  363
+				$character = ($value & 0x0F) << 12;
  364
+				$length = 3;
  365
+				$remaining = 2;
  366
+			}
  367
+			// Four byte sequence:
  368
+			elseif (($value & 0xF8) === 0xF0)
  369
+			{
  370
+				$character = ($value & 0x07) << 18;
  371
+				$length = 4;
  372
+				$remaining = 3;
  373
+			}
  374
+			// Invalid byte:
  375
+			else
  376
+			{
  377
+				$valid = false;
  378
+				$length = 1;
  379
+				$remaining = 0;
  380
+			}
345 381
 
346  
-					// If the character is valid, replace the pct-encoded with the actual character while normalising case
347  
-					if (strpos($valid_chars, $chr) !== false)
  382
+			if ($remaining)
  383
+			{
  384
+				if ($position + $length <= $strlen)
  385
+				{
  386
+					for ($position++; $remaining; $position++)
348 387
 					{
349  
-						if ($case & SIMPLEPIE_LOWERCASE)
  388
+						$value = ord($string[$position]);
  389
+
  390
+						// Check that the byte is valid, then add it to the character:
  391
+						if (($value & 0xC0) === 0x80)
350 392
 						{
351  
-							$chr = strtolower($chr);
  393
+							$character |= ($value & 0x3F) << (--$remaining * 6);
352 394
 						}
353  
-						elseif ($case & SIMPLEPIE_UPPERCASE)
  395
+						// If it is invalid, count the sequence as invalid and reprocess the current byte:
  396
+						else
354 397
 						{
355  
-							$chr = strtoupper($chr);
  398
+							$valid = false;
  399
+							$position--;
  400
+							break;
356 401
 						}
357  
-						$string = substr_replace($string, $chr, $position, 3);
358  
-						$strlen -= 2;
359  
-						$position++;
360 402
 					}
  403
+				}
  404
+				else
  405
+				{
  406
+					$position = $strlen - 1;
  407
+					$valid = false;
  408
+				}
  409
+			}
361 410
 
362  
-					// Otherwise just normalise the pct-encoded to uppercase
363  
-					else
  411
+			// Percent encode anything invalid or not in ucschar
  412
+			if (
  413
+				// Invalid sequences
  414
+				!$valid
  415
+				// Non-shortest form sequences are invalid
  416
+				|| $length > 1 && $character <= 0x7F
  417
+				|| $length > 2 && $character <= 0x7FF
  418
+				|| $length > 3 && $character <= 0xFFFF
  419
+				// Outside of range of ucschar codepoints
  420
+				// Noncharacters
  421
+				|| ($character & 0xFFFE) === 0xFFFE
  422
+				|| $character >= 0xFDD0 && $character <= 0xFDEF
  423
+				|| (
  424
+					// Everything else not in ucschar
  425
+					   $character > 0xD7FF && $character < 0xF900
  426
+					|| $character < 0xA0
  427
+					|| $character > 0xEFFFD
  428
+				)
  429
+				&& (
  430
+					// Everything not in iprivate, if it applies
  431
+					   !$iprivate
  432
+					|| $character < 0xE000
  433
+					|| $character > 0x10FFFD
  434
+				)
  435
+			)
  436
+			{
  437
+				// If the sequence decoded to a complete character, step back onto its last byte so the whole sequence is percent-encoded as if it were an error.
  438
+				if ($valid)
  439
+					$position--;
  440
+
  441
+				for ($j = $start; $j <= $position; $j++)
  442
+				{
  443
+					$string = substr_replace($string, sprintf('%%%02X', ord($string[$j])), $j, 1);
  444
+					$j += 2;
  445
+					$position += 2;
  446
+					$strlen += 2;
  447
+				}
  448
+			}
  449
+		}
  450
+
  451
+		return $string;
  452
+	}
  453
+
  454
+	/**
  455
+	 * Callback function for preg_replace_callback.
  456
+	 *
  457
+	 * Removes sequences of percent encoded bytes that represent UTF-8
  458
+	 * encoded characters in iunreserved
  459
+	 *
  460
+	 * @param array $match PCRE match
  461
+	 * @return string Replacement
  462
+	 */
  463
+	protected function remove_iunreserved_percent_encoded($match)
  464
+	{
  465
+		// As we just have valid percent encoded sequences we can just explode
  466
+		// and ignore the first member of the returned array (an empty string).
  467
+		$bytes = explode('%', $match[0]);
  468
+
  469
+		// Initialize the new string (this is what will be returned) and that
  470
+		// there are no bytes remaining in the current sequence (unsurprising
  471
+		// at the first byte!).
  472
+		$string = '';
  473
+		$remaining = 0;
  474
+
  475
+		// Loop over each and every byte, and set $value to its value
  476
+		for ($i = 1, $len = count($bytes); $i < $len; $i++)
  477
+		{
  478
+			$value = hexdec($bytes[$i]);
  479
+
  480
+			// If we're the first byte of sequence:
  481
+			if (!$remaining)
  482
+			{
  483
+				// Start position
  484
+				$start = $i;
  485
+
  486
+				// By default we are valid
  487
+				$valid = true;
  488
+
  489
+				// One byte sequence:
  490
+				if ($value <= 0x7F)
  491
+				{
  492
+					$character = $value;
  493
+					$length = 1;
  494
+				}
  495
+				// Two byte sequence:
  496
+				elseif (($value & 0xE0) === 0xC0)
  497
+				{
  498
+					$character = ($value & 0x1F) << 6;
  499
+					$length = 2;
  500
+					$remaining = 1;
  501
+				}
  502
+				// Three byte sequence:
  503
+				elseif (($value & 0xF0) === 0xE0)
  504
+				{
  505
+					$character = ($value & 0x0F) << 12;
  506
+					$length = 3;
  507
+					$remaining = 2;
  508
+				}
  509
+				// Four byte sequence:
  510
+				elseif (($value & 0xF8) === 0xF0)
  511
+				{
  512
+					$character = ($value & 0x07) << 18;
  513
+					$length = 4;
  514
+					$remaining = 3;
  515
+				}
  516
+				// Invalid byte:
  517
+				else
  518
+				{
  519
+					$valid = false;
  520
+					$remaining = 0;
  521
+				}
  522
+			}
  523
+			// Continuation byte:
  524
+			else
  525
+			{
  526
+				// Check that the byte is valid, then add it to the character:
  527
+				if (($value & 0xC0) === 0x80)
  528
+				{
  529
+					$remaining--;
  530
+					$character |= ($value & 0x3F) << ($remaining * 6);
  531
+				}
  532
+				// If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
  533
+				else
  534
+				{
  535
+					$valid = false;
  536
+					$remaining = 0;
  537
+					$i--;
  538
+				}
  539
+			}
  540
+
  541
+			// If we've reached the end of the current byte sequence, append it to $string
  542
+			if (!$remaining)
  543
+			{
  544
+				// Percent encode anything invalid or not in iunreserved
  545
+				if (
  546
+					// Invalid sequences
  547
+					!$valid
  548
+					// Non-shortest form sequences are invalid
  549
+					|| $length > 1 && $character <= 0x7F
  550
+					|| $length > 2 && $character <= 0x7FF
  551
+					|| $length > 3 && $character <= 0xFFFF
  552
+					// Outside of range of iunreserved codepoints
  553
+					|| $character < 0x2D
  554
+					|| $character > 0xEFFFD
  555
+					// Noncharacters
  556
+					|| ($character & 0xFFFE) === 0xFFFE
  557
+					|| $character >= 0xFDD0 && $character <= 0xFDEF
  558
+					// Everything else not in iunreserved (this is all BMP)
  559
+					|| $character === 0x2F
  560
+					|| $character > 0x39 && $character < 0x41
  561
+					|| $character > 0x5A && $character < 0x61
  562
+					|| $character > 0x7A && $character < 0x7E
  563
+					|| $character > 0x7E && $character < 0xA0
  564
+					|| $character > 0xD7FF && $character < 0xF900
  565
+				)
  566
+				{
  567
+					for ($j = $start; $j <= $i; $j++)
364 568
 					{
365  
-						$string = substr_replace($string, strtoupper(substr($string, $position + 1, 2)), $position + 1, 2);
366  
-						$position += 3;
  569
+						$string .= '%' . strtoupper($bytes[$j]);
367 570
 					}
368 571
 				}
369  
-				// If we don't have a pct-encoded section, just replace the % with its own escaped form
370 572
 				else
371 573
 				{
372  
-					$string = substr_replace($string, '%25', $position, 1);
373  
-					$strlen += 2;
374  
-					$position += 3;
  574
+					for ($j = $start; $j <= $i; $j++)
  575
+					{
  576
+						$string .= chr(hexdec($bytes[$j]));
  577
+					}
375 578
 				}
376 579
 			}
377  
-			// If we have an invalid character, change into its pct-encoded form
378  
-			else
  580
+		}
  581
+
  582
+		// If we have any bytes left over they are invalid (i.e., we are
  583
+		// mid-way through a multi-byte sequence)
  584
+		if ($remaining)
  585
+		{
  586
+			for ($j = $start; $j < $len; $j++)
379 587
 			{
380  
-				$replacement = sprintf("%%%02X", ord($string[$position]));
381  
-				$string = str_replace($string[$position], $replacement, $string);
382  
-				$strlen = strlen($string);
  588
+				$string .= '%' . strtoupper($bytes[$j]);
383 589
 			}
384 590
 		}
  591
+
385 592
 		return $string;
386 593
 	}
387 594
 

0 notes on commit 4e022de

Please sign in to comment.
Something went wrong with that request. Please try again.