Skip to content

Commit

Permalink
Merge pull request #13 from fruux/fallback-uri-parser
Browse files Browse the repository at this point in the history
Fallback URI parser.
  • Loading branch information
evert committed Dec 7, 2016
2 parents 55301cf + a70d23c commit 4c3dd28
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 1 deletion.
17 changes: 17 additions & 0 deletions lib/InvalidUriException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php declare (strict_types=1);

namespace Sabre\Uri;

/**
 * Invalid URI exception.
 *
 * Thrown when Sabre\Uri was asked to parse a URI and could not do so.
 *
 * @copyright Copyright (C) fruux GmbH (https://fruux.com/)
 * @author Evert Pot (https://evertpot.com/)
 * @license http://sabre.io/license/
 */
class InvalidUriException extends \Exception
{
}
93 changes: 92 additions & 1 deletion lib/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,13 @@ function($matches) {
$uri
);

$result = parse_url($uri);
if (!$result) {
$result = _parse_fallback($uri);
}

return
parse_url($uri) + [
$result + [
'scheme' => null,
'host' => null,
'path' => null,
Expand Down Expand Up @@ -280,3 +285,89 @@ function split(string $path) : array {
return [null,null];

}

/**
 * This function is another implementation of parse_url, except this one is
 * fully written in PHP.
 *
 * The reason is that the PHP bug team is not willing to admit that there are
 * bugs in the parse_url implementation.
 *
 * This function is only called if the main parse method fails. It's pretty
 * crude and probably slow, so the original parse_url is usually preferred.
 *
 * The returned array always contains the keys scheme, host, port, user,
 * path, fragment and query (null when the component is absent). A 'pass'
 * key is only added when the authority part carries a password.
 *
 * @param string $uri
 * @return array
 * @throws InvalidUriException If the input is not valid UTF-8, or if it has
 *                             an authority part that cannot be parsed.
 */
function _parse_fallback(string $uri) : array {

    // Normally a URI must be ASCII. However, often it's not and parse_url
    // might corrupt these strings.
    //
    // For that reason we take any non-ascii characters from the uri and
    // uriencode them first.
    $uri = preg_replace_callback(
        '/[^[:ascii:]]/u',
        function($matches) {
            return rawurlencode($matches[0]);
        },
        $uri
    );

    // With the /u modifier PCRE returns null for malformed UTF-8. Report
    // that as an invalid URI instead of passing null on to the functions
    // below.
    if ($uri === null) {
        throw new InvalidUriException('Invalid, or could not parse URI');
    }

    $result = [
        'scheme'   => null,
        'host'     => null,
        'port'     => null,
        'user'     => null,
        'path'     => null,
        'fragment' => null,
        'query'    => null,
    ];

    // A letter followed by one or more letters, digits, "+", "-" or ".".
    // The dash is placed last in the class so it is a literal and not a
    // range (a "+-." range would also accept a comma).
    if (preg_match('% ^([A-Za-z][A-Za-z0-9+.-]+): %x', $uri, $matches)) {

        $result['scheme'] = $matches[1];
        // Take what's left.
        $uri = substr($uri, strlen($result['scheme']) + 1);

    }

    // Taking off a fragment part. The comparison against false matters:
    // strpos returns 0 when the uri starts with the delimiter.
    if (false !== strpos($uri, '#')) {
        list($uri, $result['fragment']) = explode('#', $uri, 2);
    }
    // Taking off the query part
    if (false !== strpos($uri, '?')) {
        list($uri, $result['query']) = explode('?', $uri, 2);
    }

    if (substr($uri, 0, 3) === '///') {
        // The triple slash uris are a bit unusual, but we have special handling
        // for them.
        $result['path'] = substr($uri, 2);
        $result['host'] = '';
    } elseif (substr($uri, 0, 2) === '//') {
        // Uris that have an authority part.
        //
        // User and password may not contain a slash, so that an "@" or ":"
        // further down the path is never mistaken for userinfo. The
        // password is optional.
        $regex = '%^
            //
            (?: (?<user> [^:@/]+) (?: : (?<pass> [^@/]+))? @)?
            (?<host> (?: \[ [^\]]+ \] | [^:/]* ))
            (?: : (?<port> [0-9]+))?
            (?<path> / .*)?
        $%x';
        if (!preg_match($regex, $uri, $matches)) {
            throw new InvalidUriException('Invalid, or could not parse URI');
        }

        // Groups that did not take part in the match are reported as an
        // empty string, or are missing altogether when no later group
        // matched. Both cases are treated as "absent" here.
        $host = $matches['host'] ?? '';
        $port = $matches['port'] ?? '';
        $user = $matches['user'] ?? '';
        $pass = $matches['pass'] ?? '';

        if ($host !== '') $result['host'] = $host;
        if ($port !== '') $result['port'] = (int)$port;
        if (isset($matches['path'])) $result['path'] = $matches['path'];
        if ($user !== '') $result['user'] = $user;
        if ($pass !== '') $result['pass'] = $pass;
    } else {
        $result['path'] = $uri;
    }

    return $result;
}
113 changes: 113 additions & 0 deletions tests/ParseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,29 @@ function testParse($in, $out) {

}

/**
 * Runs every data set from parseData directly through the pure-PHP
 * fallback parser, and checks the result after filling in null for any
 * missing component.
 *
 * @dataProvider parseData
 */
function testParseFallback($in, $out) {

    // Components that are reported as null when the parser omits them.
    $defaults = [
        'scheme'   => null,
        'host'     => null,
        'path'     => null,
        'port'     => null,
        'user'     => null,
        'query'    => null,
        'fragment' => null,
    ];

    // Array union: keys already returned by the parser win.
    $actual = _parse_fallback($in) + $defaults;

    $this->assertEquals($out, $actual);

}

function parseData() {

return [
Expand Down Expand Up @@ -58,6 +81,96 @@ function parseData() {
'fragment' => null,
]
],
// See issue #9, parse_url doesn't like colons followed by numbers even
// though they are allowed since RFC 3986
[
'http://example.org/hello:12?foo=bar#test',
[
'scheme' => 'http',
'host' => 'example.org',
'path' => '/hello:12',
'port' => null,
'user' => null,
'query' => 'foo=bar',
'fragment' => 'test'
]
],
[
'/path/to/colon:34',
[
'scheme' => null,
'host' => null,
'path' => '/path/to/colon:34',
'port' => null,
'user' => null,
'query' => null,
'fragment' => null,
]
],
// File scheme
[
'file:///foo/bar',
[
'scheme' => 'file',
'host' => '',
'path' => '/foo/bar',
'port' => null,
'user' => null,
'query' => null,
'fragment' => null,
]
],
// Weird scheme with triple-slash. See Issue #11.
[
'vfs:///somefile',
[
'scheme' => 'vfs',
'host' => '',
'path' => '/somefile',
'port' => null,
'user' => null,
'query' => null,
'fragment' => null,
]
],
// Examples from RFC3986
[
'ldap://[2001:db8::7]/c=GB?objectClass?one',
[
'scheme' => 'ldap',
'host' => '[2001:db8::7]',
'path' => '/c=GB',
'port' => null,
'user' => null,
'query' => 'objectClass?one',
'fragment' => null,
]
],
[
'news:comp.infosystems.www.servers.unix',
[
'scheme' => 'news',
'host' => null,
'path' => 'comp.infosystems.www.servers.unix',
'port' => null,
'user' => null,
'query' => null,
'fragment' => null,
]
],
// Port
[
'http://example.org:8080/',
[
'scheme' => 'http',
'host' => 'example.org',
'path' => '/',
'port' => 8080,
'user' => null,
'query' => null,
'fragment' => null,
]
],

];

Expand Down

0 comments on commit 4c3dd28

Please sign in to comment.