Merge pull request #286 from projectdiscovery/issue-262-incorrect-url-parsing

Fix incorrect url parsing
tarunKoyalwar committed Dec 20, 2023
2 parents f1c7371 + 2abde16 commit aafc501
Showing 4 changed files with 264 additions and 164 deletions.
24 changes: 24 additions & 0 deletions url/README.md
@@ -1,6 +1,29 @@
# urlutil
The package contains various helpers to interact with URLs


## URL Parsing Methods

Function | Description | Type | Behavior |
-----------------------------------------------------|--------------------------------------------------|-------------------------------|------------------------------------------|
`Parse(inputURL string)` | Standard URL Parsing (+ Some Edgecases) | Both Relative & Absolute URLs | NA |
`ParseURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Both Relative & Absolute URLs | NA |
`ParseRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Relative URLs | error if absolute URL is given |
`ParseRawRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing | Only Relative URLs | error if absolute URL is given |
`ParseAbsoluteURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Absolute URLs | error if relative URL is given |

### Known Edgecases / Changes from `url.URL`

- Query parameters are ordered
- Invalid unicode characters and invalid URL encodings are allowed in unsafe mode
- `u.Path` is always `/` prefixed if not empty (except `ParseRawRelativePath`)
- Invalid values/encodings are allowed in the URL path
- Does not encode characters in query parameters except reserved characters (see: Raw Params)
- Parsing of the URL into parts (scheme, host, path, query, fragment) is mostly accurate; a known limitation is manually added hostnames like `mydomain` (without a `.` in the hostname)


> More details on each edgecase/behavior are given below, followed by a minimal usage sketch.
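
A minimal usage sketch of the parsing methods above; the import path is assumed from the repository layout, and the outputs noted in comments follow the edgecases listed above:

```go
package main

import (
	"fmt"

	urlutil "github.com/projectdiscovery/utils/url"
)

func main() {
	// absolute URL: a relative input would return an error
	abs, err := urlutil.ParseAbsoluteURL("https://scanme.sh/my/path?a=1", false)
	if err != nil {
		panic(err)
	}
	fmt.Println(abs.Host, abs.Path) // scanme.sh /my/path

	// relative path: an absolute input would return an error
	rel, err := urlutil.ParseRelativePath("my/path", false)
	if err != nil {
		panic(err)
	}
	fmt.Println(rel.Path) // `/` prefixed as per the edgecase above

	// unsafe mode tolerates invalid encodings such as %invalid in the path
	raw, err := urlutil.ParseURL("https://scanme.sh/%invalid/path", true)
	if err != nil {
		panic(err)
	}
	fmt.Println(raw.Host, raw.Path)
}
```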
## Difference between `net/url.URL` and `utils/url/URL`

- `url.URL` caters to a wide variety of URLs and for that reason its parsing is not always accurate under various conditions
@@ -54,3 +77,4 @@ scanme.sh/%invalid/path
`utils/url/URL` embeds `url.URL` and thus inherits and exposes all `url.URL` methods and fields.
It is OK to use any method from `url.URL` (directly or indirectly) except `url.URL.Query()` and `url.URL.String()` (due to parameter encoding issues).
If it is not possible to follow the above point (ex: directly updating/referencing `http.Request.URL`), the `.Update()` method should be called before accessing them (as sketched below); it refreshes the embedded `url.URL` instance for this edgecase. (Not required if the above rule is followed.)
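
A minimal sketch of that last rule; the import path is assumed from the repository layout, and `.Update()` is assumed to sync the ordered params back into the embedded `url.URL`:

```go
package main

import (
	"fmt"
	"net/http"

	urlutil "github.com/projectdiscovery/utils/url"
)

func main() {
	u, err := urlutil.ParseURL("https://scanme.sh/some/path?b=2&a=1", false)
	if err != nil {
		panic(err)
	}

	// the embedded *url.URL is about to be referenced directly through
	// http.Request.URL, so refresh it first as described above
	u.Update()

	req, err := http.NewRequest(http.MethodGet, "https://scanme.sh", nil)
	if err != nil {
		panic(err)
	}
	req.URL = u.URL // direct reference to the embedded url.URL

	fmt.Println(req.URL.RawQuery) // reflects the ordered params after Update()
}
```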

223 changes: 223 additions & 0 deletions url/parsers.go
@@ -0,0 +1,223 @@
package urlutil

import (
"net/url"
"strings"

errorutil "github.com/projectdiscovery/utils/errors"
stringsutil "github.com/projectdiscovery/utils/strings"
)

// ## URL Parsing Methods

// Function | Description | Type | Behavior |
// -----------------------------------------------------|--------------------------------------------------|-------------------------------|------------------------------------------|
// `Parse(inputURL string)` | Standard URL Parsing (+ Some Edgecases) | Both Relative & Absolute URLs | NA |
// `ParseURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Both Relative & Absolute URLs | NA |
// `ParseRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Relative URLs | error if absolute URL is given |
// `ParseRawRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing | Only Relative URLs | error if absolute URL is given |
// `ParseAbsoluteURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Absolute URLs | error if relative URL is given |

// Parse parses the given input URL (can be relative or absolute)
func Parse(inputURL string) (*URL, error) {
return ParseURL(inputURL, false)
}

// ParseURL parses and returns the given URL (can be relative or absolute)
func ParseURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return ParseRelativePath(inputURL, unsafe)
}

// logical bug: url is not relative but host is empty
if u.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
}

// # Normalization 1: if the value of u.Host does not look like a common domain
// it is most likely a relative path that was parsed as a host
// this happens because of the ambiguity of url.Parse
// when parsing a url like scanme.sh/my/path, url.Parse() puts `scanme.sh/my/path` in the path and the host is empty
// to avoid this we always parse the url with a scheme prefix if it is missing (ex: https:// is not in the input url) and then
// rule out the possibility that the given url is actually a relative path
// this handles the below edgecase
// u, err := url.Parse(`mypath`)

if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
// TODO: should use a proper regex to validate hostname/ip
// currently domain names without a (.) are not considered valid and are autocorrected
// this does not look like a valid domain, ipv4 or ipv6
// consider it as relative
// use ParseAbsoluteURL to avoid this issue
u.IsRelative = true
u.Path = inputURL
u.Host = ""
}

return u, nil
}

// ParseAbsoluteURL parses and returns an absolute url
// should be preferred over others when the input is known to be an absolute url
// this reduces any normalization and autocorrection related to relative paths
// and returns an error if the input is a relative path
func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path)
}
if u.URL.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL)
}
return u, nil
}

// ParseRelativePath parses and returns a relative path
// should be preferred over others when the input is known to be a relative path
// this reduces any normalization and autocorrection related to absolute paths
// and returns an error if the input is an absolute path
func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
}
return relativePathParser(u)
}

// ParseRawRelativePath parses and returns a relative path without autocorrection (ex: the `/` prefix is not added)
func ParseRawRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
disableAutoCorrect: true,
}
return relativePathParser(u)
}

// absoluteURLParser is common absolute parser logic used to avoid duplication of code
func absoluteURLParser(u *URL) (*URL, error) {
u.fetchParams()
// filter out fragments and parameters, and only then parse the path
// we use u.Original because u.fetchParams() parses fragments and parameters
// from u.Original (this is done to preserve query order in params and other edgecases)
if u.Original == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input")
}

// Note: we consider //scanme.sh as valid (since all browsers accept this <script src="//ajax.googleapis.com/ajax/xx">)
if strings.HasPrefix(u.Original, "/") && !strings.HasPrefix(u.Original, "//") {
// this is definitely a relative path
u.IsRelative = true
u.Path = u.Original
return u, nil
}
// Try to parse host related input
if stringsutil.HasPrefixAny(u.Original, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") {
u.IsRelative = false
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
// on parse errors, if in unsafe mode, try parsing again with the unsafe parser
if u.Unsafe {
urlparse = parseUnsafeFullURL(u.Original)
if urlparse != nil {
parseErr = nil
}
}
if parseErr != nil {
return nil, errorutil.NewWithErr(parseErr).Msgf("failed to parse url")
}
}
copy(u.URL, urlparse)
} else {
// if there is no scheme prefix, try to parse it with an https prefix
// if that fails we consider it a relative path and not a full url
urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + u.Original)
if parseErr != nil {
// most likely a relative url
u.IsRelative = true
// TODO: investigate if prefix / should be added
} else {
urlparse.Scheme = "" // remove newly added scheme
copy(u.URL, urlparse)
}
}
return u, nil
}

// relativePathParser is common relative path parser logic used to avoid duplication of code
func relativePathParser(u *URL) (*URL, error) {
u.fetchParams()
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
if !u.Unsafe {
// should return an error when not in unsafe mode
return nil, errorutil.NewWithErr(parseErr).WithTag("urlutil").Msgf("failed to parse input url")
} else {
// if unsafe do not rely on net/url.Parse
u.Path = u.Original
}
}
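// if net/url parsing succeeded, copy the parsed fields but clear any host since the input is expected to be a relative path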
if urlparse != nil {
urlparse.Host = ""
copy(u.URL, urlparse)
}
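// apply the package's relative path handling (ex: unsafe encodings and `/` prefix autocorrection unless disabled)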
u.parseUnsafeRelativePath()
if u.Host != "" {
return nil, errorutil.NewWithTag("urlutil", "expected relative path but got absolute path with host=%v,input=%v", u.Host, u.Original)
}
return u, nil
}

// parseUnsafeFullURL parses invalid (unsafe) urls (ex: https://scanme.sh/%invalid)
// such urls are not supported as per the RFC and url.Parse fails on them
func parseUnsafeFullURL(urlx string) *url.URL {
// we only allow unsupported chars in path
// since url.Parse() returns an error, there isn't any standard way to do this
// Current methodology
// 1. temporarily remove the first `//` (scheme separator) so it does not collide with the path separator lookup
// 2. get the first index of `/` i.e. the path separator (if none, skip any further preprocessing)
// 3. if found, split the url into base and path (i.e. https://scanme.sh/%invalid => `https://scanme.sh` + `/%invalid`)
// 4. the host part is parsed by net/url.URL and the path is parsed manually
temp := strings.Replace(urlx, "//", "", 1)
index := strings.IndexRune(temp, '/')
if index == -1 {
return nil
}
urlPath := temp[index:]
urlHost := strings.TrimSuffix(urlx, urlPath)
parseURL, parseErr := url.Parse(urlHost)
if parseErr != nil {
return nil
}
if relpath, err := ParseRelativePath(urlPath, true); err == nil {
parseURL.Path = relpath.Path
return parseURL
}
return nil
}
