Merge pull request #286 from projectdiscovery/issue-262-incorrect-url-parsing

Fix incorrect url parsing
tarunKoyalwar committed Dec 20, 2023
2 parents f1c7371 + 2abde16 commit aafc501
Showing 4 changed files with 264 additions and 164 deletions.
24 changes: 24 additions & 0 deletions url/README.md
@@ -1,6 +1,29 @@
# urlutil
The package contains various helpers to interact with URLs


## URL Parsing Methods

Function | Description | Type | Behavior |
-----------------------------------------------------|--------------------------------------------------|-------------------------------|------------------------------------------|
`Parse(inputURL string)` | Standard URL Parsing (+ Some Edgecases) | Both Relative & Absolute URLs | NA |
`ParseURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Both Relative & Absolute URLs | NA |
`ParseRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Relative URLs | error if absolute URL is given |
`ParseRawRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing | Only Relative URLs | error if absolute URL is given |
`ParseAbsoluteURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Absolute URLs | error if relative URL is given |

### Known Edgecases / Changes from `url.URL`

- Query parameters are ordered
- Invalid unicode characters and invalid URL encodings are allowed in unsafe mode
- `u.Path` is always `/` prefixed if not empty (except `ParseRawRelativePath`)
- Invalid values/encodings are allowed in the URL path
- Does not encode characters in query parameters except reserved characters (see: Raw Params)
- Parsing of the URL into parts (scheme, host, path, query, fragment) is mostly accurate; a known limitation is manually added hostnames like `mydomain` (without a `.` in the hostname)


> More details on each edgecase/behavior are given below, followed by a minimal usage sketch.
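
A minimal usage sketch of the parsing methods above; the import path is assumed from the repository layout, and the outputs noted in comments follow the edgecases listed above:

```go
package main

import (
	"fmt"

	urlutil "github.com/projectdiscovery/utils/url"
)

func main() {
	// absolute URL: a relative input would return an error
	abs, err := urlutil.ParseAbsoluteURL("https://scanme.sh/my/path?a=1", false)
	if err != nil {
		panic(err)
	}
	fmt.Println(abs.Host, abs.Path) // scanme.sh /my/path

	// relative path: an absolute input would return an error
	rel, err := urlutil.ParseRelativePath("my/path", false)
	if err != nil {
		panic(err)
	}
	fmt.Println(rel.Path) // `/` prefixed as per the edgecase above

	// unsafe mode tolerates invalid encodings such as %invalid in the path
	raw, err := urlutil.ParseURL("https://scanme.sh/%invalid/path", true)
	if err != nil {
		panic(err)
	}
	fmt.Println(raw.Host, raw.Path)
}
```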
## Difference between `net/url.URL` and `utils/url/URL`

- `url.URL` caters to a wide variety of URLs and for that reason its parsing is not always accurate under various conditions
@@ -54,3 +77,4 @@ scanme.sh/%invalid/path
`utils/url/URL` embeds `url.URL` and thus inherits and exposes all `url.URL` methods and fields.
It is OK to use any method from `url.URL` (directly or indirectly) except `url.URL.Query()` and `url.URL.String()` (due to parameter encoding issues).
If it is not possible to follow the above point (ex: directly updating/referencing `http.Request.URL`), the `.Update()` method should be called before accessing them (as sketched below); it refreshes the embedded `url.URL` instance for this edgecase. (Not required if the above rule is followed.)
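
A minimal sketch of that last rule; the import path is assumed from the repository layout, and `.Update()` is assumed to sync the ordered params back into the embedded `url.URL`:

```go
package main

import (
	"fmt"
	"net/http"

	urlutil "github.com/projectdiscovery/utils/url"
)

func main() {
	u, err := urlutil.ParseURL("https://scanme.sh/some/path?b=2&a=1", false)
	if err != nil {
		panic(err)
	}

	// the embedded *url.URL is about to be referenced directly through
	// http.Request.URL, so refresh it first as described above
	u.Update()

	req, err := http.NewRequest(http.MethodGet, "https://scanme.sh", nil)
	if err != nil {
		panic(err)
	}
	req.URL = u.URL // direct reference to the embedded url.URL

	fmt.Println(req.URL.RawQuery) // reflects the ordered params after Update()
}
```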

223 changes: 223 additions & 0 deletions url/parsers.go
@@ -0,0 +1,223 @@
package urlutil

import (
"net/url"
"strings"

errorutil "github.com/projectdiscovery/utils/errors"
stringsutil "github.com/projectdiscovery/utils/strings"
)

// ## URL Parsing Methods

// Function | Description | Type | Behavior |
// -----------------------------------------------------|--------------------------------------------------|-------------------------------|------------------------------------------|
// `Parse(inputURL string)` | Standard URL Parsing (+ Some Edgecases) | Both Relative & Absolute URLs | NA |
// `ParseURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Both Relative & Absolute URLs | NA |
// `ParseRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Relative URLs | error if absolute URL is given |
// `ParseRawRelativePath(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing | Only Relative URLs | error if absolute URL is given |
// `ParseAbsoluteURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Absolute URLs | error if relative URL is given |

// Parse parses the given input URL (can be relative or absolute)
func Parse(inputURL string) (*URL, error) {
return ParseURL(inputURL, false)
}

// ParseURL parses and returns the given URL (can be relative or absolute)
func ParseURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return ParseRelativePath(inputURL, unsafe)
}

// logical bug: url is not relative but host is empty
if u.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
}

// # Normalization 1: if the value of u.Host does not look like a common domain
// it is most likely a relative path that was parsed as a host
// this happens because of the ambiguity of url.Parse
// when parsing a url like scanme.sh/my/path, url.Parse() puts `scanme.sh/my/path` in the path and the host is empty
// to avoid this we always parse the url with a scheme prefix if it is missing (ex: https:// is not in the input url) and then
// rule out the possibility that the given url is actually a relative path
// this handles the below edgecase
// u, err := url.Parse(`mypath`)

if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
// TODO: should use a proper regex to validate hostname/ip
// currently domain names without a (.) are not considered valid and are autocorrected
// this does not look like a valid domain, ipv4 or ipv6
// consider it as relative
// use ParseAbsoluteURL to avoid this issue
u.IsRelative = true
u.Path = inputURL
u.Host = ""
}

return u, nil
}

// ParseAbsoluteURL parses and returns an absolute url
// should be preferred over others when the input is known to be an absolute url
// this reduces any normalization and autocorrection related to relative paths
// and returns an error if the input is a relative path
func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
Params: NewOrderedParams(),
}
var err error
u, err = absoluteURLParser(u)
if err != nil {
return nil, err
}
if u.IsRelative {
return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path)
}
if u.URL.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL)
}
return u, nil
}

// ParseRelativePath parses and returns a relative path
// should be preferred over others when the input is known to be a relative path
// this reduces any normalization and autocorrection related to absolute paths
// and returns an error if the input is an absolute path
func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
}
return relativePathParser(u)
}

// ParseRawRelativePath parses and returns a relative path without autocorrection (ex: the `/` prefix is not added)
func ParseRawRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
disableAutoCorrect: true,
}
return relativePathParser(u)
}

// absoluteURLParser is common absolute parser logic used to avoid duplication of code
func absoluteURLParser(u *URL) (*URL, error) {
u.fetchParams()
// filter out fragments and parameters, and only then parse the path
// we use u.Original because u.fetchParams() parses fragments and parameters
// from u.Original (this is done to preserve query order in params and other edgecases)
if u.Original == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input")
}

// Note: we consider //scanme.sh as valid (since all browsers accept this <script src="//ajax.googleapis.com/ajax/xx">)
if strings.HasPrefix(u.Original, "/") && !strings.HasPrefix(u.Original, "//") {
// this is definitely a relative path
u.IsRelative = true
u.Path = u.Original
return u, nil
}
// Try to parse host related input
if stringsutil.HasPrefixAny(u.Original, HTTP+SchemeSeparator, HTTPS+SchemeSeparator, "//") {
u.IsRelative = false
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
// on parse errors, if in unsafe mode, try parsing again with the unsafe parser
if u.Unsafe {
urlparse = parseUnsafeFullURL(u.Original)
if urlparse != nil {
parseErr = nil
}
}
if parseErr != nil {
return nil, errorutil.NewWithErr(parseErr).Msgf("failed to parse url")
}
}
copy(u.URL, urlparse)
} else {
// if there is no scheme prefix, try to parse it with an https prefix
// if that fails we consider it a relative path and not a full url
urlparse, parseErr := url.Parse(HTTPS + SchemeSeparator + u.Original)
if parseErr != nil {
// most likely a relative url
u.IsRelative = true
// TODO: investigate if prefix / should be added
} else {
urlparse.Scheme = "" // remove newly added scheme
copy(u.URL, urlparse)
}
}
return u, nil
}

// relativePathParser is common relative path parser logic used to avoid duplication of code
func relativePathParser(u *URL) (*URL, error) {
u.fetchParams()
urlparse, parseErr := url.Parse(u.Original)
if parseErr != nil {
if !u.Unsafe {
// should return an error when not in unsafe mode
return nil, errorutil.NewWithErr(parseErr).WithTag("urlutil").Msgf("failed to parse input url")
} else {
// if unsafe do not rely on net/url.Parse
u.Path = u.Original
}
}
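// if net/url parsing succeeded, copy the parsed fields but clear any host since the input is expected to be a relative path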
if urlparse != nil {
urlparse.Host = ""
copy(u.URL, urlparse)
}
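// apply the package's relative path handling (ex: unsafe encodings and `/` prefix autocorrection unless disabled)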
u.parseUnsafeRelativePath()
if u.Host != "" {
return nil, errorutil.NewWithTag("urlutil", "expected relative path but got absolute path with host=%v,input=%v", u.Host, u.Original)
}
return u, nil
}

// parseUnsafeFullURL parses invalid (unsafe) urls (ex: https://scanme.sh/%invalid)
// such urls are not supported as per the RFC and url.Parse fails on them
func parseUnsafeFullURL(urlx string) *url.URL {
// we only allow unsupported chars in path
// since url.Parse() returns an error, there isn't any standard way to do this
// Current methodology
// 1. temporarily remove the first `//` (scheme separator) so it does not collide with the path separator lookup
// 2. get the first index of `/` i.e. the path separator (if none, skip any further preprocessing)
// 3. if found, split the url into base and path (i.e. https://scanme.sh/%invalid => `https://scanme.sh` + `/%invalid`)
// 4. the host part is parsed by net/url.URL and the path is parsed manually
temp := strings.Replace(urlx, "//", "", 1)
index := strings.IndexRune(temp, '/')
if index == -1 {
return nil
}
urlPath := temp[index:]
urlHost := strings.TrimSuffix(urlx, urlPath)
parseURL, parseErr := url.Parse(urlHost)
if parseErr != nil {
return nil
}
if relpath, err := ParseRelativePath(urlPath, true); err == nil {
parseURL.Path = relpath.Path
return parseURL
}
return nil
}
