Skip to content

Commit

Permalink
feat(utils): expand parsable valid git remote url formats (#771)
Browse files Browse the repository at this point in the history
Git remote url parsing now supports additional formats (ssh, https, file, git)
  • Loading branch information
codejedi365 committed Dec 22, 2023
1 parent 245e878 commit cf75f23
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 42 deletions.
112 changes: 70 additions & 42 deletions semantic_release/helpers.py
Expand Up @@ -3,6 +3,7 @@
import re
import string
from functools import lru_cache, wraps
from pathlib import PurePosixPath
from typing import Any, Callable, NamedTuple, TypeVar
from urllib.parse import urlsplit

Expand Down Expand Up @@ -73,67 +74,94 @@ def dynamic_import(import_path: str) -> Any:


class ParsedGitUrl(NamedTuple):
    """Container for the elements parsed from a git URL."""

    # URL scheme of the (normalized) remote, e.g. "https", "ssh", "git", "file"
    scheme: str
    # Network location exactly as found in the URL; may carry a "user@" prefix
    # and/or a ":port" suffix, and is empty for file:// remotes
    netloc: str
    # Path leading up to the repository, without leading or trailing slashes
    # (e.g. "group/subgroup")
    namespace: str
    # Final path segment with any trailing ".git" removed
    repo_name: str


# Pattern for scp-like git remotes of the form
#     <user>@<netloc>:<namespace>/<repo_name>[/]
# It is compiled with re.VERBOSE, so whitespace inside the pattern is ignored
# and "#" starts an in-pattern comment.
# Named groups: user, netloc, namespace, repo_name.
# The repo_name group also captures a trailing ".git"; code that uses the
# match is expected to strip that suffix itself.
GIT_URL_REGEX = re.compile(
r"""
^
(?P<user>[\w\d\-.]+)@
(?P<netloc>[^:]+)
:
(?P<namespace>[\w\.@\:/\-~]+)
/
(?P<repo_name>[\w\.\_\-]+) # Note this also catches the ".git" at the end if present
/?
$
""", # noqa: E501
flags=re.VERBOSE,
)


@lru_cache(maxsize=512)
def parse_git_url(url: str) -> ParsedGitUrl:
    """
    Parse a git remote url (http[s]://, git://, file://, or ssh) into a ParsedGitUrl.

    The url is first normalized into an explicit ``scheme://`` form and then
    split with ``urllib.parse.urlsplit``. The last path segment becomes the
    repository name (a trailing ``.git`` is dropped) and everything before it
    becomes the namespace.

    supported examples:
        http://git.mycompany.com/username/myproject.git
        https://github.com/username/myproject.git
        https://gitlab.com/group/subgroup/myproject.git
        https://git.mycompany.com:4443/username/myproject.git
        git://host.xz/path/to/repo.git/
        git://host.xz:9418/path/to/repo.git/
        git@github.com:username/myproject.git                  <-- assumes ssh://
        ssh://git@github.com:3759/myproject.git                <-- non-standard, but assume user 3759
        ssh://git@github.com:username/myproject.git
        ssh://git@bitbucket.org:7999/username/myproject.git
        git+ssh://git@github.com:username/myproject.git
        /Users/username/dev/remote/myproject.git               <-- Posix File paths
        file:///Users/username/dev/remote/myproject.git
        C:/Users/username/dev/remote/myproject.git             <-- Windows File paths
        file:///C:/Users/username/dev/remote/myproject.git

    REFERENCE: https://stackoverflow.com/questions/31801271/what-are-the-supported-git-url-formats

    :param url: the git remote url to parse
    :return: the scheme, netloc, namespace and repository name of the url
    :raises ValueError: if no scheme can be determined, or if the namespace,
        the repository name, or (for non-file urls) the netloc is missing
    """
    log.debug("Parsing git url %r", url)

    # Normalizers are (pattern, replacement) pairs applied in order
    normalizers = (
        # normalize implicit ssh urls to explicit ssh://
        # (user names may contain dots, dashes and underscores)
        (r"^([\w._-]+@)", r"ssh://\1"),
        # normalize git+ssh:// urls to ssh://
        (r"^git\+ssh://", "ssh://"),
        # normalize an scp like syntax to URL compatible syntax
        # excluding port definitions (:#####) & including numeric usernames
        # (host names may contain dashes)
        (r"(ssh://(?:[\w._-]+@)?[\w.-]+):(?!\d{1,5}/\w+/)(.*)$", r"\1/\2"),
        # normalize implicit file (windows || posix) urls to explicit file:// urls
        (r"^([C-Z]:/)|^/(\w)", r"file:///\1\2"),
    )

    for pattern, replacement in normalizers:
        url = re.sub(pattern, replacement, url)

    # run the url through urlsplit to separate out the parts
    urllib_split = urlsplit(url)

    # Fail if url scheme not found
    if not urllib_split.scheme:
        raise ValueError(f"Cannot parse {url!r}")

    # We have been able to parse the url with urlsplit,
    # so it's a (file|git|ssh|https?)://... structure
    # but we aren't validating the protocol scheme as it's not our business

    # PurePosixPath collapses duplicate and trailing slashes, so the final
    # segment is always the repository itself
    normalized_path = str(PurePosixPath(urllib_split.path)).lstrip("/")
    namespace, _, name = normalized_path.rpartition("/")

    # strip out the .git at the end of the repo_name if present
    if name.endswith(".git"):
        name = name[:-4]

    # file:// urls legitimately have an empty net location
    has_netloc = bool(urllib_split.netloc) or urllib_split.scheme == "file"

    # check that we have all the required parts of the url
    if not (has_netloc and namespace and name):
        raise ValueError(f"Bad url: {url!r}")

    return ParsedGitUrl(
        scheme=urllib_split.scheme,
        netloc=urllib_split.netloc,
        namespace=namespace,
        repo_name=name,
    )
82 changes: 82 additions & 0 deletions tests/unit/semantic_release/test_helpers.py
@@ -0,0 +1,82 @@
import pytest

from semantic_release.helpers import ParsedGitUrl, parse_git_url


@pytest.mark.parametrize(
    ("url", "expected"),
    [
        # --- http(s) remotes ---
        (
            "http://git.mycompany.com/username/myproject.git",
            ParsedGitUrl("http", "git.mycompany.com", "username", "myproject"),
        ),
        (
            "https://github.com/username/myproject.git",
            ParsedGitUrl("https", "github.com", "username", "myproject"),
        ),
        (
            "https://gitlab.com/group/subgroup/myproject.git",
            ParsedGitUrl("https", "gitlab.com", "group/subgroup", "myproject"),
        ),
        (
            "https://git.mycompany.com:4443/username/myproject.git",
            ParsedGitUrl("https", "git.mycompany.com:4443", "username", "myproject"),
        ),
        # --- git protocol remotes ---
        (
            "git://host.xz/path/to/repo.git/",
            ParsedGitUrl("git", "host.xz", "path/to", "repo"),
        ),
        (
            "git://host.xz:9418/path/to/repo.git/",
            ParsedGitUrl("git", "host.xz:9418", "path/to", "repo"),
        ),
        # --- ssh remotes (implicit, explicit, scp-like and with a port) ---
        (
            "git@github.com:username/myproject.git",
            ParsedGitUrl("ssh", "git@github.com", "username", "myproject"),
        ),
        (
            "ssh://git@github.com:3759/myproject.git",
            ParsedGitUrl("ssh", "git@github.com", "3759", "myproject"),
        ),
        (
            "ssh://git@github.com:username/myproject.git",
            ParsedGitUrl("ssh", "git@github.com", "username", "myproject"),
        ),
        (
            "ssh://git@bitbucket.org:7999/username/myproject.git",
            ParsedGitUrl("ssh", "git@bitbucket.org:7999", "username", "myproject"),
        ),
        (
            "git+ssh://git@github.com:username/myproject.git",
            ParsedGitUrl("ssh", "git@github.com", "username", "myproject"),
        ),
        # --- local file remotes (posix and windows) ---
        (
            "/Users/username/dev/remote/myproject.git",
            ParsedGitUrl("file", "", "Users/username/dev/remote", "myproject"),
        ),
        (
            "file:///Users/username/dev/remote/myproject.git",
            ParsedGitUrl("file", "", "Users/username/dev/remote", "myproject"),
        ),
        (
            "C:/Users/username/dev/remote/myproject.git",
            ParsedGitUrl("file", "", "C:/Users/username/dev/remote", "myproject"),
        ),
        (
            "file:///C:/Users/username/dev/remote/myproject.git",
            ParsedGitUrl("file", "", "C:/Users/username/dev/remote", "myproject"),
        ),
    ],
)
def test_parse_valid_git_urls(url: str, expected: ParsedGitUrl):
    """Each supported git remote url form is split into the expected parts."""
    actual = parse_git_url(url)
    assert actual == expected


@pytest.mark.parametrize(
    "url",
    [
        # has a scheme and host, but no path to take a namespace/repo from
        "icmp://git",
        # no scheme and not an absolute path
        "abcdefghijklmnop.git",
        # relative paths are not normalized to file://
        "../relative/path/to/repo.git",
        # repository name present, but no namespace in front of it
        "http://domain/project.git",
    ],
)
def test_parse_invalid_git_urls(url: str):
    """An unparsable git remote url is rejected with a ValueError."""
    with pytest.raises(ValueError):
        parse_git_url(url)

0 comments on commit cf75f23

Please sign in to comment.