Skip to content

Commit

Permalink
Uses shallow clones as default for Git operations
Browse files Browse the repository at this point in the history
This is to avoid exhausting the resources while cloning very large
repos (such as github.com/openshift/origin, which consumes ~3GB
memory).

In case the "include-git-dir" flag is set, a deep clone will still
be used.

Signed-off-by: Bruno Pimentel <bpimente@redhat.com>
  • Loading branch information
brunoapimentel committed Jan 4, 2022
1 parent 3600456 commit 3603ce3
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 28 deletions.
2 changes: 2 additions & 0 deletions README.md
Expand Up @@ -594,6 +594,8 @@ with the `mod_auth_gssapi` module.

* `include-git-dir` - when used, `.git` file objects are not removed from the source bundle created
by Cachito. This is useful when the git history is important to the build process.
If this flag is absent, Cachito will use a shallow clone (depth=1) of the repository in order to save
resources.

* `cgo-disable` - use this flag to make Cachito set `CGO_ENABLED=0` while processing gomod packages.
This environment variable will only be used internally by Cachito, it will *not* be set in the
Expand Down
6 changes: 5 additions & 1 deletion cachito/web/api_v1.py
Expand Up @@ -346,7 +346,11 @@ def create_request():
error_callback = tasks.failed_request_callback.s(request.id)
chain_tasks = [
tasks.fetch_app_source.s(
request.repo, request.ref, request.id, "git-submodule" in pkg_manager_names
request.repo,
request.ref,
request.id,
"git-submodule" in pkg_manager_names,
"include-git-dir" in request.flags,
).on_error(error_callback)
]

Expand Down
42 changes: 29 additions & 13 deletions cachito/workers/scm.py
Expand Up @@ -143,24 +143,33 @@ def _create_archive(self, from_dir):
os.unlink(self.sources_dir.archive_path)
raise

def clone_and_archive(self, gitsubmodule=False):
def clone_and_archive(self, gitsubmodule=False, shallow_clone=False):
"""
Clone the git repository and create the compressed source archive.
:param bool gitsubmodule: a bool to determine whether git submodules need to be processed.
:param bool shallow_clone: determines if a shallow clone should be made (depth=1).
:raises CachitoError: if cloning the repository fails or if the archive can't be created
"""
with tempfile.TemporaryDirectory(prefix="cachito-") as temp_dir:
log.debug("Cloning the Git repository from %s", self.url)
clone_path = os.path.join(temp_dir, "repo")

kwargs = {
"no_checkout": True,
# Don't allow git to prompt for a username if we don't have access
"env": {"GIT_TERMINAL_PROMPT": "0"},
}

if shallow_clone:
kwargs["depth"] = 1

try:
repo = git.repo.Repo.clone_from(
self.url,
clone_path,
no_checkout=True,
# Don't allow git to prompt for a username if we don't have access
env={"GIT_TERMINAL_PROMPT": "0"},
)
repo = git.repo.Repo.clone_from(self.url, clone_path, **kwargs)

if shallow_clone:
# Fetch the exact commit we need
repo.remote().fetch(refspec=self.ref, depth=1)
except: # noqa E722
log.exception("Cloning the Git repository from %s failed", self.url)
raise CachitoError("Cloning the Git repository failed")
Expand All @@ -173,12 +182,13 @@ def clone_and_archive(self, gitsubmodule=False):
repo.git.gc("--prune=now")
self._create_archive(repo.working_dir)

def update_and_archive(self, previous_archive, gitsubmodule=False):
def update_and_archive(self, previous_archive, gitsubmodule=False, shallow_fetch=False):
"""
Update the existing Git history and create a source archive.
:param str previous_archive: path to an archive file created before.
:param bool gitsubmodule: a bool to determine whether git submodules need to be processed.
:param bool shallow_fetch: determines if only a single reference should be fetched.
:raises CachitoError: if pulling the Git history from the remote repo or
the checkout of the target Git ref fails.
"""
Expand All @@ -187,10 +197,13 @@ def update_and_archive(self, previous_archive, gitsubmodule=False):
tar.extractall(temp_dir)

repo = git.Repo(os.path.join(temp_dir, "app"))

kwargs = {"depth": 1} if shallow_fetch else {}

try:
# The reference must be specified to handle commits which are not part
# of a branch.
repo.remote().fetch(refspec=self.ref)
repo.remote().fetch(refspec=self.ref, **kwargs)
except: # noqa E722
log.exception("Failed to fetch from remote %s", self.url)
raise CachitoError("Failed to fetch from the remote Git repository")
Expand All @@ -202,10 +215,11 @@ def update_and_archive(self, previous_archive, gitsubmodule=False):
repo.git.gc("--prune=now")
self._create_archive(repo.working_dir)

def fetch_source(self, gitsubmodule=False):
def fetch_source(self, gitsubmodule=False, shallow_clone=False):
"""Fetch the repo, create a compressed tar file, and put it in long-term storage.
:param bool gitsubmodule: a bool to determine whether git submodules need to be processed.
:param bool shallow_clone: determines if a shallow clone should be made (depth=1).
"""
if gitsubmodule:
self.sources_dir = SourcesDir(self.repo_name, f"{self.ref}-with-submodules")
Expand Down Expand Up @@ -235,7 +249,9 @@ def fetch_source(self, gitsubmodule=False):
if "-with-submodules" in str(previous_archive):
continue
try:
self.update_and_archive(previous_archive, gitsubmodule=gitsubmodule)
self.update_and_archive(
previous_archive, gitsubmodule=gitsubmodule, shallow_fetch=shallow_clone,
)
return
except (
git.exc.InvalidGitRepositoryError,
Expand All @@ -253,7 +269,7 @@ def fetch_source(self, gitsubmodule=False):
previous_archive,
)

self.clone_and_archive(gitsubmodule=gitsubmodule)
self.clone_and_archive(gitsubmodule=gitsubmodule, shallow_clone=shallow_clone)

def update_git_submodules(self, repo):
"""Update git submodules.
Expand Down
6 changes: 4 additions & 2 deletions cachito/workers/tasks/general.py
Expand Up @@ -37,7 +37,7 @@

@app.task(priority=0)
@runs_if_request_in_progress
def fetch_app_source(url, ref, request_id, gitsubmodule=False):
def fetch_app_source(url, ref, request_id, gitsubmodule=False, include_git_dir=False):
"""
Fetch the application source code that was requested and put it in long-term storage.
Expand All @@ -51,7 +51,9 @@ def fetch_app_source(url, ref, request_id, gitsubmodule=False):
try:
# Default to Git for now
scm = Git(url, ref)
scm.fetch_source(gitsubmodule=gitsubmodule)
# In case don't need to keep the Git history, use a shallow clone to save resources
shallow_clone = not include_git_dir
scm.fetch_source(gitsubmodule=gitsubmodule, shallow_clone=shallow_clone)
except requests.Timeout:
raise CachitoError("The connection timed out while downloading the source")
except CachitoError:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_api_v1.py
Expand Up @@ -162,6 +162,7 @@ def test_create_and_fetch_request(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
"git-submodule" in expected_pkg_managers,
"include-git-dir" in flags if flags is not None else False,
).on_error(error_callback)
]
if "gomod" in expected_pkg_managers:
Expand Down Expand Up @@ -220,6 +221,7 @@ def test_create_request_with_gomod_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_gomod_source.si(1, [], package_value["gomod"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -250,6 +252,7 @@ def test_create_request_with_npm_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_npm_source.si(1, package_value["npm"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -290,6 +293,7 @@ def test_create_request_with_pip_package_configs(mock_chain, app, auth_env, clie
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_pip_source.si(1, package_value["pip"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -320,6 +324,7 @@ def test_create_request_with_yarn_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_yarn_source.si(1, package_value["yarn"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -372,6 +377,7 @@ def test_create_and_fetch_request_with_flag(mock_chain, app, auth_env, client, d
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_gomod_source.si(1, [], []).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down
47 changes: 35 additions & 12 deletions tests/test_workers/test_scm.py
Expand Up @@ -30,7 +30,9 @@ def test_repo_name():
assert git_obj.repo_name == "release-engineering/retrodep"


@pytest.mark.parametrize("gitsubmodule", [True, False])
@pytest.mark.parametrize(
"gitsubmodule, shallow_clone", [(True, False), (False, False), (True, True), (False, True)]
)
@mock.patch("tarfile.open")
@mock.patch("tempfile.TemporaryDirectory")
@mock.patch("tempfile.NamedTemporaryFile")
Expand All @@ -51,6 +53,7 @@ def test_clone_and_archive(
mock_temp_dir,
mock_tarfile_open,
gitsubmodule,
shallow_clone,
):
# Mock the archive being created
mock_exists.return_value = True
Expand All @@ -69,14 +72,16 @@ def test_clone_and_archive(
git_obj = scm.Git(url, ref)

with mock.patch.object(git_obj.sources_dir, "archive_path", new=archive_path):
git_obj.clone_and_archive(gitsubmodule)
git_obj.clone_and_archive(gitsubmodule, shallow_clone)

kwargs = {"depth": 1} if shallow_clone else {}

# Verify the tempfile.TemporaryDirectory context manager was used twice:
# once for _clone_and_archive and once for _verify_archive
assert mock_temp_dir.return_value.__enter__.call_count == 2
# Verify the repo was cloned and checked out properly
mock_clone.assert_called_once_with(
url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"}
url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"}, **kwargs
)
assert mock_clone.return_value.head.reference == mock_commit
mock_clone.return_value.head.reset.assert_called_once_with(index=True, working_tree=True)
Expand All @@ -89,6 +94,9 @@ def test_clone_and_archive(
mock_ugs.assert_called_once_with(mock_clone.return_value)
else:
mock_ugs.assert_not_called()
# In case a shallow clone was made, we also need to fetch the exact commit needed
if shallow_clone:
mock_clone.return_value.remote().fetch.assert_called_once_with(refspec=ref, **kwargs)

mock_clone.return_value.git.gc.assert_called_once_with("--prune=now")

Expand Down Expand Up @@ -155,7 +163,7 @@ def test_fetch_source_clone_if_no_archive_yet(mock_clone_and_archive, gitsubmodu
with po(scm_git.sources_dir.package_dir, "glob", return_value=[]):
scm_git.fetch_source(gitsubmodule)

mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule)
mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule, shallow_clone=False)


@pytest.mark.parametrize("gitsubmodule", [True, False])
Expand Down Expand Up @@ -194,7 +202,9 @@ def test_fetch_source_by_pull(mock_update_and_archive, mock_getctime, gitsubmodu
return_value=["29eh2a.tar.gz", "a8c2d2.tar.gz", "a8c2d2-with-submodules.tar.gz"],
):
scm_git.fetch_source(gitsubmodule)
mock_update_and_archive.assert_called_once_with("a8c2d2.tar.gz", gitsubmodule=gitsubmodule)
mock_update_and_archive.assert_called_once_with(
"a8c2d2.tar.gz", gitsubmodule=gitsubmodule, shallow_fetch=False
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -242,25 +252,36 @@ def test_fetch_source_by_pull_corrupt_archive(

assert mock_update_and_archive.call_count == 2
calls = [
mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule),
mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule),
mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule, shallow_fetch=False),
mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule, shallow_fetch=False),
]
mock_update_and_archive.assert_has_calls(calls)
if all_corrupt:
mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule)
mock_clone_and_archive.assert_called_once_with(
gitsubmodule=gitsubmodule, shallow_clone=False
)
else:
mock_clone_and_archive.assert_not_called()


@pytest.mark.parametrize("gitsubmodule", [True, False])
@pytest.mark.parametrize(
"gitsubmodule, shallow_fetch", [(True, False), (False, False), (True, True), (False, True)]
)
@mock.patch("tarfile.open")
@mock.patch("tempfile.TemporaryDirectory")
@mock.patch("git.Repo")
@mock.patch("cachito.workers.scm.run_cmd")
@mock.patch("os.path.exists")
@mock.patch("cachito.workers.scm.Git.update_git_submodules")
def test_update_and_archive(
mock_ugs, mock_exists, mock_fsck, mock_repo, mock_temp_dir, mock_tarfile_open, gitsubmodule
mock_ugs,
mock_exists,
mock_fsck,
mock_repo,
mock_temp_dir,
mock_tarfile_open,
gitsubmodule,
shallow_fetch,
):
# Mock the archive being created
mock_exists.return_value = True
Expand All @@ -270,16 +291,18 @@ def test_update_and_archive(
# Mock the tempfile.TemporaryDirectory context manager
mock_temp_dir.return_value.__enter__.return_value = "/tmp/cachito-temp"

kwargs = {"depth": 1} if shallow_fetch else {}

# Test does not really extract this archive file. The filename could be arbitrary.
scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule)
scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule, shallow_fetch)

# Verify the tempfile.TemporaryDirectory context manager was used twice:
# once for _update_and_archive and once for _verify_archive
assert mock_temp_dir.return_value.__enter__.call_count == 2

repo = mock_repo.return_value
# Verify the changes are pulled.
repo.remote.return_value.fetch.assert_called_once_with(refspec=ref)
repo.remote.return_value.fetch.assert_called_once_with(refspec=ref, **kwargs)
# Verify the repo is reset to specific ref
repo.commit.assert_called_once_with(ref)
assert repo.commit.return_value == repo.head.reference
Expand Down

0 comments on commit 3603ce3

Please sign in to comment.