Skip to content

Commit

Permalink
Use git's partial clone feature to speed up pip
Browse files Browse the repository at this point in the history
Clone with --filter=blob:none, which fetches all metadata up front
but fetches blobs only on demand, as checkout needs them.
Since pip typically needs the blobs for only a single revision,
this can be a big improvement, especially when fetching from
repositories with a lot of history, and particularly on slower
network connections.

Added unit test for the rev-less path. Confirmed that both
of the if/else paths are tested by the unit tests.
  • Loading branch information
nipunn1313 committed Aug 7, 2021
1 parent 9874fb9 commit fdf47ac
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 1 deletion.
1 change: 1 addition & 0 deletions news/9086.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
When a revision is specified in a Git URL, use git's partial clone feature to speed up source retrieval.
5 changes: 4 additions & 1 deletion src/pip/_internal/vcs/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,15 @@ def fetch_new(self, dest, url, rev_options):
# type: (str, HiddenText, RevOptions) -> None
rev_display = rev_options.to_display()
logger.info('Cloning %s%s to %s', url, rev_display, display_path(dest))
self.run_command(make_command('clone', '-q', url, dest))
self.run_command(make_command(
'clone', '--filter=blob:none', '-q', url, dest,
))

if rev_options.rev:
# Then a specific revision was requested.
rev_options = self.resolve_revision(dest, url, rev_options)
branch_name = getattr(rev_options, 'branch_name', None)
logger.info('Rev options %s, branch_name %s', rev_options, branch_name)
if branch_name is None:
# Only do a checkout if the current commit id doesn't match
# the requested revision.
Expand Down
49 changes: 49 additions & 0 deletions tests/functional/test_vcs_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,52 @@ def test_resolve_commit_not_on_branch(script, tmp_path):
# check we can fetch our commit
rev_options = Git.make_rev_options(commit)
Git().fetch_new(str(clone_path), repo_path.as_uri(), rev_options)


def test_fetch_new(script, tmp_path):
    """Check that clones made by ``Git.fetch_new`` can still ``git pull``.

    Two clones of a one-commit repository are created: one without a
    revision (clone at HEAD) and one pinned to the initial commit.  The
    test then pulls new commits into both clones, first while the origin
    repository does not allow filtering (git is expected to warn and
    ignore the filter) and then after filtering has been enabled on the
    origin (no warning expected).

    :param script: test environment fixture used to run git commands.
    :param tmp_path: per-test temporary directory fixture.
    """
    filter_warning = "warning: filtering not recognized by server, ignoring"

    repo_path = tmp_path / "repo"
    repo_file = repo_path / "file.txt"
    # Place the clones next to the origin repository, not inside its
    # working tree, so the origin never contains nested git repositories.
    clone_path1 = tmp_path / "clone1"
    clone_path2 = tmp_path / "clone2"
    clone_paths = (clone_path1, clone_path2)

    repo_path.mkdir()
    script.run("git", "init", cwd=str(repo_path))
    repo_file.write_text(u".")
    script.run("git", "add", "file.txt", cwd=str(repo_path))
    script.run("git", "commit", "-m", "initial commit", cwd=str(repo_path))
    commit = script.run(
        "git", "rev-parse", "HEAD", cwd=str(repo_path)
    ).stdout.strip()

    # Check that we can clone at HEAD
    Git().fetch_new(
        str(clone_path1), repo_path.as_uri(), Git.make_rev_options()
    )
    # Check that we can clone to commit
    Git().fetch_new(
        str(clone_path2), repo_path.as_uri(), Git.make_rev_options(commit)
    )

    # Write some additional stuff to git pull
    repo_file.write_text(u"..")
    script.run("git", "commit", "-am", "second commit", cwd=str(repo_path))

    # Make sure git pull works - even though server doesn't support filtering
    for clone_path in clone_paths:
        pull = script.run("git", "pull", cwd=str(clone_path))
        assert filter_warning in pull.stderr

    # Enable filtering support on server
    script.run(
        "git", "config", "uploadpack.allowFilter", "true",
        cwd=str(repo_path),
    )
    script.run(
        "git", "config", "uploadpack.allowanysha1inwant", "true",
        cwd=str(repo_path),
    )
    repo_file.write_text(u"...")
    script.run("git", "commit", "-am", "third commit", cwd=str(repo_path))

    # Make sure git pull works - even with server supporting filtering
    for clone_path in clone_paths:
        pull = script.run("git", "pull", cwd=str(clone_path))
        assert filter_warning not in pull.stderr

0 comments on commit fdf47ac

Please sign in to comment.