/
parallel_clone_repos.py
56 lines (41 loc) · 1.55 KB
/
parallel_clone_repos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import subprocess
from multiprocessing import Pool
from github import Github
# GitHub account whose repositories are listed and cloned.
ORG = "huggingface"
# Relative directory under which each repository is cloned.
MIRROR_DIRECTORY = "hf_public_repos"
def get_repos(username, access_token=None, include_fork=False):
    """Return the names of the repositories listed for a GitHub account.

    Courtesy: Chansung Park.

    Args:
        username: Login passed to ``Github.get_user``.
        access_token: Value passed to the ``Github`` constructor.
        include_fork: Forked repositories are kept only when this is
            exactly ``True``.

    Returns:
        A list of repository names, in the order ``get_repos()`` yields them.
    """
    account = Github(access_token).get_user(username)
    # Keep a repository when its ``fork`` flag is exactly False; anything
    # else is kept only if the caller explicitly asked for forks.
    return [
        repo.name
        for repo in account.get_repos()
        if repo.fork is False or include_fork is True
    ]
def mirror_repository(repository):
    """Clone one repository of ``ORG`` into ``MIRROR_DIRECTORY``.

    Args:
        repository: Name of the repository; used both in the GitHub URL and
            as the name of the destination sub-directory.

    The exit status of ``git clone`` is not inspected, and nothing is returned.
    """
    destination = os.path.join(MIRROR_DIRECTORY, repository)
    clone_command = [
        "git",
        "clone",
        f"https://github.com/{ORG}/{repository}.git",
        destination,
    ]
    # Argument list (no shell), so the repository name is passed verbatim.
    subprocess.run(clone_command)
def mirror_repositories():
    """Clone every repository of ``ORG`` into ``MIRROR_DIRECTORY`` in parallel.

    Repository names are listed with ``get_repos`` using the token found in
    the ``GH_ACCESS_TOKEN`` environment variable; each one is then cloned by
    ``mirror_repository`` through a ``multiprocessing.Pool``.

    Raises:
        ValueError: If ``GH_ACCESS_TOKEN`` is unset or empty.
    """
    # Use .get() so an unset variable reaches the ValueError below instead of
    # escaping as a KeyError, and validate before any filesystem side effect.
    access_token = os.environ.get("GH_ACCESS_TOKEN")
    if not access_token:
        raise ValueError("You must set `GH_ACCESS_TOKEN` as an env variable.")

    # Create the mirror directory if it doesn't exist; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(MIRROR_DIRECTORY, exist_ok=True)

    # Get the list of repositories in the organization.
    repositories = get_repos(ORG, access_token)
    print(f"Total repositories found: {len(repositories)}.")

    # Mirror repositories using multiprocessing; map() blocks until every
    # clone call has returned.
    print("Cloning repositories.")
    with Pool() as pool:
        pool.map(mirror_repository, repositories)
# Start the mirroring only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    mirror_repositories()