In [69]:
import requests
import os
import shutil
import subprocess
from pathlib import Path
from tqdm import tqdm

# Base URL of the GitHub REST API
GITHUB_API = "https://api.github.com"

# Read the GitHub access token from a local file so that it is not hard-coded in the notebook
with open("../../gh_access.txt", "r") as file:
    # strip() removes all surrounding whitespace (including "\r" from CRLF line endings),
    # which would otherwise end up inside the Authorization header
    ACCESS_TOKEN = file.read().strip()

In [70]:
def get_repos_with_query(query, num_repos, timeout=30):
    """
    Fetch repositories using a specific GitHub API query.

    Args:
        query: Search expression sent as the `q` parameter of the repository search endpoint.
        num_repos: Number of results requested via `per_page`. Only the first page is
            fetched (NOTE: the GitHub API documents a maximum of 100 results per page).
        timeout: Seconds to wait for the server before `requests` raises a timeout
            error instead of blocking indefinitely.

    Returns:
        The list of repository objects from the response's `items` field, sorted by
        stars in descending order, or an empty list if the status code is not 200.
    """
    headers = {'Authorization': f'token {ACCESS_TOKEN}'}
    params = {
        'q': query,
        'sort': 'stars',
        'order': 'desc',
        'per_page': num_repos
    }
    # Without an explicit timeout, requests waits forever on an unresponsive server
    response = requests.get(
        f"{GITHUB_API}/search/repositories",
        headers=headers,
        params=params,
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Failed to fetch repositories: {response.status_code}")
        return []

    # Tolerate a successful response that lacks the 'items' field
    return response.json().get('items', [])

def get_popular_repos(num_repos_per_query=1):
    """
    Collect popular repositories across several search queries.

    Each query is sent to `get_repos_with_query` with `num_repos_per_query` as the
    requested result count. Results are de-duplicated by repository id and
    returned as a list in order of first appearance.
    """
    search_queries = (
        'language:python',
        'django in:name,description',
        'flask in:name,description',
        'data-science in:name,description',
        'machine-learning in:name,description',
    )

    # Keyed by repository id: a repeated id keeps its original position in the
    # dict while its value is replaced by the most recent result
    repos_by_id = {}
    for search_query in search_queries:
        for found_repo in get_repos_with_query(search_query, num_repos_per_query):
            repos_by_id[found_repo['id']] = found_repo

    return list(repos_by_id.values())

def clone_repo(repo_url, output_dir):
    """
    Clone a repository into a specified directory.

    Args:
        repo_url: URL of the repository passed to `git clone`.
        output_dir: Directory to clone into; it is created if it does not exist yet.

    Returns:
        True if `git clone` exited with status 0, False otherwise. A failed clone
        does not raise, so callers that ignore the return value behave as before.
    """
    os.makedirs(output_dir, exist_ok=True)
    result = subprocess.run(["git", "clone", repo_url, output_dir])
    # Expose git's exit status so callers can tell a failed clone from an empty repository
    return result.returncode == 0


def extract_python_files(repo_dir, output_dir):
    """
    Extract all Python files from a repository and place them in a flat structure in the output directory.

    Files are copied under their own file name. When that name is already taken in
    `output_dir` (e.g. several `__init__.py` files), the copy is instead named after
    its path relative to `repo_dir` with path separators replaced by "__", so that
    no file is overwritten. Existing files in `output_dir` are never replaced.

    Args:
        repo_dir: Root directory of the repository to search recursively.
        output_dir: Directory receiving the copies; it is created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    for root, dirs, files in os.walk(repo_dir):
        # Sort in place so that the traversal order, and therefore which file keeps
        # its plain name on a collision, is deterministic
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".py"):
                continue

            file_path = os.path.join(root, file)
            # Broken symlinks and other non-regular entries cannot be copied
            if not os.path.isfile(file_path):
                continue

            destination = os.path.join(output_dir, file)
            if os.path.exists(destination):
                # Name collision: fall back to the flattened relative path
                flat_name = os.path.relpath(file_path, repo_dir).replace(os.sep, "__")
                destination = os.path.join(output_dir, flat_name)

                # Extremely unlikely, but make sure the fallback name is unique as well
                stem, extension = os.path.splitext(flat_name)
                counter = 1
                while os.path.exists(destination):
                    destination = os.path.join(output_dir, f"{stem}_{counter}{extension}")
                    counter += 1

            shutil.copy(file_path, destination)

# Rationale
- Medium number of repositories
- Diverse uses of Python
- Popular repositories reflect current Python trends
- Popular packages are written by professionals and have good code quality
- Personal repositories reflect personal use cases of Python
- Liked repositories reflect personal interests

In [71]:
# Fetch the 3 most-starred repositories for each search query, then split the
# de-duplicated result: the first 10 are the primary picks, the rest are backups
all_popular_repos = get_popular_repos(num_repos_per_query=3)
popular_repos, backup_repos = all_popular_repos[:10], all_popular_repos[10:]

In [72]:
# Sanity check: number of primary and backup repositories obtained from the search
len(popular_repos), len(backup_repos)

(10, 5)

In [73]:
# Get 5 of the most widely used Python packages
# (hand-picked GitHub URLs; the last path segment is later used as the repository name)
popular_package_repos_urls = [
    "https://github.com/numpy/numpy",
    "https://github.com/pandas-dev/pandas",
    "https://github.com/matplotlib/matplotlib",
    "https://github.com/scikit-learn/scikit-learn",
    "https://github.com/python-pillow/Pillow"
]

In [74]:
# Get some personal repositories
# (hand-picked GitHub URLs; the last path segment is later used as the repository name)
personal_repos_urls = [
    "https://github.com/psaegert/pmtrendviz",
    "https://github.com/psaegert/nli-nec"
]

In [75]:
# Get some repositories that I like
# (hand-picked GitHub URLs; the last path segment is later used as the repository name)
liked_repos_urls = [
    "https://github.com/graphdeco-inria/gaussian-splatting",
    "https://github.com/lllyasviel/ControlNet",
    "https://github.com/maltfield/awesome-lemmy-instances",
    "https://github.com/Aleph-Alpha/aleph-alpha-client",
    "https://github.com/MaartenGr/BERTopic",
    "https://github.com/MilesCranmer/PySR",
    "https://github.com/AUTOMATIC1111/stable-diffusion-webui",
    "https://github.com/microsoft/Codex-CLI",
]

In [76]:
# Merge the hand-picked URL lists and pair every URL with its repository name
# (the last path segment), giving a list of (repo_url, repo_name) tuples
repos_urls = [*popular_package_repos_urls, *personal_repos_urls, *liked_repos_urls]
repos_names = [url.rsplit("/", 1)[-1] for url in repos_urls]
repos = [(url, name) for url, name in zip(repos_urls, repos_names)]

# Append the repositories found via the GitHub search: first the primary picks, then
# the backups (which will be skipped in case the goal of 25 is reached)
for api_repo in popular_repos + backup_repos:
    repos.append((api_repo['html_url'], api_repo['name']))

In [77]:
# Abort if any (repo_url, repo_name) entry occurs more than once, listing every
# repeated occurrence before raising
seen = set()
duplicates = []
for repo in repos:
    if repo in seen:
        duplicates.append(repo)
    else:
        seen.add(repo)

if duplicates:
    for duplicate in duplicates:
        print(duplicate)
    raise Exception("Duplicate repositories found")

In [81]:
# Directory that receives one sub-folder of extracted Python files per repository
base_output_dir = "scraped_repos"
# exist_ok=True keeps this cell re-runnable when the directory already exists
os.makedirs(base_output_dir, exist_ok=True)

In [79]:
# Total number of candidate repositories (hand-picked lists + search results incl. backups)
len(repos)

30

In [82]:
# Target size of the dataset: the backup repositories at the end of `repos` are only
# used to fill slots left open by repositories that contain no Python files
MAX_SCRAPED_REPOS = 25

for repo_url, repo_name in tqdm(repos):
    clone_repo_dir = f"cloned_repos/{repo_name}"
    output_repo_dir = os.path.join(base_output_dir, repo_name)

    # Skip repositories that have already been scraped (makes the cell re-runnable)
    if os.path.exists(output_repo_dir):
        continue

    # Stop as soon as the target number of scraped repositories is reached; the
    # count cannot decrease during the loop, so no later repository would be processed
    if len(os.listdir(base_output_dir)) >= MAX_SCRAPED_REPOS:
        break

    # Start from a clean slate: a stale clone left by an interrupted run would make
    # `git clone` fail and the extraction would then pick up outdated files
    shutil.rmtree(clone_repo_dir, ignore_errors=True)

    try:
        clone_repo(repo_url, clone_repo_dir)
        extract_python_files(clone_repo_dir, output_repo_dir)
    finally:
        # Always remove the clone to save space, also for empty repositories and on errors
        shutil.rmtree(clone_repo_dir, ignore_errors=True)

    # If no Python files were extracted, remove the directory so that the
    # repository does not count towards the target
    if len(os.listdir(output_repo_dir)) == 0:
        shutil.rmtree(output_repo_dir)
        continue

    # Record the URL of every successfully scraped repository
    with open("repositories.txt", "a") as file:
        file.write(repo_url + "\n")

  0%|          | 0/30 [00:00<?, ?it/s]Cloning into 'cloned_repos/numpy'...
  3%|▎         | 1/30 [00:21<10:22, 21.46s/it]Cloning into 'cloned_repos/pandas'...
  7%|▋         | 2/30 [01:09<17:13, 36.92s/it]Cloning into 'cloned_repos/matplotlib'...
 10%|█         | 3/30 [03:01<32:01, 71.18s/it]Cloning into 'cloned_repos/scikit-learn'...
 13%|█▎        | 4/30 [03:25<22:52, 52.80s/it]Cloning into 'cloned_repos/Pillow'...
 17%|█▋        | 5/30 [03:51<17:58, 43.16s/it]Cloning into 'cloned_repos/pmtrendviz'...
 20%|██        | 6/30 [03:52<11:28, 28.68s/it]Cloning into 'cloned_repos/nli-nec'...
 23%|██▎       | 7/30 [04:07<09:16, 24.19s/it]Cloning into 'cloned_repos/gaussian-splatting'...
 27%|██▋       | 8/30 [04:09<06:16, 17.11s/it]Cloning into 'cloned_repos/ControlNet'...
 30%|███       | 9/30 [04:32<06:40, 19.05s/it]Cloning into 'cloned_repos/awesome-lemmy-instances'...
 33%|███▎      | 10/30 [04:36<04:49, 14.45s/it]Cloning into 'cloned_repos/aleph-alpha-client'...
 37%|███▋      | 11/30 [

In [83]:
# Remove the 'cloned_repos' directory; ignore_errors avoids a FileNotFoundError when the
# directory does not exist (e.g. a re-run in which every repository was already scraped)
shutil.rmtree("cloned_repos", ignore_errors=True)

In [84]:
# Read the repositories.txt file and check that all repositories have been scraped
with open("repositories.txt", "r") as file:
    # Keep only non-empty lines: unlike cutting off the last character, this neither
    # truncates a URL when the trailing newline is missing nor yields an empty entry
    # for an empty file
    urls = [line.strip() for line in file if line.strip()]

# The last path segment of each URL is the folder name used when scraping
repo_names = [url.split("/")[-1] for url in urls]
folder_names = os.listdir(base_output_dir)

for repo_name in repo_names:
    if repo_name not in folder_names:
        print(f"Missing repository: {repo_name}")