In [1]:
import os
import time
import pathlib
import subprocess
import shutil
import re
from pathlib import Path


from dotenv import load_dotenv

# Load variables through python-dotenv before any of them are read below.
load_dotenv()

# Clone settings taken from the process environment; each one is None when
# the corresponding variable is not set.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
REPO_OWNER = os.environ.get("REPO_OWNER")
REPO_NAME = os.environ.get("REPO_NAME")
BRANCH_NAME = os.environ.get("BRANCH_NAME")

# Relative folder used as the clone destination (`dest_dir`) further down.
WORKSPACE_FOLDER = "clean_workspace"

In [2]:
get_file_id = lambda url: re.search(r"([A-Za-z0-9_-]{33})", url).group(1) if re.search(r"([A-Za-z0-9_-]{33})", url) else None

def reset_directory(DIR_TO_RESET):
    """
    Leave `DIR_TO_RESET` as an existing, empty directory.

    Anything already at that path is removed with `shutil.rmtree`, then the
    directory (and any missing parents) is created again and a confirmation
    line is printed.
    """
    target = Path(DIR_TO_RESET)

    # Wipe the previous contents, if there are any.
    had_previous_content = target.exists()
    if had_previous_content:
        shutil.rmtree(DIR_TO_RESET)

    # Recreate the folder from scratch.
    target.mkdir(parents=True, exist_ok=True)
    print(f"Directory {DIR_TO_RESET} Cleaned up!")


def clone_repo(
    *,
    repo_owner: str,
    repo_name: str,
    branch: str | None = None,
    tag: str | None = None,
    dest_dir: str = "workspace",
    github_token: str | None = None,
    reset_dest_directory: bool = True
):
    """
    Clone a GitHub repo into `dest_dir`.

    Parameters
    ----------
    repo_owner   : GitHub org / user (e.g. "TuringGpt")
    repo_name    : Repository name   (e.g. "google-agents-api-gen")
    branch       : Branch to check out.  None → default branch.
    tag          : Tag to check out. Takes precedence over branch if both specified.
    dest_dir     : Local folder path to clone into.
    github_token : Personal-access token.  None → anonymous / public clone.
    reset_dest_directory  : If True and dest_dir exists, delete it before cloning.
    """

    dest = pathlib.Path(dest_dir).expanduser().resolve()

    # ─── 1.  Prepare destination ────────────────────────────────────────────
    if reset_dest_directory:
        reset_directory(dest_dir)

    # ─── 2.  Compose repo URL ───────────────────────────────────────────────
    if github_token:
        repo_url = f"https://{github_token}@github.com/{repo_owner}/{repo_name}.git"
        # Avoid printing the token if the command is later echoed
        safe_repo_url = f"https://***TOKEN***@github.com/{repo_owner}/{repo_name}.git"
    else:
        repo_url = f"https://github.com/{repo_owner}/{repo_name}.git"
        safe_repo_url = repo_url

    # ─── 3.  Determine what to checkout ─────────────────────────────────────
    checkout_ref = None
    checkout_type = None
    
    if tag:
        checkout_ref = tag
        checkout_type = "tag"
    elif branch:
        checkout_ref = branch
        checkout_type = "branch"
    
    cmd = ["git", "clone"]
    if checkout_ref:
        cmd += ["--branch", checkout_ref, "--single-branch"]
    cmd += [repo_url, str(dest)]

    # ─── 4.  Run and time it ────────────────────────────────────────────────
    print("─── Git clone ─────────────────────────────────────────────────")
    print("Repo      :", f"{repo_owner}/{repo_name}")
    if checkout_ref:
        print(f"{checkout_type.capitalize():<8} :", checkout_ref)
    print("Target dir:", dest)
    t0 = time.perf_counter()

    # Use env to suppress accidental token leakage in subprocess noise
    env = os.environ.copy()
    env["GIT_TERMINAL_PROMPT"] = "0"  # fail fast on auth errors

    try:
        subprocess.run(cmd, check=True, env=env)
    except subprocess.CalledProcessError as e:
        # Mask token in any exception message
        raise RuntimeError(str(e).replace(repo_url, safe_repo_url)) from None

    elapsed = time.perf_counter() - t0

    # ─── 5.  Quick summary ─────────────────────────────────────────────────
    # Repo size on disk
    repo_size = sum(f.stat().st_size for f in dest.rglob('*')) / 1e6  # MB
    print(f"Clone completed in {elapsed:,.1f}s – size on disk ≈{repo_size:,.1f} MB")
    print("───────────────────────────────────────────────────────────────")

    return dest

In [3]:
# Clone the configured branch into the workspace folder. The call is the last
# expression of the cell, so the returned path is displayed as its output.
clone_repo(
    github_token=GITHUB_TOKEN,
    repo_owner=REPO_OWNER,
    repo_name=REPO_NAME,
    branch=BRANCH_NAME,
    dest_dir=WORKSPACE_FOLDER,
    reset_dest_directory=True,  # same as the default: empties the folder first
)

Directory clean_workspace Cleaned up!
─── Git clone ─────────────────────────────────────────────────
Repo      : TuringGpt/google-agents-api-gen
Branch   : rev22
Target dir: /home/guilherme/projects/turing/google-agents-api-gen/notebooks/rev22_preparation/clean_workspace


Cloning into '/home/guilherme/projects/turing/google-agents-api-gen/notebooks/rev22_preparation/clean_workspace'...


Clone completed in 6.8s – size on disk ≈65.3 MB
───────────────────────────────────────────────────────────────


PosixPath('/home/guilherme/projects/turing/google-agents-api-gen/notebooks/rev22_preparation/clean_workspace')