From b1c6c527f8b55a4bec535b70ceb5c930df0daafb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 13:51:35 +0000 Subject: [PATCH 1/8] Initial plan From eb1ab68b46bbb0bc5583de54b4160ee6238f4147 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 13:58:25 +0000 Subject: [PATCH 2/8] Add comprehensive GitHub App documentation for organization backup - Document GitHub App creation and configuration steps - Explain required permissions/scopes for organization backup - Add installation access token generation instructions - Include practical examples for using --as-app with organizations - Cover differences between personal access tokens and GitHub App tokens Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- README.rst | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/README.rst b/README.rst index 5dcef950..19715da0 100644 --- a/README.rst +++ b/README.rst @@ -171,6 +171,146 @@ Customise the permissions for your use case, but for a personal account full bac **Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. +GitHub App Authentication +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For backing up entire organizations, **GitHub App authentication** (``--as-app``) is often the most effective approach as it provides broader access across organization repositories and higher rate limits. + +Creating a GitHub App for Organization Backup +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. **Create the GitHub App**: + + * Go to your organization's settings: ``https://github.com/organizations/YOUR_ORG/settings/apps`` + * Click "New GitHub App" + * Fill in basic information: + - App name: e.g., "Organization Backup Tool" + - Homepage URL: Can be your organization's website + - Webhook URL: Not required, can leave blank or use a placeholder + +2. **Configure Permissions**: + + **Repository permissions** (select "Read" access for): + + * Contents + * Issues + * Metadata + * Pull requests + * Webhooks + * Repository projects (if backing up projects) + + **Organization permissions** (select "Read" access for): + + * Members + * Metadata + + **Account permissions** (select "Read" access for): + + * Starring + * Watching + +3. **Installation Settings**: + + * Set "Where can this GitHub App be installed?" to "Only on this account" for security + * Under "Repository access", choose "All repositories" to backup the entire organization + +4. **Generate Keys**: + + * After creating the app, go to "General" tab and scroll down to "Private keys" + * Click "Generate a private key" and download the ``.pem`` file safely + +5. **Install the App**: + + * Go to "Install App" tab in your app settings + * Click "Install" next to your organization + * Choose "All repositories" or select specific repositories you want to backup + +Generating Installation Access Tokens +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +GitHub Apps use installation access tokens that expire after 1 hour. You'll need to generate these programmatically: + +**Option 1: Using GitHub CLI (recommended for manual runs)**:: + + # Install GitHub CLI if not already installed + # https://cli.github.com/ + + # Generate installation access token + gh auth token --hostname github.com --scopes repo + +**Option 2: Using a script for automation**: + +You can create a script to generate tokens using your app's private key. Here's a basic approach using Python:: + + import jwt + import time + import requests + + # Your GitHub App details + app_id = "YOUR_APP_ID" + private_key_path = "path/to/your/private-key.pem" + installation_id = "YOUR_INSTALLATION_ID" # Find this in app settings + + # Generate JWT + with open(private_key_path, 'r') as key_file: + private_key = key_file.read() + + payload = { + 'iat': int(time.time()), + 'exp': int(time.time()) + 600, # 10 minutes + 'iss': app_id + } + + jwt_token = jwt.encode(payload, private_key, algorithm='RS256') + + # Get installation access token + headers = { + 'Authorization': f'Bearer {jwt_token}', + 'Accept': 'application/vnd.github.v3+json' + } + + response = requests.post( + f'https://api.github.com/app/installations/{installation_id}/access_tokens', + headers=headers + ) + + installation_token = response.json()['token'] + +Using GitHub App for Organization Backup +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once you have an installation access token, use it with the ``--as-app`` flag:: + + # Full organization backup with GitHub App + export GITHUB_APP_TOKEN="ghs_xxxxxxxxxxxxxxxxxxxx" + github-backup YOUR_ORG \ + --token $GITHUB_APP_TOKEN \ + --as-app \ + --organization \ + --output-directory /backup/github-org \ + --all \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls + +**Key differences when using** ``--as-app``: + +* Higher rate limits (5000 requests/hour per installation) +* Access to all organization repositories (if app is installed with "All repositories") +* Uses ``Authorization: token `` header format +* Includes GitHub App API headers for proper app identification +* Works with organization-wide permissions + +**Important Notes**: + +* Installation access tokens expire after 1 hour - you may need to refresh them for long-running backups +* The app must be installed on the organization with appropriate repository access +* Use classic personal access tokens (``-t TOKEN_CLASSIC``) with ``--as-app``, not fine-grained tokens +* GitHub Apps have separate rate limits from personal access tokens + + Prefer SSH ~~~~~~~~~~ @@ -313,6 +453,86 @@ Debug an error/block or incomplete backup into a temporary directory. Omit "incr github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER +GitHub App Organization Backup Examples +======================================== + +Backup entire organization using GitHub App (recommended for organizations):: + + export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx # Installation access token + ORGANIZATION=mycompany + + github-backup $ORGANIZATION \ + --token $GITHUB_APP_TOKEN \ + --as-app \ + --organization \ + --output-directory /backup/github-org \ + --all \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments \ + --labels \ + --milestones + +Incremental organization backup with GitHub App for automated/cron scenarios:: + + export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx + ORGANIZATION=mycompany + + github-backup $ORGANIZATION \ + --token $GITHUB_APP_TOKEN \ + --as-app \ + --organization \ + --output-directory /backup/github-org \ + --incremental \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments \ + --labels \ + --milestones \ + --log-level error + +Backup specific organization repository with comprehensive data using GitHub App:: + + export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx + ORGANIZATION=mycompany + REPO=main-project + + github-backup $ORGANIZATION \ + --token $GITHUB_APP_TOKEN \ + --as-app \ + --organization \ + --repository $REPO \ + --output-directory /backup/specific-repo \ + --all \ + --private \ + --pull-details \ + --releases \ + --assets + +Organization backup excluding certain repositories:: + + export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx + ORGANIZATION=mycompany + + github-backup $ORGANIZATION \ + --token $GITHUB_APP_TOKEN \ + --as-app \ + --organization \ + --output-directory /backup/github-org \ + --all \ + --private \ + --exclude repo-to-skip another-repo-to-skip \ + --throttle-limit 4500 \ + --throttle-pause 0.8 + Development =========== From 08dbe62138a33afe37a744f6af2fe2f886bfcf32 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:13:39 +0000 Subject: [PATCH 3/8] Fix GitHub App token generation documentation for automated cron jobs - Replace incorrect GitHub CLI option with proper programmatic approach - Add complete Python script for generating installation access tokens - Include step-by-step cron job setup with environment variables - Document how to find App ID, Installation ID, and other credentials - Address client secret confusion by clarifying what credentials are needed Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- README.rst | 152 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 37 deletions(-) diff --git a/README.rst b/README.rst index 19715da0..62c85188 100644 --- a/README.rst +++ b/README.rst @@ -214,67 +214,145 @@ Creating a GitHub App for Organization Backup * Set "Where can this GitHub App be installed?" to "Only on this account" for security * Under "Repository access", choose "All repositories" to backup the entire organization -4. **Generate Keys**: +4. **Generate Keys and Secret**: * After creating the app, go to "General" tab and scroll down to "Private keys" * Click "Generate a private key" and download the ``.pem`` file safely + * Note your **App ID** (displayed at the top of the General tab) + * Click "Generate a new client secret" and copy the client secret (you'll need this for automated scripts) 5. **Install the App**: * Go to "Install App" tab in your app settings * Click "Install" next to your organization * Choose "All repositories" or select specific repositories you want to backup + * Note the **Installation ID** from the URL after installation (e.g., ``https://github.com/organizations/ORG/settings/installations/12345678`` - the installation ID is ``12345678``) -Generating Installation Access Tokens -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Generating Installation Access Tokens for Automated Backups +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -GitHub Apps use installation access tokens that expire after 1 hour. You'll need to generate these programmatically: +GitHub Apps use installation access tokens that expire after 1 hour. For automated backups (e.g., cron jobs), you need to generate these tokens programmatically using your app's credentials. -**Option 1: Using GitHub CLI (recommended for manual runs)**:: +**Complete Script for Token Generation**: - # Install GitHub CLI if not already installed - # https://cli.github.com/ - - # Generate installation access token - gh auth token --hostname github.com --scopes repo - -**Option 2: Using a script for automation**: - -You can create a script to generate tokens using your app's private key. Here's a basic approach using Python:: +Create a script (e.g., ``generate-github-token.py``) to generate installation access tokens:: + #!/usr/bin/env python3 import jwt import time import requests + import os + import sys - # Your GitHub App details - app_id = "YOUR_APP_ID" - private_key_path = "path/to/your/private-key.pem" - installation_id = "YOUR_INSTALLATION_ID" # Find this in app settings + # Your GitHub App details - set these as environment variables or modify here + APP_ID = os.environ.get('GITHUB_APP_ID', 'YOUR_APP_ID') + PRIVATE_KEY_PATH = os.environ.get('GITHUB_PRIVATE_KEY_PATH', '/path/to/your/private-key.pem') + INSTALLATION_ID = os.environ.get('GITHUB_INSTALLATION_ID', 'YOUR_INSTALLATION_ID') - # Generate JWT - with open(private_key_path, 'r') as key_file: - private_key = key_file.read() + def generate_installation_token(): + # Read the private key + try: + with open(PRIVATE_KEY_PATH, 'r') as key_file: + private_key = key_file.read() + except FileNotFoundError: + print(f"Error: Private key file not found at {PRIVATE_KEY_PATH}") + sys.exit(1) + + # Generate JWT token + now = int(time.time()) + payload = { + 'iat': now - 60, # Issued 1 minute in the past to avoid clock drift + 'exp': now + 600, # Expires in 10 minutes + 'iss': APP_ID + } + + try: + jwt_token = jwt.encode(payload, private_key, algorithm='RS256') + except Exception as e: + print(f"Error generating JWT: {e}") + sys.exit(1) + + # Get installation access token + headers = { + 'Authorization': f'Bearer {jwt_token}', + 'Accept': 'application/vnd.github.v3+json', + 'X-GitHub-Api-Version': '2022-11-28' + } + + try: + response = requests.post( + f'https://api.github.com/app/installations/{INSTALLATION_ID}/access_tokens', + headers=headers + ) + response.raise_for_status() + return response.json()['token'] + except requests.exceptions.RequestException as e: + print(f"Error getting installation token: {e}") + if response.status_code == 404: + print("Check your installation ID - the app may not be installed or ID is incorrect") + sys.exit(1) - payload = { - 'iat': int(time.time()), - 'exp': int(time.time()) + 600, # 10 minutes - 'iss': app_id - } + if __name__ == '__main__': + token = generate_installation_token() + print(token) + +**Setup for Automated Cron Jobs**: + +1. **Install required Python packages**:: + + pip install PyJWT requests + +2. **Set up environment variables** (in your cron environment or script):: + + export GITHUB_APP_ID="123456" + export GITHUB_PRIVATE_KEY_PATH="/secure/path/to/github-app-private-key.pem" + export GITHUB_INSTALLATION_ID="12345678" + +3. **Create a backup script** (e.g., ``nightly-backup.sh``):: + + #!/bin/bash + set -e - jwt_token = jwt.encode(payload, private_key, algorithm='RS256') + # Generate fresh GitHub App installation token + GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) - # Get installation access token - headers = { - 'Authorization': f'Bearer {jwt_token}', - 'Accept': 'application/vnd.github.v3+json' - } + if [ -z "$GITHUB_APP_TOKEN" ]; then + echo "Failed to generate GitHub App token" + exit 1 + fi - response = requests.post( - f'https://api.github.com/app/installations/{installation_id}/access_tokens', - headers=headers - ) + # Run the backup + github-backup YOUR_ORGANIZATION \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory /backup/github-org \ + --incremental \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments \ + --labels \ + --milestones \ + --log-level error + +4. **Add to crontab for nightly runs**:: + + # Edit crontab + crontab -e - installation_token = response.json()['token'] + # Add this line for nightly backup at 2 AM + 0 2 * * * /path/to/nightly-backup.sh >> /var/log/github-backup.log 2>&1 + +**Finding Your App Credentials**: + +* **App ID**: Found in your GitHub App settings under "General" tab (top of page) +* **Installation ID**: Found in the URL after installing the app: ``https://github.com/organizations/YOUR_ORG/settings/installations/INSTALLATION_ID`` +* **Private Key**: Downloaded as ``.pem`` file when you generate it in app settings +* **Client Secret**: Generated in app settings (not needed for this token generation method) Using GitHub App for Organization Backup ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 95358a41d341513097a836c67eb69156727a4ddf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:32:15 +0000 Subject: [PATCH 4/8] Add comprehensive token expiry handling and classic vs fine-grained token explanation Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- README.rst | 238 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/README.rst b/README.rst index 62c85188..33a1599f 100644 --- a/README.rst +++ b/README.rst @@ -388,6 +388,244 @@ Once you have an installation access token, use it with the ``--as-app`` flag:: * Use classic personal access tokens (``-t TOKEN_CLASSIC``) with ``--as-app``, not fine-grained tokens * GitHub Apps have separate rate limits from personal access tokens +Token Type Selection: Classic vs Fine-Grained +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Why Classic Tokens with GitHub Apps?** + +When using ``--as-app``, you must use classic personal access tokens (``-t TOKEN_CLASSIC``) rather than fine-grained tokens (``-f TOKEN_FINE``) for the following technical reasons: + +1. **GitHub App Installation Tokens are Classic Format**: Installation access tokens generated by GitHub Apps follow the classic token format (``ghs_`` prefix), not the fine-grained format +2. **API Compatibility**: The ``--as-app`` flag configures the tool to use GitHub App-specific API headers and authentication methods that expect classic token format +3. **Scope Differences**: Fine-grained tokens are designed for user-scoped access to specific repositories, while GitHub App installation tokens provide organization-wide access with app-specific permissions + +**Fine-grained tokens** are intended for: +- User personal access with repository-specific scopes +- Direct user authentication (not app authentication) +- Newer, more granular permission model + +**Classic tokens** (including GitHub App installation tokens) are used for: +- Application-based authentication (``--as-app``) +- Organization-wide access patterns +- Legacy API compatibility requirements + +Handling Long-Running Backups and Token Expiry +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**The 1-Hour Token Expiry Challenge** + +GitHub App installation access tokens expire after exactly 1 hour. For large organization backups that may take several hours, this creates a potential problem: + +* **What happens during expiry**: When the token expires mid-backup, GitHub API requests will start returning ``401 Unauthorized`` errors +* **Impact on backup**: The backup process will fail and exit, potentially leaving an incomplete backup +* **Data integrity**: Depending on when the expiry occurs, you may have partial repository clones, incomplete issue data, or missing metadata + +**Strategies for Long-Running Backups** + +**1. Pre-emptive Token Refresh Strategy** + +Create a wrapper script that monitors backup duration and refreshes tokens proactively:: + + #!/bin/bash + # long-running-backup.sh + set -e + + ORGANIZATION="$1" + BACKUP_DIR="$2" + GITHUB_APP_TOKEN="" + BACKUP_PID="" + + # Function to generate fresh token + generate_token() { + echo "Generating fresh GitHub App token..." + GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) + if [ -z "$GITHUB_APP_TOKEN" ]; then + echo "Failed to generate token" + exit 1 + fi + echo "Token generated successfully" + } + + # Function to start backup in background + start_backup() { + echo "Starting backup process..." + github-backup "$ORGANIZATION" \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory "$BACKUP_DIR" \ + --incremental \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments \ + --labels \ + --milestones \ + --log-level info & + BACKUP_PID=$! + echo "Backup started with PID: $BACKUP_PID" + } + + # Main backup loop with token refresh + run_backup_with_refresh() { + generate_token + start_backup + + # Monitor backup and refresh token every 50 minutes (before 1-hour expiry) + while kill -0 $BACKUP_PID 2>/dev/null; do + echo "Backup running... waiting 50 minutes before token refresh" + sleep 3000 # 50 minutes + + if kill -0 $BACKUP_PID 2>/dev/null; then + echo "Backup still running, killing to refresh token..." + kill $BACKUP_PID + wait $BACKUP_PID 2>/dev/null || true + + # Generate new token and restart + generate_token + start_backup + fi + done + + wait $BACKUP_PID + echo "Backup completed successfully" + } + + # Usage: ./long-running-backup.sh myorg /backup/path + run_backup_with_refresh + +**2. Segmented Backup Strategy** + +Break large backups into smaller chunks that complete within the token lifetime:: + + #!/bin/bash + # segmented-backup.sh + set -e + + ORGANIZATION="$1" + BACKUP_DIR="$2" + + # Generate fresh token for each segment + generate_token() { + python3 /path/to/generate-github-token.py + } + + # Backup repositories only (usually the longest part) + echo "=== Backing up repositories ===" + GITHUB_APP_TOKEN=$(generate_token) + github-backup "$ORGANIZATION" \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory "$BACKUP_DIR" \ + --incremental \ + --private \ + --repositories \ + --wikis + + # Backup issues and pull requests + echo "=== Backing up issues and pulls ===" + GITHUB_APP_TOKEN=$(generate_token) + github-backup "$ORGANIZATION" \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory "$BACKUP_DIR" \ + --incremental \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments + + # Backup metadata + echo "=== Backing up metadata ===" + GITHUB_APP_TOKEN=$(generate_token) + github-backup "$ORGANIZATION" \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory "$BACKUP_DIR" \ + --incremental \ + --labels \ + --milestones + + echo "Segmented backup completed" + +**3. Error-Resilient Incremental Strategy** + +Use incremental backups with error handling to resume from failures:: + + #!/bin/bash + # resilient-backup.sh + set -e + + ORGANIZATION="$1" + BACKUP_DIR="$2" + MAX_RETRIES=3 + + run_backup_with_retry() { + local attempt=1 + + while [ $attempt -le $MAX_RETRIES ]; do + echo "Backup attempt $attempt of $MAX_RETRIES" + + # Generate fresh token for each attempt + GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) + + if github-backup "$ORGANIZATION" \ + --token "$GITHUB_APP_TOKEN" \ + --as-app \ + --organization \ + --output-directory "$BACKUP_DIR" \ + --incremental \ + --private \ + --repositories \ + --wikis \ + --issues \ + --pulls \ + --issue-comments \ + --pull-comments \ + --labels \ + --milestones \ + --log-level info; then + echo "Backup completed successfully on attempt $attempt" + return 0 + else + echo "Backup failed on attempt $attempt" + if [ $attempt -eq $MAX_RETRIES ]; then + echo "All retry attempts exhausted" + return 1 + fi + attempt=$((attempt + 1)) + echo "Waiting 2 minutes before retry..." + sleep 120 + fi + done + } + + run_backup_with_retry + +**Recommended Approach for Production** + +For automated nightly backups, the **segmented backup strategy** is recommended because: + +1. **Predictable timing**: Each segment completes well within 1 hour +2. **Clear progress**: You can see which parts completed successfully +3. **Efficient recovery**: If one segment fails, you don't need to restart everything +4. **Resource friendly**: Uses incremental backups to minimize repeated work + +**Monitoring Token Expiry** + +To detect token expiry issues in your logs, watch for these error patterns:: + + # In your backup logs, look for: + grep -i "401\|unauthorized\|token.*expired\|authentication.*failed" /var/log/github-backup.log + +Set up alerting on these patterns to get notified when token refresh is needed. + Prefer SSH ~~~~~~~~~~ From 62a03ae834fffbf3e31d3093bb211c3785710952 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:50:20 +0000 Subject: [PATCH 5/8] Add GitHub App credential arguments and internal token generation Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- bin/github-backup | 2 + github_backup/github_backup.py | 222 +++++++++++++++++++++++++++++++-- 2 files changed, 216 insertions(+), 8 deletions(-) diff --git a/bin/github-backup b/bin/github-backup index b33d19ff..555c119a 100755 --- a/bin/github-backup +++ b/bin/github-backup @@ -14,6 +14,7 @@ from github_backup.github_backup import ( mkdir_p, parse_args, retrieve_repositories, + validate_args, ) logging.basicConfig( @@ -25,6 +26,7 @@ logging.basicConfig( def main(): args = parse_args() + validate_args(args) if args.quiet: logger.setLevel(logging.WARNING) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 4b2d7905..934db740 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -19,13 +19,16 @@ import subprocess import sys import time -from datetime import datetime +from datetime import datetime, timedelta from http.client import IncompleteRead from urllib.error import HTTPError, URLError from urllib.parse import quote as urlquote from urllib.parse import urlencode, urlparse from urllib.request import HTTPRedirectHandler, Request, build_opener, urlopen +# GitHub App authentication imports +import jwt + try: from . import __version__ @@ -37,6 +40,11 @@ FILE_URI_PREFIX = "file://" logger = logging.getLogger(__name__) +# Global variables for GitHub App token management +_github_app_token = None +_github_app_token_expires = None +_github_app_credentials = None + https_ctx = ssl.create_default_context() if not https_ctx.get_ca_certs(): import warnings @@ -160,6 +168,21 @@ def parse_args(args=None): dest="as_app", help="authenticate as github app instead of as a user.", ) + parser.add_argument( + "--app-id", + dest="app_id", + help="GitHub App ID for app authentication", + ) + parser.add_argument( + "--installation-id", + dest="installation_id", + help="GitHub App Installation ID for app authentication", + ) + parser.add_argument( + "--private-key", + dest="private_key", + help="GitHub App private key (PEM format) or path to private key file (file://...)", + ) parser.add_argument( "-o", "--output-directory", @@ -440,9 +463,81 @@ def parse_args(args=None): return parser.parse_args(args) +def validate_args(args): + """Validate argument combinations and dependencies.""" + # Auto-enable --as-app when GitHub App credentials are provided + if args.app_id and args.installation_id and args.private_key: + if not args.as_app: + logger.info("GitHub App credentials provided. Automatically enabling --as-app mode.") + args.as_app = True + + # GitHub App authentication validation + if args.as_app: + # Check if user provided GitHub App credentials + app_creds_provided = bool(args.app_id and args.installation_id and args.private_key) + # Check if user provided a token + token_provided = bool(args.token_classic) + + if not app_creds_provided and not token_provided: + raise Exception( + "When using --as-app, you must provide either:\n" + " 1. GitHub App credentials: --app-id, --installation-id, --private-key, OR\n" + " 2. A pre-generated installation token: --token" + ) + + if app_creds_provided and token_provided: + raise Exception( + "Cannot use both GitHub App credentials (--app-id, --installation-id, --private-key) " + "and pre-generated token (--token) simultaneously. Choose one approach." + ) + + # Validate that GitHub App credentials are complete if any are provided + app_cred_args = [args.app_id, args.installation_id, args.private_key] + app_creds_partial = any(app_cred_args) and not all(app_cred_args) + + if app_creds_partial: + missing = [] + if not args.app_id: + missing.append("--app-id") + if not args.installation_id: + missing.append("--installation-id") + if not args.private_key: + missing.append("--private-key") + + raise Exception( + f"Incomplete GitHub App credentials. Missing: {', '.join(missing)}\n" + "All three are required: --app-id, --installation-id, --private-key" + ) + + def get_auth(args, encode=True, for_git_cli=False): + global _github_app_credentials auth = None + # Handle GitHub App authentication + if args.app_id and args.installation_id and args.private_key: + if not args.as_app: + logger.warning("GitHub App credentials provided but --as-app not specified. Enabling app authentication.") + args.as_app = True + + # Store credentials globally for token refresh + _github_app_credentials = (args.app_id, args.installation_id, args.private_key) + + # Get fresh token + token = get_or_refresh_github_app_token() + if not token: + raise Exception("Failed to generate GitHub App installation token") + + if not for_git_cli: + auth = token + else: + auth = "x-access-token:" + token + + # For GitHub App tokens, we don't need to encode + if not encode or not for_git_cli: + return auth + return base64.b64encode(auth.encode("ascii")) + if args.osx_keychain_item_name: if not args.osx_keychain_item_account: raise Exception( @@ -516,6 +611,71 @@ def get_auth(args, encode=True, for_git_cli=False): return base64.b64encode(auth.encode("ascii")) +def generate_github_app_token(app_id, installation_id, private_key): + """Generate an installation access token for GitHub App authentication.""" + try: + # Load private key + if private_key.startswith(FILE_URI_PREFIX): + private_key = read_file_contents(private_key) + + # Create JWT payload + now = int(time.time()) + payload = { + "iat": now - 60, # Issued at (1 minute ago to account for clock skew) + "exp": now + 600, # Expires in 10 minutes (max allowed) + "iss": int(app_id) # Issuer (GitHub App ID) + } + + # Generate JWT + jwt_token = jwt.encode(payload, private_key, algorithm="RS256") + + # Request installation access token + url = f"https://api.github.com/app/installations/{installation_id}/access_tokens" + headers = { + "Authorization": f"Bearer {jwt_token}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": f"github-backup/{VERSION}" + } + + request = Request(url, headers=headers, method="POST") + request.data = b"" # Empty POST body + + response = urlopen(request, context=https_ctx) + data = json.loads(response.read().decode("utf-8")) + + token = data["token"] + expires_at = datetime.fromisoformat(data["expires_at"].replace("Z", "+00:00")) + + logger.info(f"Generated GitHub App installation token (expires at {expires_at})") + return token, expires_at + + except Exception as e: + raise Exception(f"Failed to generate GitHub App token: {str(e)}") + + +def get_or_refresh_github_app_token(): + """Get current GitHub App token or refresh it if expired/missing.""" + global _github_app_token, _github_app_token_expires, _github_app_credentials + + if not _github_app_credentials: + return None + + app_id, installation_id, private_key = _github_app_credentials + + # Check if we need a new token (5 minutes buffer before expiry) + now = datetime.now().replace(tzinfo=None) + if (_github_app_token is None or + _github_app_token_expires is None or + now >= (_github_app_token_expires.replace(tzinfo=None) - timedelta(minutes=5))): + + logger.info("Generating new GitHub App token...") + _github_app_token, _github_app_token_expires = generate_github_app_token( + app_id, installation_id, private_key + ) + + return _github_app_token + + def get_github_api_host(args): if args.github_host: host = args.github_host + "/api/v3" @@ -572,7 +732,6 @@ def get_github_repo_url(args, repository): def retrieve_data_gen(args, template, query_args=None, single_request=False): - auth = get_auth(args, encode=not args.as_app) query_args = get_query_args(query_args) per_page = 100 page = 0 @@ -584,16 +743,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False): page = page + 1 request_page, request_per_page = page, per_page + # Get fresh auth on each request to handle token refresh + auth = get_auth(args, encode=not args.as_app) + request = _construct_request( - request_per_page, request_page, + request_per_page, query_args, template, auth, as_app=args.as_app, fine=True if args.token_fine is not None else False, ) # noqa - r, errors = _get_response(request, auth, template) + r, errors = _get_response(request, auth, template, args) status_code = int(r.getcode()) # Check if we got correct data @@ -687,7 +849,7 @@ def get_query_args(query_args=None): return query_args -def _get_response(request, auth, template): +def _get_response(request, auth, template, args=None): retry_timeout = 3 errors = [] # We'll make requests in a loop so we can @@ -697,8 +859,35 @@ def _get_response(request, auth, template): try: r = urlopen(request, context=https_ctx) except HTTPError as exc: - errors, should_continue = _request_http_error(exc, auth, errors) # noqa + errors, should_continue = _request_http_error(exc, auth, errors, args) # noqa r = exc + + # If token was refreshed, we need to reconstruct the request with new auth + if should_continue and args and _github_app_credentials: + new_auth = get_auth(args, encode=not args.as_app) + if new_auth != auth: + # Extract the original URL from the request + original_url = request.get_full_url() + + # Parse URL to get query parameters + parsed_url = urlparse(original_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + + # Reconstruct the request with new auth + request = Request(original_url) + + # Set the new authorization header + if not args.as_app: + request.add_header("Authorization", f"Basic {new_auth.decode('ascii')}") + else: + if args.token_fine: + request.add_header("Authorization", f"token {new_auth}") + else: + request.add_header("Authorization", f"token {new_auth}") + + request.add_header("User-Agent", f"github-backup/{VERSION}") + auth = new_auth # Update local auth variable + except URLError as e: logger.warning(e.reason) should_continue, retry_timeout = _request_url_error(template, retry_timeout) @@ -756,7 +945,7 @@ def _construct_request( return request -def _request_http_error(exc, auth, errors): +def _request_http_error(exc, auth, errors, args=None): # HTTPError behaves like a Response so we can # check the status code and headers to see exactly # what failed. @@ -765,7 +954,24 @@ def _request_http_error(exc, auth, errors): headers = exc.headers limit_remaining = int(headers.get("x-ratelimit-remaining", 0)) - if exc.code == 403 and limit_remaining < 1: + # Handle GitHub App token expiry (401 Unauthorized) + if exc.code == 401 and _github_app_credentials is not None: + logger.warning("GitHub App token expired (401 Unauthorized). Refreshing token...") + try: + # Force refresh the token + global _github_app_token, _github_app_token_expires + _github_app_token = None # Force regeneration + _github_app_token_expires = None + + new_token = get_or_refresh_github_app_token() + if new_token: + logger.info("Successfully refreshed GitHub App token") + should_continue = True + else: + logger.error("Failed to refresh GitHub App token") + except Exception as e: + logger.error(f"Error refreshing GitHub App token: {str(e)}") + elif exc.code == 403 and limit_remaining < 1: # The X-RateLimit-Reset header includes a # timestamp telling us when the limit will reset # so we can calculate how long to wait rather From 3296835e42f4a4e9f712052c79dce57f72f78445 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:56:50 +0000 Subject: [PATCH 6/8] Simplify GitHub App documentation and add PyJWT dependency Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- README.rst | 632 ++++----------------------------- github_backup/github_backup.py | 8 +- requirements.txt | 3 +- 3 files changed, 81 insertions(+), 562 deletions(-) diff --git a/README.rst b/README.rst index 33a1599f..3121d56d 100644 --- a/README.rst +++ b/README.rst @@ -171,10 +171,16 @@ Customise the permissions for your use case, but for a personal account full bac **Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. + GitHub App Authentication ~~~~~~~~~~~~~~~~~~~~~~~~~~ -For backing up entire organizations, **GitHub App authentication** (``--as-app``) is often the most effective approach as it provides broader access across organization repositories and higher rate limits. +For backing up entire organizations, **GitHub App authentication** (``--as-app``) is the recommended approach as it provides: + +* **Higher rate limits**: 5000 requests/hour per installation vs standard personal token limits +* **Broader access**: Organization-wide repository access when installed with "All repositories" +* **Enterprise-friendly**: Proper app-based authentication for organizational backup scenarios +* **Automated token management**: No need to manually handle token expiry during long backups Creating a GitHub App for Organization Backup ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -214,529 +220,62 @@ Creating a GitHub App for Organization Backup * Set "Where can this GitHub App be installed?" to "Only on this account" for security * Under "Repository access", choose "All repositories" to backup the entire organization -4. **Generate Keys and Secret**: +4. **Generate Private Key**: - * After creating the app, go to "General" tab and scroll down to "Private keys" - * Click "Generate a private key" and download the ``.pem`` file safely - * Note your **App ID** (displayed at the top of the General tab) - * Click "Generate a new client secret" and copy the client secret (you'll need this for automated scripts) + * After creating the app, scroll down to "Private keys" section + * Click "Generate a private key" + * Download the ``.pem`` file and store it securely 5. **Install the App**: - * Go to "Install App" tab in your app settings + * Go to the "Install App" tab in your GitHub App settings * Click "Install" next to your organization - * Choose "All repositories" or select specific repositories you want to backup - * Note the **Installation ID** from the URL after installation (e.g., ``https://github.com/organizations/ORG/settings/installations/12345678`` - the installation ID is ``12345678``) - -Generating Installation Access Tokens for Automated Backups -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -GitHub Apps use installation access tokens that expire after 1 hour. For automated backups (e.g., cron jobs), you need to generate these tokens programmatically using your app's credentials. - -**Complete Script for Token Generation**: - -Create a script (e.g., ``generate-github-token.py``) to generate installation access tokens:: - - #!/usr/bin/env python3 - import jwt - import time - import requests - import os - import sys - - # Your GitHub App details - set these as environment variables or modify here - APP_ID = os.environ.get('GITHUB_APP_ID', 'YOUR_APP_ID') - PRIVATE_KEY_PATH = os.environ.get('GITHUB_PRIVATE_KEY_PATH', '/path/to/your/private-key.pem') - INSTALLATION_ID = os.environ.get('GITHUB_INSTALLATION_ID', 'YOUR_INSTALLATION_ID') - - def generate_installation_token(): - # Read the private key - try: - with open(PRIVATE_KEY_PATH, 'r') as key_file: - private_key = key_file.read() - except FileNotFoundError: - print(f"Error: Private key file not found at {PRIVATE_KEY_PATH}") - sys.exit(1) - - # Generate JWT token - now = int(time.time()) - payload = { - 'iat': now - 60, # Issued 1 minute in the past to avoid clock drift - 'exp': now + 600, # Expires in 10 minutes - 'iss': APP_ID - } - - try: - jwt_token = jwt.encode(payload, private_key, algorithm='RS256') - except Exception as e: - print(f"Error generating JWT: {e}") - sys.exit(1) - - # Get installation access token - headers = { - 'Authorization': f'Bearer {jwt_token}', - 'Accept': 'application/vnd.github.v3+json', - 'X-GitHub-Api-Version': '2022-11-28' - } - - try: - response = requests.post( - f'https://api.github.com/app/installations/{INSTALLATION_ID}/access_tokens', - headers=headers - ) - response.raise_for_status() - return response.json()['token'] - except requests.exceptions.RequestException as e: - print(f"Error getting installation token: {e}") - if response.status_code == 404: - print("Check your installation ID - the app may not be installed or ID is incorrect") - sys.exit(1) - - if __name__ == '__main__': - token = generate_installation_token() - print(token) - -**Setup for Automated Cron Jobs**: - -1. **Install required Python packages**:: - - pip install PyJWT requests - -2. **Set up environment variables** (in your cron environment or script):: - - export GITHUB_APP_ID="123456" - export GITHUB_PRIVATE_KEY_PATH="/secure/path/to/github-app-private-key.pem" - export GITHUB_INSTALLATION_ID="12345678" - -3. **Create a backup script** (e.g., ``nightly-backup.sh``):: - - #!/bin/bash - set -e - - # Generate fresh GitHub App installation token - GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) - - if [ -z "$GITHUB_APP_TOKEN" ]; then - echo "Failed to generate GitHub App token" - exit 1 - fi - - # Run the backup - github-backup YOUR_ORGANIZATION \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ - --organization \ - --output-directory /backup/github-org \ - --incremental \ - --private \ - --repositories \ - --wikis \ - --issues \ - --pulls \ - --issue-comments \ - --pull-comments \ - --labels \ - --milestones \ - --log-level error - -4. **Add to crontab for nightly runs**:: - - # Edit crontab - crontab -e - - # Add this line for nightly backup at 2 AM - 0 2 * * * /path/to/nightly-backup.sh >> /var/log/github-backup.log 2>&1 - -**Finding Your App Credentials**: + * Choose "All repositories" for comprehensive backup access -* **App ID**: Found in your GitHub App settings under "General" tab (top of page) -* **Installation ID**: Found in the URL after installing the app: ``https://github.com/organizations/YOUR_ORG/settings/installations/INSTALLATION_ID`` -* **Private Key**: Downloaded as ``.pem`` file when you generate it in app settings -* **Client Secret**: Generated in app settings (not needed for this token generation method) +6. **Get Required Information**: + + * **App ID**: Found in your GitHub App settings under "General" tab (the number at the top) + * **Installation ID**: After installing, the URL will show the installation ID: ``/organizations/YOUR_ORG/settings/installations/INSTALLATION_ID`` + * **Private Key**: The ``.pem`` file you downloaded -Using GitHub App for Organization Backup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Using GitHub App Authentication +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Once you have an installation access token, use it with the ``--as-app`` flag:: +With the GitHub App created and installed, you can use it directly with github-backup:: - # Full organization backup with GitHub App - export GITHUB_APP_TOKEN="ghs_xxxxxxxxxxxxxxxxxxxx" github-backup YOUR_ORG \ - --token $GITHUB_APP_TOKEN \ - --as-app \ + --app-id 123456 \ + --installation-id 789012 \ + --private-key /path/to/your-app.pem \ --organization \ - --output-directory /backup/github-org \ - --all \ - --private \ --repositories \ - --wikis \ - --issues \ - --pulls - -**Key differences when using** ``--as-app``: - -* Higher rate limits (5000 requests/hour per installation) -* Access to all organization repositories (if app is installed with "All repositories") -* Uses ``Authorization: token `` header format -* Includes GitHub App API headers for proper app identification -* Works with organization-wide permissions - -**Important Notes**: - -* Installation access tokens expire after 1 hour - you may need to refresh them for long-running backups -* The app must be installed on the organization with appropriate repository access -* Use classic personal access tokens (``-t TOKEN_CLASSIC``) with ``--as-app``, not fine-grained tokens -* GitHub Apps have separate rate limits from personal access tokens - -Token Type Selection: Classic vs Fine-Grained -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -**Why Classic Tokens with GitHub Apps?** - -When using ``--as-app``, you must use classic personal access tokens (``-t TOKEN_CLASSIC``) rather than fine-grained tokens (``-f TOKEN_FINE``) for the following technical reasons: - -1. **GitHub App Installation Tokens are Classic Format**: Installation access tokens generated by GitHub Apps follow the classic token format (``ghs_`` prefix), not the fine-grained format -2. **API Compatibility**: The ``--as-app`` flag configures the tool to use GitHub App-specific API headers and authentication methods that expect classic token format -3. **Scope Differences**: Fine-grained tokens are designed for user-scoped access to specific repositories, while GitHub App installation tokens provide organization-wide access with app-specific permissions - -**Fine-grained tokens** are intended for: -- User personal access with repository-specific scopes -- Direct user authentication (not app authentication) -- Newer, more granular permission model - -**Classic tokens** (including GitHub App installation tokens) are used for: -- Application-based authentication (``--as-app``) -- Organization-wide access patterns -- Legacy API compatibility requirements - -Handling Long-Running Backups and Token Expiry -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -**The 1-Hour Token Expiry Challenge** - -GitHub App installation access tokens expire after exactly 1 hour. For large organization backups that may take several hours, this creates a potential problem: - -* **What happens during expiry**: When the token expires mid-backup, GitHub API requests will start returning ``401 Unauthorized`` errors -* **Impact on backup**: The backup process will fail and exit, potentially leaving an incomplete backup -* **Data integrity**: Depending on when the expiry occurs, you may have partial repository clones, incomplete issue data, or missing metadata - -**Strategies for Long-Running Backups** - -**1. Pre-emptive Token Refresh Strategy** - -Create a wrapper script that monitors backup duration and refreshes tokens proactively:: - - #!/bin/bash - # long-running-backup.sh - set -e - - ORGANIZATION="$1" - BACKUP_DIR="$2" - GITHUB_APP_TOKEN="" - BACKUP_PID="" - - # Function to generate fresh token - generate_token() { - echo "Generating fresh GitHub App token..." - GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) - if [ -z "$GITHUB_APP_TOKEN" ]; then - echo "Failed to generate token" - exit 1 - fi - echo "Token generated successfully" - } - - # Function to start backup in background - start_backup() { - echo "Starting backup process..." - github-backup "$ORGANIZATION" \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ - --organization \ - --output-directory "$BACKUP_DIR" \ - --incremental \ - --private \ - --repositories \ - --wikis \ - --issues \ - --pulls \ - --issue-comments \ - --pull-comments \ - --labels \ - --milestones \ - --log-level info & - BACKUP_PID=$! - echo "Backup started with PID: $BACKUP_PID" - } - - # Main backup loop with token refresh - run_backup_with_refresh() { - generate_token - start_backup - - # Monitor backup and refresh token every 50 minutes (before 1-hour expiry) - while kill -0 $BACKUP_PID 2>/dev/null; do - echo "Backup running... waiting 50 minutes before token refresh" - sleep 3000 # 50 minutes - - if kill -0 $BACKUP_PID 2>/dev/null; then - echo "Backup still running, killing to refresh token..." - kill $BACKUP_PID - wait $BACKUP_PID 2>/dev/null || true - - # Generate new token and restart - generate_token - start_backup - fi - done - - wait $BACKUP_PID - echo "Backup completed successfully" - } - - # Usage: ./long-running-backup.sh myorg /backup/path - run_backup_with_refresh + --output-directory /tmp/backup -**2. Segmented Backup Strategy** +Or using environment variables for security:: -Break large backups into smaller chunks that complete within the token lifetime:: - - #!/bin/bash - # segmented-backup.sh - set -e - - ORGANIZATION="$1" - BACKUP_DIR="$2" - - # Generate fresh token for each segment - generate_token() { - python3 /path/to/generate-github-token.py - } + export GITHUB_APP_ID=123456 + export GITHUB_INSTALLATION_ID=789012 + export GITHUB_PRIVATE_KEY=/path/to/your-app.pem - # Backup repositories only (usually the longest part) - echo "=== Backing up repositories ===" - GITHUB_APP_TOKEN=$(generate_token) - github-backup "$ORGANIZATION" \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ + github-backup YOUR_ORG \ + --app-id $GITHUB_APP_ID \ + --installation-id $GITHUB_INSTALLATION_ID \ + --private-key $GITHUB_PRIVATE_KEY \ --organization \ - --output-directory "$BACKUP_DIR" \ - --incremental \ - --private \ --repositories \ - --wikis - - # Backup issues and pull requests - echo "=== Backing up issues and pulls ===" - GITHUB_APP_TOKEN=$(generate_token) - github-backup "$ORGANIZATION" \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ - --organization \ - --output-directory "$BACKUP_DIR" \ - --incremental \ - --issues \ - --pulls \ - --issue-comments \ - --pull-comments - - # Backup metadata - echo "=== Backing up metadata ===" - GITHUB_APP_TOKEN=$(generate_token) - github-backup "$ORGANIZATION" \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ - --organization \ - --output-directory "$BACKUP_DIR" \ - --incremental \ - --labels \ - --milestones - - echo "Segmented backup completed" + --all -**3. Error-Resilient Incremental Strategy** - -Use incremental backups with error handling to resume from failures:: - - #!/bin/bash - # resilient-backup.sh - set -e - - ORGANIZATION="$1" - BACKUP_DIR="$2" - MAX_RETRIES=3 - - run_backup_with_retry() { - local attempt=1 - - while [ $attempt -le $MAX_RETRIES ]; do - echo "Backup attempt $attempt of $MAX_RETRIES" - - # Generate fresh token for each attempt - GITHUB_APP_TOKEN=$(python3 /path/to/generate-github-token.py) - - if github-backup "$ORGANIZATION" \ - --token "$GITHUB_APP_TOKEN" \ - --as-app \ - --organization \ - --output-directory "$BACKUP_DIR" \ - --incremental \ - --private \ - --repositories \ - --wikis \ - --issues \ - --pulls \ - --issue-comments \ - --pull-comments \ - --labels \ - --milestones \ - --log-level info; then - echo "Backup completed successfully on attempt $attempt" - return 0 - else - echo "Backup failed on attempt $attempt" - if [ $attempt -eq $MAX_RETRIES ]; then - echo "All retry attempts exhausted" - return 1 - fi - attempt=$((attempt + 1)) - echo "Waiting 2 minutes before retry..." - sleep 120 - fi - done - } - - run_backup_with_retry +**Key Benefits**: -**Recommended Approach for Production** +* **Automatic token management**: The tool automatically generates and refreshes installation access tokens as needed +* **No manual token handling**: No need for external scripts or cron job token generation +* **Handles long backups**: Token expiry is automatically handled during multi-hour organization backups +* **Docker-friendly**: Simple to use in containerized environments with mounted private key files -For automated nightly backups, the **segmented backup strategy** is recommended because: - -1. **Predictable timing**: Each segment completes well within 1 hour -2. **Clear progress**: You can see which parts completed successfully -3. **Efficient recovery**: If one segment fails, you don't need to restart everything -4. **Resource friendly**: Uses incremental backups to minimize repeated work - -**Monitoring Token Expiry** - -To detect token expiry issues in your logs, watch for these error patterns:: - - # In your backup logs, look for: - grep -i "401\|unauthorized\|token.*expired\|authentication.*failed" /var/log/github-backup.log - -Set up alerting on these patterns to get notified when token refresh is needed. - - -Prefer SSH -~~~~~~~~~~ - -If cloning repos is enabled with ``--repositories``, ``--all-starred``, ``--wikis``, ``--gists``, ``--starred-gists`` using the ``--prefer-ssh`` argument will use ssh for cloning the git repos, but all other connections will still use their own protocol, e.g. API requests for issues uses HTTPS. - -To clone with SSH, you'll need SSH authentication setup `as usual with Github `_, e.g. via SSH public and private keys. - - -Using the Keychain on Mac OSX -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Note: On Mac OSX the token can be stored securely in the user's keychain. To do this: - -1. Open Keychain from "Applications -> Utilities -> Keychain Access" -2. Add a new password item using "File -> New Password Item" -3. Enter a name in the "Keychain Item Name" box. You must provide this name to github-backup using the --keychain-name argument. -4. Enter an account name in the "Account Name" box, enter your Github username as set above. You must provide this name to github-backup using the --keychain-account argument. -5. Enter your Github personal access token in the "Password" box - -Note: When you run github-backup, you will be asked whether you want to allow "security" to use your confidential information stored in your keychain. You have two options: - -1. **Allow:** In this case you will need to click "Allow" each time you run `github-backup` -2. **Always Allow:** In this case, you will not be asked for permission when you run `github-backup` in future. This is less secure, but is required if you want to schedule `github-backup` to run automatically - - -Github Rate-limit and Throttling --------------------------------- - -"github-backup" will automatically throttle itself based on feedback from the Github API. - -Their API is usually rate-limited to 5000 calls per hour. The API will ask github-backup to pause until a specific time when the limit is reset again (at the start of the next hour). This continues until the backup is complete. - -During a large backup, such as ``--all-starred``, and on a fast connection this can result in (~20 min) pauses with bursts of API calls periodically maxing out the API limit. If this is not suitable `it has been observed `_ under real-world conditions that overriding the throttle with ``--throttle-limit 5000 --throttle-pause 0.6`` provides a smooth rate across the hour, although a ``--throttle-pause 0.72`` (3600 seconds [1 hour] / 5000 limit) is theoretically safer to prevent large rate-limit pauses. - - -About Git LFS -------------- - -When you use the ``--lfs`` option, you will need to make sure you have Git LFS installed. - -Instructions on how to do this can be found on https://git-lfs.github.com. - - -Run in Docker container ------------------------ - -To run the tool in a Docker container use the following command: - - sudo docker run --rm -v /path/to/backup:/data --name github-backup ghcr.io/josegonzalez/python-github-backup -o /data $OPTIONS $USER - -Gotchas / Known-issues -====================== - -All is not everything ---------------------- - -The ``--all`` argument does not include: cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. - -Cloning all starred size ------------------------- - -Using the ``--all-starred`` argument to clone all starred repositories may use a large amount of storage space, especially if ``--all`` or more arguments are used. e.g. commonly starred repos can have tens of thousands of issues, many large assets and the repo itself etc. Consider just storing links to starred repos in JSON format with ``--starred``. - -Incremental Backup ------------------- - -Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run. - -This means any blocking errors on previous runs can cause a large amount of missing data in backups. - -Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. - -Still saver than the previous version. - -Specifically, issues and pull requests are handled like this. - -Known blocking errors ---------------------- - -Some errors will block the backup run by exiting the script. e.g. receiving a 403 Forbidden error from the Github API. - -If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data. - -It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complimented with periodic full non-incremental runs, to avoid unexpected missing data in a regular backup runs. - -1. **Starred public repo hooks blocking** - - Since the ``--all`` argument includes ``--hooks``, if you use ``--all`` and ``--all-starred`` together to clone a users starred public repositories, the backup will likely error and block the backup continuing. - - This is due to needing the correct permission for ``--hooks`` on public repos. - - -"bare" is actually "mirror" ---------------------------- - -Using the bare clone argument (``--bare``) will actually call git's ``clone --mirror`` command. There's a subtle difference between `bare `_ and `mirror `_ clone. - -*From git docs "Compared to --bare, --mirror not only maps local branches of the source to local branches of the target, it maps all refs (including remote-tracking branches, notes etc.) and sets up a refspec configuration such that all these refs are overwritten by a git remote update in the target repository."* - - -Starred gists vs starred repo behaviour ---------------------------------------- - -The starred normal repo cloning (``--all-starred``) argument stores starred repos separately to the users own repositories. However, using ``--starred-gists`` will store starred gists within the same directory as the users own gists ``--gists``. Also, all gist repo directory names are IDs not the gist's name. - - -Skip existing on incomplete backups ------------------------------------ - -The ``--skip-existing`` argument will skip a backup if the directory already exists, even if the backup in that directory failed (perhaps due to a blocking error). This may result in unexpected missing data in a regular backup. +**For automated/cron backups**, simply set up the same command in your cron job:: + # Daily backup at 2 AM + 0 2 * * * github-backup YOUR_ORG --app-id $GITHUB_APP_ID --installation-id $GITHUB_INSTALLATION_ID --private-key $GITHUB_PRIVATE_KEY --organization --repositories --output-directory /backup/github Github Backup Examples ====================== @@ -769,87 +308,60 @@ Debug an error/block or incomplete backup into a temporary directory. Omit "incr github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER + GitHub App Organization Backup Examples ======================================== Backup entire organization using GitHub App (recommended for organizations):: - export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx # Installation access token - ORGANIZATION=mycompany - - github-backup $ORGANIZATION \ - --token $GITHUB_APP_TOKEN \ - --as-app \ + github-backup mycompany \ + --app-id 123456 \ + --installation-id 789012 \ + --private-key /path/to/app-private-key.pem \ --organization \ - --output-directory /backup/github-org \ - --all \ - --private \ --repositories \ - --wikis \ --issues \ --pulls \ - --issue-comments \ - --pull-comments \ - --labels \ - --milestones + --wikis \ + --output-directory /backup/github-org Incremental organization backup with GitHub App for automated/cron scenarios:: - export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx - ORGANIZATION=mycompany - - github-backup $ORGANIZATION \ - --token $GITHUB_APP_TOKEN \ - --as-app \ + github-backup mycompany \ + --app-id 123456 \ + --installation-id 789012 \ + --private-key /path/to/app-private-key.pem \ --organization \ - --output-directory /backup/github-org \ + --repositories \ --incremental \ - --private \ + --output-directory /backup/github-org + +Backup specific organization repository with comprehensive data using GitHub App:: + + github-backup mycompany \ + --app-id 123456 \ + --installation-id 789012 \ + --private-key /path/to/app-private-key.pem \ + --organization \ + --repository main-project \ --repositories \ - --wikis \ --issues \ --pulls \ + --wikis \ --issue-comments \ --pull-comments \ - --labels \ - --milestones \ - --log-level error - -Backup specific organization repository with comprehensive data using GitHub App:: - - export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx - ORGANIZATION=mycompany - REPO=main-project - - github-backup $ORGANIZATION \ - --token $GITHUB_APP_TOKEN \ - --as-app \ - --organization \ - --repository $REPO \ - --output-directory /backup/specific-repo \ - --all \ - --private \ - --pull-details \ - --releases \ - --assets + --output-directory /backup/github-repo Organization backup excluding certain repositories:: - export GITHUB_APP_TOKEN=ghs_xxxxxxxxxxxxxxxxxxxx - ORGANIZATION=mycompany - - github-backup $ORGANIZATION \ - --token $GITHUB_APP_TOKEN \ - --as-app \ + github-backup mycompany \ + --app-id 123456 \ + --installation-id 789012 \ + --private-key /path/to/app-private-key.pem \ --organization \ - --output-directory /backup/github-org \ - --all \ - --private \ + --repositories \ --exclude repo-to-skip another-repo-to-skip \ - --throttle-limit 4500 \ - --throttle-pause 0.8 - - + --output-directory /backup/github-org Development =========== diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 934db740..fbe78af5 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -27,7 +27,13 @@ from urllib.request import HTTPRedirectHandler, Request, build_opener, urlopen # GitHub App authentication imports -import jwt +try: + import jwt +except ImportError: + raise ImportError( + "PyJWT library is required for GitHub App authentication. " + "Install it with: pip install PyJWT>=2.0.0 cryptography>=3.0.0" + ) try: from . import __version__ diff --git a/requirements.txt b/requirements.txt index 8b137891..7d277623 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ - +PyJWT>=2.0.0 +cryptography>=3.0.0 From 39486ef9810b7eb1e6ca8dfdfb1cc66e3eeeb8c6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:58:55 +0000 Subject: [PATCH 7/8] Complete GitHub App integration: simplified approach with automatic token management Co-authored-by: schlomo <101384+schlomo@users.noreply.github.com> --- github_backup/github_backup.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index fbe78af5..21674638 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -867,33 +867,6 @@ def _get_response(request, auth, template, args=None): except HTTPError as exc: errors, should_continue = _request_http_error(exc, auth, errors, args) # noqa r = exc - - # If token was refreshed, we need to reconstruct the request with new auth - if should_continue and args and _github_app_credentials: - new_auth = get_auth(args, encode=not args.as_app) - if new_auth != auth: - # Extract the original URL from the request - original_url = request.get_full_url() - - # Parse URL to get query parameters - parsed_url = urlparse(original_url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" - - # Reconstruct the request with new auth - request = Request(original_url) - - # Set the new authorization header - if not args.as_app: - request.add_header("Authorization", f"Basic {new_auth.decode('ascii')}") - else: - if args.token_fine: - request.add_header("Authorization", f"token {new_auth}") - else: - request.add_header("Authorization", f"token {new_auth}") - - request.add_header("User-Agent", f"github-backup/{VERSION}") - auth = new_auth # Update local auth variable - except URLError as e: logger.warning(e.reason) should_continue, retry_timeout = _request_url_error(template, retry_timeout) From 495177918f1c4839dbcf65eca7b2d8bc12b50437 Mon Sep 17 00:00:00 2001 From: Schlomo Schapiro Date: Tue, 2 Sep 2025 06:32:52 +0200 Subject: [PATCH 8/8] Fixes - Reading private key files completely (not just the first line) - Supporting both `file://` prefixed paths and direct file paths - Adding proper error handling for file reading operations - Automatic token regeneration when needed - Proper timezone handling for token expiration comparison - Fixed parameter order bug in `_construct_request()` that caused pagination to start at page 100 - Fixed `b"..."` string issue in `logging_subprocess()` by properly decoding bytes to strings --- activate_dev.sh | 37 +++++++++++++++++++++++++++++++++ github_backup/github_backup.py | 38 +++++++++++++++++++++++----------- 2 files changed, 63 insertions(+), 12 deletions(-) create mode 100755 activate_dev.sh diff --git a/activate_dev.sh b/activate_dev.sh new file mode 100755 index 00000000..31b8c6f8 --- /dev/null +++ b/activate_dev.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Development Environment Activation Script +# This script activates the virtual environment and sets up the development environment + +echo "🐍 Activating python-github-backup development environment..." + +# Check if virtual environment exists +if [ ! -d "venv" ]; then + echo "❌ Virtual environment not found. Please run the setup first:" + echo " python3 -m venv venv" + echo " source venv/bin/activate" + echo " pip install -r requirements.txt" + echo " pip install -r release-requirements.txt" + echo " pip install -e ." + exit 1 +fi + +# Activate virtual environment +echo "✅ Activating virtual environment..." +source venv/bin/activate + +# Check if package is installed +if ! python -c "import github_backup" 2>/dev/null; then + echo "❌ Package not installed in development mode. Installing..." + pip install -e . +fi + +echo "✅ Development environment ready!" +echo "" +echo "Available commands:" +echo " github-backup -h # Show help" +echo " flake8 --ignore=E501 github_backup/ # Run linting" +echo " black --check github_backup/ # Check code formatting" +echo " black github_backup/ # Format code" +echo "" +echo "To deactivate: deactivate" diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 21674638..886bfd24 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -96,7 +96,9 @@ def check_io(): if not logger: continue if not (io == child.stderr and not line): - logger.log(log_level[io], line[:-1]) + # Decode bytes to string for proper logging + line_str = line.decode('utf-8', errors='replace').rstrip('\n') + logger.log(log_level[io], line_str) # keep checking stdout/stderr until the child exits while child.poll() is None: @@ -526,8 +528,9 @@ def get_auth(args, encode=True, for_git_cli=False): logger.warning("GitHub App credentials provided but --as-app not specified. Enabling app authentication.") args.as_app = True - # Store credentials globally for token refresh - _github_app_credentials = (args.app_id, args.installation_id, args.private_key) + # Store credentials globally for token refresh (only if not already set) + if not _github_app_credentials: + _github_app_credentials = (args.app_id, args.installation_id, args.private_key) # Get fresh token token = get_or_refresh_github_app_token() @@ -623,6 +626,10 @@ def generate_github_app_token(app_id, installation_id, private_key): # Load private key if private_key.startswith(FILE_URI_PREFIX): private_key = read_file_contents(private_key) + elif os.path.exists(private_key): + # If it's a file path, convert to file:// format + file_uri = f"{FILE_URI_PREFIX}{private_key}" + private_key = read_file_contents(file_uri) # Create JWT payload now = int(time.time()) @@ -631,7 +638,6 @@ def generate_github_app_token(app_id, installation_id, private_key): "exp": now + 600, # Expires in 10 minutes (max allowed) "iss": int(app_id) # Issuer (GitHub App ID) } - # Generate JWT jwt_token = jwt.encode(payload, private_key, algorithm="RS256") @@ -668,16 +674,24 @@ def get_or_refresh_github_app_token(): app_id, installation_id, private_key = _github_app_credentials - # Check if we need a new token (5 minutes buffer before expiry) - now = datetime.now().replace(tzinfo=None) + # Simple approach: Check if token exists and is not expired (with 5-minute buffer) + # Convert both times to UTC for comparison (GitHub API returns UTC times) + now_utc = datetime.utcnow() + expires_utc = _github_app_token_expires.replace(tzinfo=None) if _github_app_token_expires else None + + # Generate new token if: + # 1. No token exists + # 2. Token is expired or will expire within 5 minutes if (_github_app_token is None or - _github_app_token_expires is None or - now >= (_github_app_token_expires.replace(tzinfo=None) - timedelta(minutes=5))): + expires_utc is None or + now_utc >= (expires_utc - timedelta(minutes=5))): logger.info("Generating new GitHub App token...") _github_app_token, _github_app_token_expires = generate_github_app_token( app_id, installation_id, private_key ) + else: + logger.debug(f"Using cached token, expires at: {_github_app_token_expires}") return _github_app_token @@ -701,7 +715,7 @@ def get_github_host(args): def read_file_contents(file_uri): - return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() + return open(file_uri[len(FILE_URI_PREFIX) :], "rt").read() def get_github_repo_url(args, repository): @@ -748,13 +762,13 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False): else: page = page + 1 request_page, request_per_page = page, per_page - - # Get fresh auth on each request to handle token refresh + + # Always get fresh auth before each API call - caching handles optimization auth = get_auth(args, encode=not args.as_app) request = _construct_request( - request_page, request_per_page, + request_page, query_args, template, auth,