In [6]:
# Random Python Repository Sampler using GHArchive
#
# This notebook implements a method to get random Python repositories from GitHub
# by sampling random hours from GHArchive data over the past several years.

import os
import io
import json
import gzip
import random
import requests
import pandas as pd
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import time

# Configuration parameters
n_samples = 10  # Number of random GHArchive hourly dumps to download and sample
years_back = 10  # Width of the sampling window, in years counted back from now
n_repos_per_hour = 10  # Maximum number of distinct repositories kept per hourly dump

# Scratch directory for downloaded dumps (created if missing); the main loop
# below removes each dump again once it has been sampled
os.makedirs("temp", exist_ok=True)

# Function to pick a random hour over the last Y years
def get_random_hour(years_back=10):
    """
    Pick a random, already completed UTC hour from the last `years_back` years.

    GHArchive publishes one dump per UTC hour, so the window is computed in
    UTC rather than local time. The draw is made over whole hours (not over a
    day and an hour independently), which guarantees the result is never an
    hour that has not happened yet.

    Args:
        years_back: Number of years (counted as 365 days each) to look back

    Returns:
        Tuple of (date_str, hour): date_str is "YYYY-MM-DD" and hour is an
        int in 0-23, both expressed in UTC

    Raises:
        ValueError: If years_back is negative
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    current_hour = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    # The hour in progress has no complete dump yet, so stop one hour earlier.
    latest = current_hour - timedelta(hours=1)
    earliest = latest - timedelta(days=365 * years_back)

    span_hours = int((latest - earliest).total_seconds() // 3600)
    chosen = earliest + timedelta(hours=random.randint(0, span_hours))

    return chosen.strftime("%Y-%m-%d"), chosen.hour

# Function to get GHArchive URL for a specific date and hour
def get_gharchive_url(date_str, hour):
    """Build the download URL of the GHArchive dump for one date and hour.

    The hour is inserted as given (no zero padding), e.g. "2022-02-23-8".
    """
    archive_name = "{}-{}.json.gz".format(date_str, hour)
    return "https://data.gharchive.org/" + archive_name

# Function to download a GHArchive file
def download_gharchive_file(url, save_path, timeout=60):
    """
    Stream a GHArchive file to disk.

    Args:
        url: URL to download
        save_path: Where to save the file
        timeout: Seconds to wait for the connection and for each read before
            giving up, so a stalled server cannot hang the notebook

    Returns:
        Path to the downloaded file, or None if the download failed. On
        failure any partially written file at save_path is removed.
    """
    try:
        # The context manager releases the streamed connection even when
        # writing to disk fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        return save_path
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")

        # Do not leave a truncated archive behind for a later step to trip over.
        try:
            os.remove(save_path)
        except OSError:
            # Nothing was written, or the file is already gone.
            pass

        return None

# Function to sample repositories from a GHArchive file
def sample_repos_from_file(file_path, n_repos=10):
    """
    Sample random repositories from a GHArchive file

    Every event in the file is scanned and each distinct repository is
    recorded once (using the first event that mentions it). The sample is then
    drawn uniformly from all distinct repositories, rather than taking the
    first ones encountered, which would only reflect the first moments of the
    hour.

    Args:
        file_path: Path to the gzip-compressed, one-JSON-event-per-line file
        n_repos: Maximum number of repositories to return

    Returns:
        List of dictionaries with keys 'name', 'url', 'event_type' and
        'timestamp'. Holds every distinct repository (in file order) when
        the file has at most n_repos of them, otherwise a random subset of
        size n_repos. Empty if n_repos is not positive or the file cannot
        be read.
    """
    if n_repos <= 0:
        return []

    # Keyed by repository name; dicts keep insertion order, so file order
    # is preserved when no down-sampling is needed.
    unique_repos = {}

    try:
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            for line in f:
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed lines instead of abandoning the file.
                    continue

                # A line may be valid JSON without being an event object.
                repo = event.get('repo') if isinstance(event, dict) else None
                if not isinstance(repo, dict):
                    continue

                repo_name = repo.get('name')
                if not isinstance(repo_name, str) or repo_name in unique_repos:
                    continue

                unique_repos[repo_name] = {
                    'name': repo_name,
                    'url': f"https://github.com/{repo_name}",
                    'event_type': event.get('type', 'Unknown'),
                    'timestamp': event.get('created_at', '')
                }
    except Exception as e:
        # Best effort: a truncated or corrupt archive still yields whatever
        # was read before the failure.
        print(f"Error processing file {file_path}: {e}")

    repos = list(unique_repos.values())

    # If we have more repos than needed, take a random sample
    if len(repos) > n_repos:
        repos = random.sample(repos, n_repos)

    return repos

# Function to check if a repository uses Python
def check_repo_language(repo_name, token=None, timeout=30):
    """
    Look up a repository on the GitHub API and report its main language

    Args:
        repo_name: Repository name (owner/repo)
        token: Optional GitHub API token. When None, the GITHUB_TOKEN
            environment variable is used if it is set; with no token the
            request is unauthenticated and subject to the much lower
            anonymous rate limit.
        timeout: Seconds to wait for the API before giving up

    Returns:
        Dictionary with keys 'name', 'url', 'language', 'stars', 'forks',
        'created_at', 'updated_at' and 'is_python', or None if the
        repository was not found or the request failed
    """
    if token is None:
        token = os.environ.get("GITHUB_TOKEN", "")

    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    api_url = f"https://api.github.com/repos/{repo_name}"
    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Exception checking {repo_name}: {e}")
        return None

    if response.status_code == 404:
        # Typically a repository that was deleted, renamed or made private
        # after the archived event was recorded.
        print(f"Repository {repo_name} not found")
        return None

    if response.status_code != 200:
        print(f"Error checking {repo_name}: {response.status_code}")
        return None

    try:
        repo_data = response.json()
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        print(f"Exception checking {repo_name}: {e}")
        return None

    language = repo_data.get('language')
    return {
        'name': repo_name,
        'url': f"https://github.com/{repo_name}",
        'language': language,
        'stars': repo_data.get('stargazers_count', 0),
        'forks': repo_data.get('forks_count', 0),
        'created_at': repo_data.get('created_at', ''),
        'updated_at': repo_data.get('updated_at', ''),
        'is_python': language == 'Python'
    }

# Main execution: sample random GHArchive hours, collect repositories from each
# dump, look up every repository's language, and export the Python ones.
print(f"Sampling {n_samples} random hours from GHArchive over the past {years_back} years")
print(f"Will select up to {n_repos_per_hour} repositories per hour")

# Store all sampled repositories
all_repos = []

# Sample multiple random hours
for i in range(n_samples):
    # Get random date and hour
    date_str, hour = get_random_hour(years_back)
    print(f"\nSample {i+1}/{n_samples}: {date_str}, hour {hour}")

    # Construct URL
    url = get_gharchive_url(date_str, hour)
    print(f"URL: {url}")

    # Create filename for the downloaded file
    filename = f"gharchive_{date_str}_{hour}.json.gz"
    file_path = os.path.join("temp", filename)

    # Download the file
    print("Downloading file...")
    downloaded_file = download_gharchive_file(url, file_path)

    if downloaded_file:
        # Sample repositories from the file
        print("Sampling repositories...")
        repos = sample_repos_from_file(downloaded_file, n_repos_per_hour)
        print(f"Found {len(repos)} repositories")

        # Add sample info to each repo
        for repo in repos:
            repo['sample_date'] = date_str
            repo['sample_hour'] = hour

        all_repos.extend(repos)

        # Delete the file to save space
        os.remove(downloaded_file)
        print("Deleted temporary file")
    else:
        print(f"Failed to download file for {date_str}, hour {hour}")

print(f"\nSampled a total of {len(all_repos)} repositories")

# Convert to DataFrame
repos_df = pd.DataFrame(all_repos)
print(f"Created DataFrame with shape {repos_df.shape}")

# Check language for each repository
print("\nChecking language for each repository...")
language_info = []

for i, repo in enumerate(tqdm(repos_df.itertuples(), total=len(repos_df))):
    # Pause after every 10 lookups to ease pressure on the API rate limit
    if i > 0 and i % 10 == 0:
        time.sleep(2)

    info = check_repo_language(repo.name)
    if info:
        language_info.append(info)

# Convert language info to DataFrame
language_df = pd.DataFrame(language_info)
print(f"Got language information for {len(language_df)} repositories")

# Filter for Python repositories. When every lookup failed (for example
# because of rate limiting) language_df has no columns at all, so build an
# empty frame with the expected columns instead of indexing a missing one.
if language_df.empty:
    python_repos = pd.DataFrame(columns=[
        'name', 'url', 'language', 'stars', 'forks',
        'created_at', 'updated_at', 'is_python',
    ])
else:
    python_repos = language_df[language_df['is_python']].copy()
print(f"Found {len(python_repos)} Python repositories")

# Save to JSONL (one JSON object per line)
output_file = "python_repos_from_gharchive.jsonl"
with open(output_file, 'w') as f:
    for _, repo in python_repos.iterrows():
        f.write(json.dumps(repo.to_dict()) + '\n')

print(f"\nSaved {len(python_repos)} Python repositories to {output_file}")

# Display sample of Python repositories
print("\nSample of Python repositories found:")
display(python_repos[['name', 'stars', 'language', 'url']].head(10))

# Optional: Create a CSV version too
python_repos.to_csv("python_repos_from_gharchive.csv", index=False)
print("Also saved results to python_repos_from_gharchive.csv")

Sampling 10 random hours from GHArchive over the past 10 years
Will select up to 10 repositories per hour

Sample 1/10: 2018-09-05, hour 11
URL: https://data.gharchive.org/2018-09-05-11.json.gz
Downloading file...
Sampling repositories...
Found 10 repositories
Deleted temporary file

Sample 2/10: 2022-02-23, hour 8
URL: https://data.gharchive.org/2022-02-23-8.json.gz
Downloading file...
Sampling repositories...
Found 10 repositories
Deleted temporary file

Sample 3/10: 2025-02-02, hour 2
URL: https://data.gharchive.org/2025-02-02-2.json.gz
Downloading file...
Sampling repositories...
Found 10 repositories
Deleted temporary file

Sample 4/10: 2020-07-22, hour 2
URL: https://data.gharchive.org/2020-07-22-2.json.gz
Downloading file...
Sampling repositories...
Found 10 repositories
Deleted temporary file

Sample 5/10: 2023-02-19, hour 9
URL: https://data.gharchive.org/2023-02-19-9.json.gz
Downloading file...
Sampling repositories...
Found 10 repositories
Deleted temporary file

Sample 6/10

  0%|          | 0/100 [00:00<?, ?it/s]

Repository macjohnny/openapi-generator-1 not found
Repository Dborkowski/SecondHandCars4You not found
Repository Logicye/CyberXSecurity-Project-1 not found
Repository ukyohpq/PythonMLB not found
Repository NeofetchNpc/update-repo not found
Error checking Micraow/AutoApiSecret: 403
Repository MatthewTe/velkoz-data-warehouse-application not found
Repository ratnasute/ratnasute.github.io not found
Repository robced/onibot not found
Repository 01PhiHung01/LAB2_CNTT2 not found
Repository zsxs2/zs not found
Repository Madogiwa0124/dogfeeds not found
Repository Eduardo-Gonz/todo-app not found
Repository dmnapolitano/tate_lang not found
Repository yuhaocan/Flutter_Clock not found
Repository adsviewer/turboviewer not found
Repository daisuke721/yasai_pocket not found
Repository zootyducky/KU2024_CGIP_FABRIK_WEB_DEMO not found
Repository atomist-test-web/project_hwnxspjlsrl0m not found
Repository chronos280/Fork-me not found
Repository sust-code/json-pretty-printer-flta not found
Repository obry

Unnamed: 0,name,stars,language,url
0,spreaker/prometheus-pgbouncer-exporter,107,Python,https://github.com/spreaker/prometheus-pgbounc...
8,thp/urlwatch,2891,Python,https://github.com/thp/urlwatch
24,ZhouZ-1/github-stats,3,Python,https://github.com/ZhouZ-1/github-stats
48,MatthewNavy/MinimaxAB,0,Python,https://github.com/MatthewNavy/MinimaxAB


Also saved results to python_repos_from_gharchive.csv


In [7]:
# Random Python Repository Sampling via GitHub API
#
# This notebook implements a method to get random Python repositories from GitHub
# by sampling random hours across the past decade.

import requests
import random
import json
import time
import os
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import pandas as pd
from IPython.display import display, HTML

# Configuration parameters
N_SAMPLES = 10  # Number of random hours to sample
REPOS_PER_HOUR = 10  # Number of repositories to fetch per hour
YEARS_BACK = 10  # How many years back to sample from
OUTPUT_FILE = "python_repos_sample.jsonl"  # Where to save the results

# Optional: GitHub API token (set this to avoid rate limits)
# Export GITHUB_TOKEN as an environment variable before starting the notebook.
# Nothing in this notebook loads a .env file. When the variable is not set the
# token is empty and requests are sent unauthenticated.
github_token = os.environ.get("GITHUB_TOKEN", "")

# Function to generate a random date and hour from the past X years
def get_random_date_hour(years_back=10):
    """
    Generate a random, already completed UTC hour from the past X years

    The result is a naive datetime whose fields are in UTC, truncated to the
    start of the hour. It is kept naive because the search helper appends a
    literal "Z" to its ISO representation. Sampling is done over whole hours
    so the result is never in the future.

    Args:
        years_back: Number of years (counted as 365 days each) to look back

    Returns:
        Naive datetime (UTC) at minute 0, second 0, no later than the start
        of the last completed hour

    Raises:
        ValueError: If years_back is negative
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    # Exclude the hour still in progress.
    latest = utc_now.replace(minute=0, second=0, microsecond=0) - timedelta(hours=1)
    earliest = latest - timedelta(days=365 * years_back)

    span_hours = int((latest - earliest).total_seconds() // 3600)
    return earliest + timedelta(hours=random.randint(0, span_hours))

# Function to get repositories updated in a specific hour
def get_repos_from_hour(date_hour, language="Python", per_page=10, token="",
                        timeout=30, max_retries=3):
    """
    Get repositories whose last push falls within a specific hour

    The GitHub Search API is queried with a `pushed:` range covering the hour
    starting at date_hour, sorted by most recently updated.

    Args:
        date_hour: Datetime marking the start of the hour to sample. A naive
            value is taken to be UTC; an aware value is converted to UTC.
        language: Programming language to filter for
        per_page: Maximum number of repositories to return
        token: GitHub API token (optional)
        timeout: Seconds to wait for the API before giving up on a request
        max_retries: How many times to wait out a rate limit and retry

    Returns:
        List of repository dictionaries (empty on error or when nothing matched)
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    # Normalise to naive UTC so the "Z" suffix below is always truthful.
    if date_hour.tzinfo is not None:
        date_hour = date_hour.astimezone(timezone.utc).replace(tzinfo=None)

    # Format timestamps for GitHub Search API
    start_time = date_hour.isoformat() + "Z"  # GitHub needs the Z suffix for UTC
    end_time = (date_hour + timedelta(hours=1)).isoformat() + "Z"

    # Construct GitHub Search API query
    query = f"language:{language} pushed:{start_time}..{end_time}"

    # Prepare request headers
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    params = {
        "q": query,
        "sort": "updated",
        "order": "desc",
        "per_page": per_page
    }

    retries_left = max_retries
    while True:
        # Reset on every attempt: requests.get itself can raise before any
        # response object exists (DNS failure, refused connection, timeout).
        response = None
        try:
            response = requests.get(
                "https://api.github.com/search/repositories",
                params=params,
                headers=headers,
                timeout=timeout
            )
            response.raise_for_status()  # Raise exception for 4XX/5XX responses
            data = response.json()
            break
        except requests.exceptions.RequestException as e:
            print(f"Error querying GitHub API: {e}")

            wait_time = _rate_limit_wait_seconds(response)
            if wait_time is None:
                return []

            print(f"Rate limit exceeded. Resets in {wait_time} seconds")

            # Only wait when the delay is reasonable (5 minutes max) and the
            # retry budget is not used up; a loop keeps retries bounded.
            if wait_time >= 300 or retries_left <= 0:
                return []

            print(f"Waiting {wait_time} seconds to retry...")
            time.sleep(wait_time + 1)
            retries_left -= 1

    # Check if we got any results
    if "items" not in data:
        print(f"No results found for {start_time}")
        return []

    # Extract relevant fields to keep data size manageable
    return [
        {
            "name": repo.get("full_name"),
            "url": repo.get("html_url"),
            "description": repo.get("description"),
            "language": repo.get("language"),
            "stars": repo.get("stargazers_count", 0),
            "forks": repo.get("forks_count", 0),
            "created_at": repo.get("created_at"),
            "updated_at": repo.get("updated_at"),
            "sample_datetime": start_time
        }
        for repo in data["items"]
    ]


def _rate_limit_wait_seconds(response):
    """Return how long to wait if `response` is a rate-limit rejection, else None."""
    if response is None or response.status_code not in (403, 429):
        return None
    if "rate limit" not in response.text.lower():
        return None

    # Prefer an explicit Retry-After delay when the server sends one.
    retry_after = response.headers.get("Retry-After")
    if retry_after is not None and retry_after.isdigit():
        return int(retry_after)

    # Otherwise wait until the advertised reset time (epoch seconds).
    reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
    return max(reset_time - int(time.time()), 0)

# Check GitHub API rate limits
def check_rate_limits(token="", timeout=10):
    """
    Check the current GitHub Search API rate limit and print a summary

    Args:
        token: GitHub API token (optional)
        timeout: Seconds to wait for the API before giving up, so the probe
            cannot hang the notebook

    Returns:
        Number of search requests remaining, or 0 if the limit could not be
        determined for any reason
    """
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        response = requests.get(
            "https://api.github.com/rate_limit",
            headers=headers,
            timeout=timeout
        )

        if response.status_code != 200:
            print(f"Error checking rate limits: {response.status_code}")
            return 0

        search_limit = response.json().get("resources", {}).get("search", {})

        remaining = search_limit.get("remaining", 0)
        limit = search_limit.get("limit", 0)
        # The reset value is epoch seconds; shown in local time for the reader.
        reset_datetime = datetime.fromtimestamp(search_limit.get("reset", 0))

        print(f"GitHub API Search Rate Limit: {remaining}/{limit}")
        print(f"Rate limit resets at: {reset_datetime}")

        return remaining
    except Exception as e:
        # Deliberately broad: this is an advisory probe, and any failure is
        # reported as "no calls known to be available".
        print(f"Error checking rate limits: {e}")
        return 0

# Main function to sample repositories
def sample_random_python_repos(n_samples=10, repos_per_hour=10, years_back=10, token=""):
    """
    Collect Python repositories from a set of randomly chosen hours

    Prints the remaining search quota, the chosen hours in chronological
    order, and a per-hour result line while fetching.

    Args:
        n_samples: Number of random hours to sample
        repos_per_hour: Number of repositories to fetch per hour
        years_back: How many years back to sample from
        token: GitHub API token (optional)

    Returns:
        DataFrame with one row per repository returned across all hours
    """
    # Warn early when the search quota cannot cover the requested samples
    available = check_rate_limits(token)
    if available < n_samples:
        print(f"Warning: You only have {available} API calls remaining, but requesting {n_samples} samples")
        print("Consider using a GitHub token or reducing n_samples")

    # Draw the hours and order them chronologically for nicer output
    sampled_hours = sorted(get_random_date_hour(years_back) for _ in range(n_samples))

    print(f"\nSampling from {n_samples} random hours across {years_back} years:")
    for position, moment in enumerate(sampled_hours, start=1):
        print(f"  {position}. {moment.strftime('%Y-%m-%d %H:00')}")

    collected = []

    print(f"\nFetching up to {repos_per_hour} Python repositories from each hour:")
    for moment in tqdm(sampled_hours):
        label = moment.strftime("%Y-%m-%d %H:00")
        found = get_repos_from_hour(moment, "Python", repos_per_hour, token)

        if not found:
            print(f"No repositories found for {label}")
        else:
            print(f"Found {len(found)} Python repositories for {label}")
            collected.extend(found)

        # Space the requests out to stay clear of the rate limit
        time.sleep(1)

    return pd.DataFrame(collected)

# Kick off the sampling run using the notebook-level configuration
print("Starting random Python repository sampling...")
sampling_options = dict(
    n_samples=N_SAMPLES,
    repos_per_hour=REPOS_PER_HOUR,
    years_back=YEARS_BACK,
    token=github_token,
)
repos_df = sample_random_python_repos(**sampling_options)

# Report how many repositories came back overall
total_found = len(repos_df)
print(f"\nFound {total_found} Python repositories in total")

# Show a preview, then export; an empty result is only reported, never written
display(HTML("<h3>Sample of Python Repositories</h3>"))
if total_found == 0:
    print("No repositories found.")
else:
    preview_columns = ['name', 'stars', 'language', 'url']
    display(repos_df[preview_columns].head(10))

    # JSONL export: one record per line
    repos_df.to_json(OUTPUT_FILE, orient='records', lines=True)
    print(f"\nSaved {total_found} repositories to {OUTPUT_FILE}")

    # CSV copy next to it for easier viewing
    csv_file = OUTPUT_FILE.replace('.jsonl', '.csv')
    repos_df.to_csv(csv_file, index=False)
    print(f"Also saved to {csv_file}")

Starting random Python repository sampling...
GitHub API Search Rate Limit: 30/30
Rate limit resets at: 2025-03-24 14:49:28

Sampling from 10 random hours across 10 years:
  1. 2017-02-13 09:00
  2. 2019-05-02 19:00
  3. 2019-12-21 08:00
  4. 2020-07-17 04:00
  5. 2021-04-29 15:00
  6. 2021-10-04 01:00
  7. 2021-12-31 11:00
  8. 2024-09-14 17:00
  9. 2024-10-29 09:00
  10. 2025-03-24 04:00

Fetching up to 10 Python repositories from each hour:


  0%|          | 0/10 [00:00<?, ?it/s]

Found 10 Python repositories for 2017-02-13 09:00
Found 10 Python repositories for 2019-05-02 19:00
Found 10 Python repositories for 2019-12-21 08:00
Found 10 Python repositories for 2020-07-17 04:00
Found 10 Python repositories for 2021-04-29 15:00
Found 10 Python repositories for 2021-10-04 01:00
Found 10 Python repositories for 2021-12-31 11:00
Found 10 Python repositories for 2024-09-14 17:00
Found 10 Python repositories for 2024-10-29 09:00
Found 10 Python repositories for 2025-03-24 04:00

Found 100 Python repositories in total


Unnamed: 0,name,stars,language,url
0,hideki-saito/FSA,3,Python,https://github.com/hideki-saito/FSA
1,AKor03/test_task,0,Python,https://github.com/AKor03/test_task
2,ace12358/WordBreaker,0,Python,https://github.com/ace12358/WordBreaker
3,yhgnice/webonline,0,Python,https://github.com/yhgnice/webonline
4,freshpie/python,0,Python,https://github.com/freshpie/python
5,lawrenceleejr/PlottingTools,2,Python,https://github.com/lawrenceleejr/PlottingTools
6,gtomek123/deckrep,0,Python,https://github.com/gtomek123/deckrep
7,NotiflyDev/Python,0,Python,https://github.com/NotiflyDev/Python
8,runt18/poseidon,0,Python,https://github.com/runt18/poseidon
9,calvinYe/my_dj18study,0,Python,https://github.com/calvinYe/my_dj18study



Saved 100 repositories to python_repos_sample.jsonl
Also saved to python_repos_sample.csv
