**Crawl Repositories from GitHub using the GitHub API**

This notebook crawls the top 10k repositories from GitHub using the GitHub API. 
As the GitHub API only returns the first 1000 results of a search, we followed a naive approach to work around this limit: we split the search into ranges of star counts.
For each range of stars, we made sure the query never ran into the 1000-result limit.
We stopped when we reached 10k repositories.

In [None]:
import requests
import time
import os
import json
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from the .env file one directory above the
# current working directory.
load_dotenv(dotenv_path="../.env")

# GitHub repository search endpoint queried by the fetch functions below.
BASE_URL = "https://api.github.com/search/repositories"

# Personal access token read from the environment; os.getenv returns None
# when GITHUB_API_TOKEN is not set.
TOKEN = os.getenv("GITHUB_API_TOKEN")

# Set up headers with authentication token
# NOTE(review): if TOKEN is None the header value becomes the literal string
# "token None" — confirm the .env file is present before running.
headers = {'Authorization': f'token {TOKEN}'}

# Function to fetch repositories for a given star range
def fetch_repositories_by_stars(min_stars, max_stars):
    """Fetch repositories whose star count lies in ``min_stars..max_stars``.

    Pages through the GitHub repository search (sorted by stars, descending)
    and collects the raw result items.

    Args:
        min_stars: Lower bound of the star range used in the search query.
        max_stars: Upper bound of the star range used in the search query.

    Returns:
        A list of repository dicts as returned in the ``items`` field of the
        search response. On a non-200 response the error is printed and the
        repositories collected so far are returned.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            a request exceeds the timeout.
    """
    per_page = 100  # Maximum allowed per page
    # The search API serves at most the first 1000 results of a query; asking
    # for a page beyond that only produces an error response.
    max_search_results = 1000
    repositories = []
    page = 1

    while True:
        params = {
            'q': f'stars:{min_stars}..{max_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }

        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

        # Check for errors
        if response.status_code != 200:
            # Use the raw text: an error body is not guaranteed to be JSON.
            print(f"Error: {response.status_code}, {response.text}")
            break

        repos = response.json().get('items', [])
        if not repos:
            break  # Stop if no more repositories are returned

        repositories.extend(repos)
        print(f"Fetched {len(repositories)} repositories with {min_stars}..{max_stars} stars...")

        # A short page is the last page; the cap is the most the API will serve.
        if len(repos) < per_page or len(repositories) >= max_search_results:
            break

        page += 1

        # Respect GitHub's rate limits by sleeping for a short time
        time.sleep(2)

    return repositories

# Function to fetch the top repositories by splitting star ranges
def fetch_top_repositories(repo_limit):
    """Collect the most-starred repositories by querying successive star ranges.

    The ranges are walked from the highest star counts downwards and each one
    is fetched with ``fetch_repositories_by_stars``. Adjacent ranges share
    their boundary value, so a repository may be returned by two queries;
    results are therefore de-duplicated by repository ``id`` (first occurrence
    wins, order is preserved).

    Args:
        repo_limit: Maximum number of repositories to return.

    Returns:
        A list of at most ``repo_limit`` unique repository dicts.
    """
    star_ranges = [
        (400000, 500000),
        (300000, 400000),
        (200000, 300000),
        (100000, 200000),
        (90000, 100000),
        (80000, 90000),
        (70000, 80000),
        (60000, 70000),
        (50000, 60000),
        (40000, 50000),
        (30000, 40000),
        (25000, 30000),
        (20000, 25000),
        (15000, 20000),
        (12500, 15000),
        (12000, 12500),
        (11500, 12000),
        (11000, 11500),
        (10500, 11000),
        (10000, 10500),
        (9500, 10000),
        (9000, 9500),
        (8500, 9000),
        (8000, 8500),
        (7500, 8000),
        (7000, 7500),
        (6500, 7000),
        (6000, 6500),
        (5500, 6000),
        (5400, 5500),
        (5300, 5400),
        (5200, 5300),
        (5100, 5200),
        (5000, 5100),
        (4900, 5000),
        (4800, 4900),
        (4700, 4800),
        (4600, 4700),
        (4500, 4600),
        (4400, 4500),
        (4300, 4400),
        (4200, 4300),
        (4100, 4200),
        (4000, 4100),
        (3900, 4000),
        (3800, 3900),
        (3700, 3800),
        (3600, 3700),
        (3500, 3600),
        (3400, 3500),
        (3300, 3400),
        (3200, 3300),
        (3100, 3200),
        (3000, 3100),
    ]

    repositories = []
    seen_ids = set()

    for min_stars, max_stars in star_ranges:
        # Stop once enough unique repositories have been collected
        if len(repositories) >= repo_limit:
            break

        # Fetch repositories within the star range
        for repo in fetch_repositories_by_stars(min_stars, max_stars):
            repo_id = repo.get('id')
            # Entries without an id cannot be compared and are kept as-is.
            if repo_id is not None:
                if repo_id in seen_ids:
                    continue
                seen_ids.add(repo_id)
            repositories.append(repo)

    return repositories[:repo_limit]

# Number of repositories to collect in this run.
# NOTE(review): the notebook introduction mentions 10k repositories, but this
# run is capped at 2000 — confirm which is intended.
repo_limit = 2000
repositories = fetch_top_repositories(repo_limit)

In [11]:
from datetime import datetime, timedelta

# Only these fields of each search result are kept in the exported table.
ALLOWED_FIELDS = [
    "name", "full_name", "html_url", "description", "created_at", "updated_at",
    "size", "stargazers_count", "language", "topics", "default_branch", "archived"
]

# Project every raw repository dict onto the allowed fields (missing -> None).
filtered_repositories = [
    {field: repo.get(field, None) for field in ALLOWED_FIELDS}
    for repo in repositories
]

# Passing the columns explicitly keeps the frame well-formed (and the
# 'updated_at' column present) even when no repositories were fetched.
df_projects = pd.DataFrame(filtered_repositories, columns=ALLOWED_FIELDS)

# Parse the timestamps and drop the timezone; unparsable values become NaT.
# NOTE(review): GitHub timestamps carry a trailing "Z", so the resulting
# naive values are presumably UTC wall-clock times — confirm.
df_projects['updated_at'] = pd.to_datetime(df_projects['updated_at'], errors='coerce').dt.tz_localize(None)

# Cutoff expressed in naive UTC as well, so it is comparable with the column
# above regardless of the machine's local timezone.
thirty_days_ago = pd.Timestamp.now(tz="UTC").tz_localize(None) - timedelta(days=30)

# Flag repositories that were updated within the last 30 days.
df_projects['Updated_in_last_30_days'] = df_projects['updated_at'].apply(
    lambda x: 'yes' if pd.notnull(x) and x >= thirty_days_ago else 'no'
)

df_projects.to_csv("../data/projects.csv", index=False)

In [12]:
# Rule-based relevance filter applied to each project row
def exclude_repos(row):
    """Return 'y' for rows that look like substantial projects, 'n' otherwise.

    A row is rejected ('n') when any exclusion keyword occurs as a substring
    of its lower-cased name or description (topics are not inspected).
    Otherwise it is accepted ('y') only if its ``size`` is at least 1000.

    Args:
        row: Mapping-like object providing 'name', 'description' and 'size'.

    Returns:
        'y' or 'n'.
    """
    blocked_terms = (
        'awesome', 'list', 'tutorial', 'learn', 'book', 'guide', 'exercise',
        'course', 'interview', 'template', 'starter', 'demo', 'example',
        'algorithm', 'data-structure', "cheatsheet",
    )

    repo_name = str(row['name']).lower()
    repo_desc = str(row['description']).lower()
    repo_size = row['size']

    # Plain substring match against both text fields.
    if any(term in repo_name or term in repo_desc for term in blocked_terms):
        return 'n'

    # Positive signal: substantial codebase size
    return 'y' if repo_size >= 1000 else 'n'

# Apply real-world classification
# Runs exclude_repos on every row and stores its 'y'/'n' result in 'relevance'.
df_projects['relevance'] = df_projects.apply(exclude_repos, axis=1)

# Writes the table (now including 'relevance') to the same CSV path as before.
df_projects.to_csv("../data/projects.csv", index=False)


In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Load the project CSV file
df = pd.read_csv("projects.csv")

# Convert 'updated_at' to datetime and make timezone-naive
# (unparsable values become NaT)
df['updated_at'] = pd.to_datetime(df['updated_at'], errors='coerce').dt.tz_localize(None)

# Calculate threshold for last 3 months (90 days). The cutoff is taken in
# naive UTC rather than local time.
# NOTE(review): this assumes the stored timestamps are UTC wall-clock times,
# as GitHub reports them — confirm for this CSV.
three_months_ago = pd.Timestamp.now(tz="UTC").tz_localize(None) - timedelta(days=90)

# Relevance annotation: recently updated = relevant
df['relevance'] = df['updated_at'].apply(
    lambda x: 'relevant' if pd.notnull(x) and x >= three_months_ago else 'not relevant'
)

# Define rule-based classifier for real-world status
def classify_real_world_project(row):
    """Classify a project row as 'not real-world', 'real-world' or 'uncertain'.

    A row is 'not real-world' when any exclusion keyword occurs as a substring
    of its lower-cased name, description, or one of its topics. Otherwise it
    is 'real-world' if ``size`` is at least 1000, and 'uncertain' if not.

    Args:
        row: Mapping-like object providing 'name', 'description', 'topics'
            and 'size'. 'topics' is expected to be the string form of a list
            (as stored in a CSV cell) or a null value.

    Returns:
        One of 'not real-world', 'real-world', 'uncertain'.
    """
    # Local import so the cell stays self-contained.
    import ast

    name = str(row['name']).lower()
    desc = str(row['description']).lower()
    try:
        # literal_eval parses Python literals only. Unlike eval, it cannot
        # execute code embedded in the cell content.
        topics = [t.lower() for t in ast.literal_eval(row['topics'])] if pd.notnull(row['topics']) else []
    except (ValueError, SyntaxError, TypeError, AttributeError, RecursionError, MemoryError):
        # Best effort: anything that is not a parsable list of strings is
        # treated as "no topics".
        topics = []
    size_kb = row['size']

    # Exclusion keywords
    exclusion_keywords = [
        'awesome', 'list', 'tutorial', 'learn', 'book', 'guide', 'exercise',
        'course', 'interview', 'template', 'starter', 'demo', 'example',
        'algorithm', 'data-structure'
    ]

    # Exclude if keyword is in name, description, or topics
    for keyword in exclusion_keywords:
        if keyword in name or keyword in desc or any(keyword in topic for topic in topics):
            return 'not real-world'

    # Positive signal: substantial codebase size
    if size_kb >= 1000:
        return 'real-world'

    return 'uncertain'

# Apply real-world classification
# Row-wise: each row gets 'not real-world', 'real-world' or 'uncertain'.
df['real_world_status'] = df.apply(classify_real_world_project, axis=1)

# Save annotated file (path is relative to the current working directory)
df.to_csv("projects_annotated.csv", index=False)

In [None]:
import json

# Load the large JSON file
with open('../data/projects_raw.json', 'r') as f:
    data = json.load(f)

# Calculate split size (integer division; the remainder goes to the last file)
num_splits = 10
split_size = len(data) // num_splits

# Write each split to a new JSON file, numbered from 1
for i in range(num_splits):
    start = i * split_size
    # Ensure last split gets remaining items
    end = (i + 1) * split_size if i < num_splits - 1 else len(data)
    split_data = data[start:end]

    with open(f'../data/projects_raw_{i+1}.json', 'w') as f:
        json.dump(split_data, f, indent=2)

# Report the real number of files instead of a hard-coded count.
print(f"JSON file successfully split into {num_splits} smaller files.")

**Crawl Repositories from GitHub using the GitHub API**

In [None]:
import requests
import os
from dotenv import load_dotenv
import json
import time

# Load environment variables from the .env file one directory above the
# current working directory.
load_dotenv(dotenv_path="../.env")

# Token read from the environment (None when unset) and the repository
# search endpoint.
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
GITHUB_API_URL = "https://api.github.com/search/repositories"

# Headers sent with every API request in this cell.
HEADERS = {
    "Authorization": f"token {GITHUB_API_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

MAX_RESULTS = 300  # Stop after finding 300 matching repositories
PER_PAGE = 30  # Number of repos per page (max 100)
MATCHING_REPOS = []  # Store valid repositories


def search_repositories(page):
    """Search for Java repositories mentioning Spring Boot (paginated).

    Args:
        page: 1-based page number of the search results to request.

    Returns:
        The list found under ``items`` in the search response, or an empty
        list when the API answers with a non-200 status (the error is printed).

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
    """
    params = {
        "q": 'spring-boot language:Java',
        "sort": "stars",
        "order": "desc",
        "per_page": PER_PAGE,
        "page": page
    }

    # Reuse the module-level endpoint constant; the timeout keeps a stalled
    # connection from blocking the crawl indefinitely.
    response = requests.get(GITHUB_API_URL, headers=HEADERS, params=params, timeout=30)

    if response.status_code != 200:
        # Raw text: an error body is not guaranteed to be valid JSON.
        print(f"Error: {response.status_code}, Message: {response.text}")
        return []

    return response.json().get("items", [])


def process_repositories(repositories):
    """Keep the repositories whose file tree shows Maven, Spring and Docker files.

    For every repository the file tree is fetched and searched (by substring
    match on each path) for ``pom.xml``, a Spring ``application.properties``
    or ``application.yml``, and either a ``Dockerfile`` or a
    ``docker-compose.yml``. Matching repositories are appended to
    ``MATCHING_REPOS``; processing stops as soon as that list holds
    ``MAX_RESULTS`` entries.

    Args:
        repositories: Iterable of repository dicts from the search API.
    """
    global MATCHING_REPOS

    def mentions(paths, fragment):
        # True when at least one path contains the fragment.
        return any(fragment in path for path in paths)

    for candidate in repositories:
        if len(MATCHING_REPOS) >= MAX_RESULTS:
            return  # Stop if we already found enough

        repo_name = candidate["full_name"]
        paths = get_repo_file_tree(repo_name, candidate["default_branch"])

        # Check if required files exist anywhere in the repo
        maven_found = mentions(paths, "pom.xml")
        spring_found = mentions(paths, "application.properties") or mentions(paths, "application.yml")
        containerized = mentions(paths, "Dockerfile") or mentions(paths, "docker-compose.yml")

        if not (maven_found and spring_found and containerized):
            print(f"❌ {repo_name} does not have the required files.")
            continue

        print(f"⭐ {candidate['stargazers_count']} | {repo_name} | {candidate['html_url']}")
        MATCHING_REPOS.append(candidate)



def get_repo_file_tree(repo_full_name, default_branch):
    """Return the paths of all entries in a repository's recursive git tree.

    Args:
        repo_full_name: Repository identifier inserted into the API URL.
        default_branch: Branch name used as the tree reference.

    Returns:
        A list with the ``path`` of every entry under ``tree`` in the
        response, or an empty list when the status code is not 200.
    """
    tree_url = f"https://api.github.com/repos/{repo_full_name}/git/trees/{default_branch}?recursive=1"
    reply = requests.get(tree_url, headers=HEADERS)

    # Any non-200 answer is treated as "no files".
    if reply.status_code != 200:
        return []

    paths = []
    for entry in reply.json().get("tree", []):
        paths.append(entry["path"])
    return paths


# Walk the search result pages until enough matching repositories were found
# or the search returns nothing more.
page = 1
while len(MATCHING_REPOS) < MAX_RESULTS:
    print(f"Fetching page {page}...")

    repositories = search_repositories(page)
    if not repositories:
        break  # Stop if no more results

    process_repositories(repositories)

    page += 1
    time.sleep(2)  # Add a delay to avoid hitting rate limits

# Save the matching repositories to a JSON file.
# The file name matches the path the following cell loads
# ("microservice_projects.json"), so the export is actually picked up.
with open("../data/microservice_projects.json", "w", encoding="utf-8") as dest:
    json.dump(MATCHING_REPOS, dest, indent=2)

print(f"\n✅ Found {len(MATCHING_REPOS)} repositories matching all conditions.")



In [2]:
import pandas as pd
import json

# Read the crawled repositories from disk
with open("../data/microservice_projects.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the records, keep the five columns of interest and give two of
# them shorter names
df = pd.json_normalize(data)[
    ["stargazers_count", "name", "full_name", "description", "html_url"]
].rename(columns={"stargazers_count": "stars", "html_url": "url"})

# Export without the index column
csv_path = "../data/microservice_projects.csv"
df.to_csv(csv_path, index=False)



In [None]:
import glob
import json

import shlex

# Sorted so that the generated command list is reproducible between runs.
files = sorted(glob.glob("../data/microservice_projects/*.json"))

# Show how many project files were found (a bare `len(files)` in the middle
# of a cell displays nothing).
print(f"Found {len(files)} project files")

# Write one analysis command per project file.
with open("../slurm/test.txt", "w", encoding="utf-8") as dest:
    for file_name in files:
        try:
            with open(file_name, "r", encoding="utf-8") as src:
                repo = json.load(src)
            # Quote the values in case the line is later run through a shell;
            # ordinary GitHub URLs and names pass through unchanged.
            command = (
                f"python3 analyze.py --url={shlex.quote(repo['html_url'])} "
                f"--name={shlex.quote(repo['name'])}"
            )
        except (OSError, ValueError, KeyError, TypeError) as err:
            # Unreadable file, invalid JSON, or missing/unsuitable fields:
            # skip this project but say why.
            print("Skipping file", file_name, f"({err!r})")
            continue

        dest.write(command + "\n")

**Crawl top 200 Repositories from GitHub using the GitHub API**

- last scraped at: 22.04.2025

In [None]:
import requests
import json
import os
import csv
from dotenv import load_dotenv
import time

# Load environment variables from the .env file one directory above the
# current working directory
load_dotenv(dotenv_path="../.env")

# GitHub API token and base URL
# (os.getenv returns None when GITHUB_API_TOKEN is not set)
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
BASE_URL = "https://api.github.com/search/repositories"

# Headers for authentication, sent with every search request in this cell
HEADERS = {
    "Authorization": f"token {GITHUB_API_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Fetch the most-starred repositories page by page
def fetch_top_repositories(limit=200):
    """Return up to ``limit`` repositories from a star-sorted search.

    Requests pages of 100 results for the query ``stars:>0`` (descending by
    stars) until ``limit`` items were collected, a page comes back empty, or
    the API answers with a non-200 status (which is printed).

    Args:
        limit: Maximum number of repositories to return.

    Returns:
        A list of at most ``limit`` repository dicts.
    """
    page_size = 100  # Maximum allowed per page
    collected = []
    current_page = 1

    while True:
        if len(collected) >= limit:
            break

        query = dict(
            q="stars:>0",
            sort="stars",
            order="desc",
            per_page=page_size,
            page=current_page,
        )
        response = requests.get(BASE_URL, headers=HEADERS, params=query)

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.json()}")
            break

        batch = response.json().get("items", [])
        collected.extend(batch)

        if not batch:
            break  # Stop if no more repositories are returned

        print(f"Fetched {len(collected)} repositories so far...")
        current_page += 1

        # Respect GitHub's rate limits
        time.sleep(2)

    return collected[:limit]

# Retrieve the 200 most-starred repositories
top_repositories = fetch_top_repositories(limit=200)

# Destination CSV; its parent directory is created if it does not exist yet
output_csv_file = "../data/popularity_projects.csv"
os.makedirs(os.path.dirname(output_csv_file), exist_ok=True)

with open(output_csv_file, "w", encoding="utf-8", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header row first, then one row per repository
    csvwriter.writerow(["stars", "name", "full_name", "html_url", "description"])
    csvwriter.writerows(
        [
            repo.get(key)
            for key in ("stargazers_count", "name", "full_name", "html_url", "description")
        ]
        for repo in top_repositories
    )

print(f"✅ Successfully saved {len(top_repositories)} repositories to {output_csv_file}.")

In [None]:
import requests
import json
import os
import csv
from dotenv import load_dotenv
import time

# Load environment variables from the .env file one directory above the
# current working directory
load_dotenv(dotenv_path="../.env")

# GitHub API token and base URL
# (os.getenv returns None when GITHUB_API_TOKEN is not set)
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")
BASE_URL = "https://api.github.com/search/repositories"

# Headers for authentication, sent with every search request in this cell
HEADERS = {
    "Authorization": f"token {GITHUB_API_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Keywords to exclude toy projects, courses, and tutorials
# NOTE(review): no code visible in this cell references EXCLUDE_KEYWORDS —
# confirm whether the filtering was meant to be applied.
EXCLUDE_KEYWORDS = ["tutorial", "course", "example", "demo", "test", "sample"]

# Fetch up to `limit` repositories from one star range
def fetch_repositories_by_stars(min_stars, max_stars, limit=100):
    """Return up to ``limit`` repositories with ``min_stars..max_stars`` stars.

    Requests pages of 30 results (sorted by stars, descending) until ``limit``
    items were collected, a page comes back empty, or the API answers with a
    non-200 status (which is printed). Every returned item is kept; no keyword
    filtering is applied here.

    Args:
        min_stars: Lower bound of the star range used in the query.
        max_stars: Upper bound of the star range used in the query.
        limit: Maximum number of repositories to return.

    Returns:
        A list of at most ``limit`` repository dicts.
    """
    page_size = 30
    collected = []
    page_number = 1

    while len(collected) < limit:
        query = {
            "q": f"stars:{min_stars}..{max_stars}",
            "sort": "stars",
            "order": "desc",
            "per_page": page_size,
            "page": page_number,
        }
        response = requests.get(BASE_URL, headers=HEADERS, params=query)

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.json()}")
            break

        batch = response.json().get("items", [])

        # Take only as many items as are still missing to reach the limit.
        collected.extend(batch[:limit - len(collected)])

        if not batch:
            break  # Stop if no more repositories are returned

        print(f"Fetched {len(collected)} repositories so far...")
        page_number += 1

        # Respect GitHub's rate limits
        time.sleep(2)

    return collected[:limit]

# Fetch up to 250 projects for each popularity tier (high, medium and low star counts)
# NOTE(review): adjacent ranges share their boundary values (50000 and 5000).
# If the search treats `a..b` as inclusive, a repository with exactly that
# star count could appear in two tiers — confirm.
top_projects = fetch_repositories_by_stars(50000, 500000, limit=250)
medium_projects = fetch_repositories_by_stars(5000, 50000, limit=250)
low_projects = fetch_repositories_by_stars(0, 5000, limit=250)

# Save the results to CSV files
# (make sure the output directory exists first)
os.makedirs("../data", exist_ok=True)

def save_to_csv(filename, projects):
    """Write selected fields of each project to a CSV file.

    The file is created or overwritten, starts with a header row and contains
    one row per project with its star count, name, full name, URL and
    description. Fields missing from a project are written as empty cells.

    Args:
        filename: Path of the CSV file to write.
        projects: Iterable of dict-like repository records.
    """
    header = ["stars", "name", "full_name", "html_url", "description"]
    source_keys = ("stargazers_count", "name", "full_name", "html_url", "description")

    with open(filename, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        # One row per project, fields in the same order as the header
        writer.writerows(
            [project.get(key) for key in source_keys] for project in projects
        )

# Write one CSV file per popularity tier into ../data
save_to_csv("../data/top_projects.csv", top_projects)
save_to_csv("../data/medium_projects.csv", medium_projects)
save_to_csv("../data/low_projects.csv", low_projects)

# Confirmation message once all three files have been written
print("✅ Successfully saved projects to CSV files.")