In [None]:
import requests
import time
import pandas as pd
import os


# Base URL of the GitHub REST API repository-search endpoint
BASE_URL = "https://api.github.com/search/repositories"

# GitHub personal access token, read from the GITHUB_TOKEN environment
# variable (None when the variable is not set)
TOKEN = os.getenv("GITHUB_TOKEN")

# Request headers. The Authorization header is attached only when a token is
# actually available: formatting a missing token would send the literal
# string "token None", which is not a valid credential.
headers = {'Authorization': f'token {TOKEN}'} if TOKEN else {}

# Function to fetch repositories with retry logic
def fetch_repositories(max_repos=10000, start_page=1, request_timeout=30):
    """Fetch the most-starred repositories from the GitHub search API.

    Pages through ``BASE_URL`` with the query ``stars:>0`` sorted by stars in
    descending order, accumulating the ``items`` of every successful page.

    Args:
        max_repos: Upper bound on the number of repositories returned.
        start_page: 1-based page number of the first request. It defaults to
            1 so that the highest-ranked results are included.
        request_timeout: Seconds to wait for each HTTP response before the
            request is abandoned.

    Returns:
        A list of at most ``max_repos`` repository dicts as returned by the
        API. The loop ends early on an empty page, a non-200 response, a
        request/decoding error, or when the secondary-rate-limit retries are
        exhausted; whatever was collected up to that point is returned.

    Note:
        GitHub documents that a search query exposes only its first 1,000
        results, so with 100 results per page this single query is expected
        to stop (with an error response) after page 10, well before 10,000.
    """
    repositories = []
    page = start_page
    per_page = 100  # Maximum allowed per page
    retries = 0
    max_retries = 5
    backoff_factor = 2  # Exponential backoff

    while len(repositories) < max_repos:
        params = {
            'q': 'stars:>0',  # Filter to get repos with at least 1 star
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }

        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(
                BASE_URL, headers=headers, params=params, timeout=request_timeout
            )

            # Check for secondary rate limit (reported as 403 or 429)
            if response.status_code in (403, 429) and "secondary rate limit" in response.text.lower():
                print("Hit secondary rate limit. Waiting before retrying...")
                retries += 1
                if retries > max_retries:
                    print("Max retries reached. Exiting.")
                    break
                wait_time = backoff_factor ** retries
                # Never wait less than the server asks for, when it says so.
                retry_after = response.headers.get("Retry-After", "")
                if retry_after.isdigit():
                    wait_time = max(wait_time, int(retry_after))
                print(f"Retrying after {wait_time} seconds...")
                time.sleep(wait_time)
                continue

            # Check for other errors; the raw body is printed because an
            # error response is not guaranteed to be valid JSON.
            if response.status_code != 200:
                print(f"Error: {response.status_code}, {response.text}")
                break

            # Reset retries if successful
            retries = 0

            repos = response.json().get('items', [])
            if not repos:
                break  # Stop if no more repositories are returned

            repositories.extend(repos)
            print(f"Fetched {len(repositories)} repositories so far on page {page} ... ")

            page += 1

            # Respect GitHub's rate limits by sleeping for a short time
            time.sleep(2)

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"An error occurred: {e}")
            break

    return repositories[:max_repos]

# Download the repository list (fetch_repositories truncates it to 10,000 entries)
top_repositories = fetch_repositories()

# Pull the fields of interest out of every repository record, one list per field
names = [entry['name'] for entry in top_repositories]
full_names = [entry["full_name"] for entry in top_repositories]
urls = [entry["html_url"] for entry in top_repositories]
stars = [entry["stargazers_count"] for entry in top_repositories]
descriptions = [entry["description"] for entry in top_repositories]

# Assemble the table in its final column order
df = pd.DataFrame({
    "stars": stars,
    "name": names,
    "full_name": full_names,
    "description": descriptions,
    "url": urls,
})

# Persist the table (the DataFrame index is written as the first CSV column)
df.to_csv("projects.csv")

In [None]:
import os
import time

import requests

# Base URL of the GitHub REST API repository-search endpoint
BASE_URL = "https://api.github.com/search/repositories"

# GitHub personal access token, read from the GITHUB_TOKEN environment
# variable (None when the variable is not set)
TOKEN = os.getenv("GITHUB_TOKEN")

# Request headers. The Authorization header is attached only when a token is
# actually available: formatting a missing token would send the literal
# string "token None", which is not a valid credential.
headers = {'Authorization': f'token {TOKEN}'} if TOKEN else {}

# Function to fetch repositories for a given star range
def fetch_repositories_by_stars(min_stars, max_stars, request_timeout=30):
    """Fetch repositories whose star count lies in ``min_stars..max_stars``.

    Pages through ``BASE_URL`` with the query ``stars:{min_stars}..{max_stars}``
    sorted by stars in descending order, 100 results per page.

    Args:
        min_stars: Lower bound of the star range placed in the query.
        max_stars: Upper bound of the star range placed in the query.
        request_timeout: Seconds to wait for each HTTP response before the
            request is abandoned.

    Returns:
        A list of repository dicts as returned by the API. Paging stops on an
        empty page, once 10,000 repositories are collected, on a non-200
        response, on a request/decoding error, or when the
        secondary-rate-limit retries are exhausted. Whatever was collected up
        to that point is returned, so a failure in one range does not discard
        what the caller has already gathered from other ranges.

    Note:
        GitHub documents that a search query exposes only its first 1,000
        results, so a range matching more repositories than that is expected
        to yield only its 1,000 most-starred entries before the API answers
        with an error.
    """
    repositories = []
    page = 1
    per_page = 100  # Maximum allowed per page
    retries = 0
    max_retries = 5
    backoff_factor = 2  # Exponential backoff

    while True:
        params = {
            'q': f'stars:{min_stars}..{max_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }

        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(
                BASE_URL, headers=headers, params=params, timeout=request_timeout
            )

            # Check for secondary rate limit (reported as 403 or 429)
            if response.status_code in (403, 429) and "secondary rate limit" in response.text.lower():
                print("Hit secondary rate limit. Waiting before retrying...")
                retries += 1
                if retries > max_retries:
                    print("Max retries reached. Exiting.")
                    break
                wait_time = backoff_factor ** retries
                # Never wait less than the server asks for, when it says so.
                retry_after = response.headers.get("Retry-After", "")
                if retry_after.isdigit():
                    wait_time = max(wait_time, int(retry_after))
                print(f"Retrying after {wait_time} seconds...")
                time.sleep(wait_time)
                continue

            # Check for errors; the raw body is printed because an error
            # response is not guaranteed to be valid JSON.
            if response.status_code != 200:
                print(f"Error: {response.status_code}, {response.text}")
                break

            # Reset retries if successful
            retries = 0

            repos = response.json().get('items', [])
            repositories.extend(repos)

            if len(repos) == 0 or len(repositories) >= 10000:
                break  # Stop if no more repositories are returned or we hit the 10,000 limit

            print(f"Fetched {len(repositories)} repositories with {min_stars}..{max_stars} stars...")

            page += 1

            # Respect GitHub's rate limits by sleeping for a short time
            time.sleep(2)

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"An error occurred: {e}")
            break

    return repositories

# Function to fetch the top 10,000 repositories by splitting star ranges
def fetch_top_repositories(limit=10000):
    """Collect highly starred repositories by querying successive star ranges.

    Calls ``fetch_repositories_by_stars`` for each range, from the highest
    star counts downwards, and concatenates the results until ``limit``
    repositories have been gathered or the ranges are exhausted.

    Adjacent ranges share their boundary value (e.g. 100000 belongs to both
    the first and the second range), and star counts can change between
    requests, so the same repository can come back from more than one range.
    Repositories are therefore de-duplicated by their ``id``; entries without
    an ``id`` are kept as-is.

    Args:
        limit: Maximum number of repositories to return.

    Returns:
        A list of at most ``limit`` distinct repository dicts, in the order
        they were fetched.
    """
    star_ranges = [
        (100000, 500000),  # Repositories with 100,000 to 500,000 stars
        (50000, 100000),  # Repositories with 50,000 to 100,000 stars
        (40000, 50000),   # Repositories with 40,000 to 50,000 stars
        (30000, 40000),   # Repositories with 30,000 to 40,000 stars
        (20000, 30000),   # Repositories with 20,000 to 30,000 stars
        (10000, 20000),   # Repositories with 10,000 to 20,000 stars
        (5000, 10000),    # Repositories with 5,000 to 10,000 stars
        (1000, 5000),     # Repositories with 1,000 to 5,000 stars
        (500, 1000),      # Repositories with 500 to 1,000 stars
    ]

    repositories = []
    seen_ids = set()

    for min_stars, max_stars in star_ranges:
        # Stop once we've collected enough repositories
        if len(repositories) >= limit:
            break

        # Fetch repositories within the star range, skipping any already seen
        for repo in fetch_repositories_by_stars(min_stars, max_stars):
            repo_id = repo.get("id")
            if repo_id is not None:
                if repo_id in seen_ids:
                    continue
                seen_ids.add(repo_id)
            repositories.append(repo)

    return repositories[:limit]

# Fetch up to 10,000 repositories (fetch_top_repositories truncates its
# result to that many). The list is only bound to `top_repositories` here;
# nothing in this cell writes it to disk.
top_repositories = fetch_top_repositories()

In [5]:
import pandas as pd

# Load the previously saved repository table.
# NOTE(review): the scraping cell writes `projects.csv` into the working
# directory, while this cell reads `../data/projects.csv` — confirm the file
# is moved there before this cell runs.
projects_path = "../data/projects.csv"
df = pd.read_csv(projects_path)

# Report how many rows (repositories) the table holds
print("Len repos: ", df.shape[0])

Len repos:  5409
