In [None]:
import requests
import time
import os
import json
from dotenv import load_dotenv

# load env variables
load_dotenv(dotenv_path="../.env")

# base url
BASE_URL = "https://api.github.com/search/repositories"

TOKEN = ""

# Set up headers with authentication token
headers = {'Authorization': f'token {TOKEN}'}

# Function to fetch repositories for a given star range
def fetch_repositories_by_stars(min_stars, max_stars):
    repositories = []
    page = 1
    per_page = 100  # Maximum allowed per page

    while True:
        params = {
            'q': f'stars:{min_stars}..{max_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }

        response = requests.get(BASE_URL, headers=headers, params=params)

        # Check for errors
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.json()}")
            break

        data = response.json()
        repos = data.get('items', [])
        repositories.extend(repos)

        if len(repos) == 0 or len(repositories) >= 10000:
            break  # Stop if no more repositories are returned or we hit the 10,000 limit

        print(f"Fetched {len(repositories)} repositories with {min_stars}..{max_stars} stars...")

        page += 1

        # Respect GitHub's rate limits by sleeping for a short time
        time.sleep(2)

    return repositories

# Function to fetch the top 10,000 repositories by splitting star ranges
def fetch_top_repositories():
    repositories = []
    star_ranges = [
        (400000, 500000), 
        (300000, 400000), 
        (200000, 300000), 
        (100000, 200000), 
        (90000, 100000),  
        (80000, 90000),   
        (70000, 80000),   
        (60000, 70000),   
        (50000, 60000),   
        (40000, 50000),   
        (30000, 40000),  
        (25000, 30000),   
        (20000, 25000),
        (15000, 20000),
        (12500, 15000),   
        (12000, 12500),
        (11500, 12000),
        (11000, 11500),
        (10500, 11000),
        (10000, 10500),
        (9500, 10000), 
        (9000, 9500),
        (8500, 9000),
        (8000, 8500),
        (7500, 8000),
        (7000, 7500),
        (6500, 7000),
        (6000, 6500),
        (5500, 6000),
        (5400, 5500),
        (5400, 5500),
        (5300, 5400),
        (5200, 5300),
        (5100, 5200),
        (5000, 5100),
        (4900, 5000),
        (4800, 4900),
        (4700, 4800),
        (4600, 4700),
        (4500, 4600),
        (4400, 4500),
        (4300, 4400),
        (4200, 4300),
        (4100, 4200),
        (4000, 4100),
        (3900, 4000),
        (3800, 3900),
        (3700, 3800),
        (3600, 3700),
        (3500, 3600),
        (3400, 3500),
        (3300, 3400),
        (3200, 3300),
        (3100, 3200),
        (3000, 3100),
    ]

    for min_stars, max_stars in star_ranges:
        if len(repositories) >= 10000:
            break

        # Fetch repositories within the star range
        repos = fetch_repositories_by_stars(min_stars, max_stars)
        repositories.extend(repos)

        # Stop once we've hit 10,000 repositories
        if len(repositories) >= 10000:
            break

    return repositories[:10000]

print("Token: ", TOKEN)

# Fetch the top 10,000 repositories
top_repositories = fetch_top_repositories()

with open("../data/projects_raw.json", "w", encoding="utf-8") as dest:
    json.dump(top_repositories, dest, indent=2)

In [37]:
len(top_repositories)

10000

In [38]:
top_repositories[0]

{'id': 28457823,
 'node_id': 'MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==',
 'name': 'freeCodeCamp',
 'full_name': 'freeCodeCamp/freeCodeCamp',
 'private': False,
 'owner': {'login': 'freeCodeCamp',
  'id': 9892522,
  'node_id': 'MDEyOk9yZ2FuaXphdGlvbjk4OTI1MjI=',
  'avatar_url': 'https://avatars.githubusercontent.com/u/9892522?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/freeCodeCamp',
  'html_url': 'https://github.com/freeCodeCamp',
  'followers_url': 'https://api.github.com/users/freeCodeCamp/followers',
  'following_url': 'https://api.github.com/users/freeCodeCamp/following{/other_user}',
  'gists_url': 'https://api.github.com/users/freeCodeCamp/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/freeCodeCamp/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/freeCodeCamp/subscriptions',
  'organizations_url': 'https://api.github.com/users/freeCodeCamp/orgs',
  'repos_url': 'https://api.github.com/users/freeCodeCamp/repos',
  'event

In [None]:
import json

# Load the large JSON file
with open('../data/projects_raw.json', 'r') as f:
    data = json.load(f)

# Calculate split size
num_splits = 10
split_size = len(data) // num_splits

# Write each split to a new JSON file
for i in range(num_splits):
    start = i * split_size
    # Ensure last split gets remaining items
    end = (i + 1) * split_size if i < num_splits - 1 else len(data)
    split_data = data[start:end]
    
    with open(f'../data/projects_raw_{i+1}.json', 'w') as f:
        json.dump(split_data, f, indent=2)

print("JSON file successfully split into 5 smaller files.")

JSON file successfully split into 5 smaller files.
