In [24]:
import requests
import pandas as pd
import time

In [25]:
GITHUB_TOKEN = ''
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}

In [28]:
def check_rate_limit():
    url = "https://api.github.com/rate_limit"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        remaining = data['rate']['remaining']
        reset_time = data['rate']['reset']
        return remaining, reset_time
    return 0, 0

In [29]:
def handle_rate_limit():
    remaining, reset_time = check_rate_limit()
    if remaining == 0:
        wait_time = reset_time - time.time()
        print(f"Rate limit exceeded. Waiting for {wait_time/60:.2f} minutes.")
        time.sleep(wait_time + 10)  # Add a little buffer to avoid hitting it immediately again
    else:
        print(f"Remaining requests: {remaining}")

In [30]:
def clean_company_name(company):
    if company:
        company = company.strip()  
        if company.startswith('@'):
            company = company[1:] 
        return company.upper() 
    return None

In [32]:
def get_users_in_city(city, min_followers):
    users = []
    page = 1
    while True:
        url = f"https://api.github.com/search/users?q=location:{city}+followers:>{min_followers}&per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            break
        data = response.json()
        users.extend(data['items'])
        if len(data['items']) == 0:
            break
        page += 1
        time.sleep(1)  
    return users

In [33]:
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json()
    return None

In [34]:
def get_user_repos(username):
    repos = []
    page = 1
    while True:
        url = f"https://api.github.com/users/{username}/repos?per_page=100&page={page}&sort=pushed"
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            break
        data = response.json()
        repos.extend(data)
        if len(data) == 0:
            break
        page += 1
        time.sleep(1)
    return repos[:500] 

In [36]:
def main():
    city = "Melbourne"
    min_followers = 100

    users = get_users_in_city(city, min_followers)
    user_data = []
    repo_data = []

    for user in users:
        handle_rate_limit()
        user_details = get_user_details(user['login'])
        if user_details:
            user_info = {
                'login': user_details['login'],
                'name': user_details['name'],
                'company': clean_company_name(user_details['company']),
                'location': user_details['location'],
                'email': user_details['email'],
                'hireable': user_details['hireable'],
                'bio': user_details['bio'],
                'public_repos': user_details['public_repos'],
                'followers': user_details['followers'],
                'following': user_details['following'],
                'created_at': user_details['created_at']
            }
            user_data.append(user_info)
            handle_rate_limit()

            # Fetch repositories for the user
            repos = get_user_repos(user['login'])
            for repo in repos:
                repo_info = {
                    'login': user_details['login'],
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'],
                    'has_projects': repo['has_projects'],
                    'has_wiki': repo['has_wiki'],
                    'license_name': repo['license']['name'] if repo['license'] else None
                }
                repo_data.append(repo_info)

    df_users = pd.DataFrame(user_data)
    df_users.to_csv('users.csv', index=False)

    df_repos = pd.DataFrame(repo_data)
    df_repos.to_csv('repositories.csv', index=False)

    print("Data saved to 'users.csv' and 'repositories.csv'")


In [37]:
if __name__ == "__main__":
    main()

Remaining requests: 4876
Remaining requests: 4875
Remaining requests: 4873
Remaining requests: 4872
Remaining requests: 4870
Remaining requests: 4869
Remaining requests: 4864
Remaining requests: 4863
Remaining requests: 4856
Remaining requests: 4855
Remaining requests: 4853
Remaining requests: 4852
Remaining requests: 4846
Remaining requests: 4845
Remaining requests: 4841
Remaining requests: 4840
Remaining requests: 4838
Remaining requests: 4837
Remaining requests: 4835
Remaining requests: 4834
Remaining requests: 4830
Remaining requests: 4829
Remaining requests: 4826
Remaining requests: 4825
Remaining requests: 4823
Remaining requests: 4822
Remaining requests: 4820
Remaining requests: 4819
Remaining requests: 4816
Remaining requests: 4815
Remaining requests: 4812
Remaining requests: 4811
Remaining requests: 4807
Remaining requests: 4806
Remaining requests: 4804
Remaining requests: 4803
Remaining requests: 4800
Remaining requests: 4799
Remaining requests: 4794
Remaining requests: 4793
