<a href="https://colab.research.google.com/github/sidg75/tds-project1/blob/main/tds_project1_git_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import csv

# Personal access token sent with every GitHub API request.
# The literal below is a redacted placeholder (original note: refer to the
# sidg75 GitHub profile to know the code).
GITHUB_TOKEN = 'REMOVED'

# REST endpoints used by the scraper.
SEARCH_URL = 'https://api.github.com/search/users'   # user search
USER_URL = 'https://api.github.com/users/'           # append a login
REPO_URL = 'https://api.github.com/users/{}/repos'   # .format(login)

# Headers attached to every request: bearer authorization plus the GitHub
# JSON media type.
HEADERS = dict(
    Authorization='Bearer ' + GITHUB_TOKEN,
    Accept='application/vnd.github+json',
)

# Function to clean up company names
def clean_company(company):
    """Normalise a company string for the users CSV.

    Surrounding whitespace is trimmed, one leading '@' (if present after
    trimming) is dropped, and the result is upper-cased.

    Args:
        company: Raw company value; may be None or an empty string.

    Returns:
        The normalised string, or ``company`` unchanged when it is falsy
        (None or '').
    """
    if not company:
        return company

    trimmed = company.strip()
    # Only a single leading '@' is removed; any further ones are kept.
    without_at = trimmed[1:] if trimmed.startswith('@') else trimmed
    return without_at.upper()

# Function to write data to CSV
def write_to_csv(lst, file_name, header):
    """Write a header row followed by data rows to a UTF-8 CSV file.

    Args:
        lst: Iterable of rows, each row an iterable of cell values.
        file_name: Path of the file to create (an existing file is
            overwritten).
        header: Iterable of column names written as the first row.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(file_name, 'w', newline='', encoding='utf-8') as out:
        sink = csv.writer(out)
        sink.writerow(header)
        for row in lst:
            sink.writerow(row)

# Fetch users based on location and follower criteria
def fetch_users(loc, followers):
    """Search GitHub for users in a location with more than N followers.

    Pages through the user-search endpoint 100 results at a time until a
    short page is returned or a request fails.

    Args:
        loc: Value for the ``location:`` search qualifier. It is inserted
            into the query verbatim, so a multi-word location must already
            be quoted by the caller.
        followers: Exclusive lower bound for the ``followers:>`` qualifier.

    Returns:
        A list of the ``items`` dicts from every successfully fetched page.
        If a request does not return HTTP 200, the failure is printed and
        whatever was collected up to that point is returned.
    """
    per_page = 100
    # Build the query from the arguments (previously hard-coded to
    # Sydney / >100, which silently ignored both parameters).
    query = f'location:{loc} followers:>{followers}'

    page = 1
    users = []

    while True:
        params = {
            'q': query,
            'per_page': per_page,
            'page': page
        }

        response = requests.get(SEARCH_URL, headers=HEADERS, params=params)

        if response.status_code != 200:
            print(f"Failed to fetch users: {response.status_code}, {response.text}")
            break

        # A 200 body without 'items' is treated as an empty page.
        items = response.json().get('items', [])
        print(f"Fetched {len(items)} users on page {page}")
        users.extend(items)

        if len(items) < per_page:
            break  # Short page: nothing further to fetch

        page += 1

    return users

# Fetch detailed user info
def get_user_details(loginid):
    """Fetch one user's profile and flatten it into a row for users.csv.

    Args:
        loginid: GitHub login of the user to look up.

    Returns:
        A list of 11 values in the order login, name, company, location,
        email, hireable, bio, public_repos, followers, following,
        created_at. ``company`` is passed through ``clean_company``;
        ``hireable`` is 'true'/'false', or '' when the API reports null or
        omits it. Returns None (after printing the status code) when the
        response is not HTTP 200.
    """
    response = requests.get(f'{USER_URL}{loginid}', headers=HEADERS)

    if response.status_code != 200:
        print(f"Failed to fetch user details for {loginid}: {response.status_code}")
        return None

    user_data = response.json()

    # A null 'hireable' must become an empty cell like every other null
    # field; str(None).lower() would otherwise write the text 'none'.
    hireable = user_data.get('hireable')
    hireable_text = '' if hireable is None else str(hireable).lower()

    return [
        user_data.get('login', ''),
        user_data.get('name', ''),
        clean_company(user_data.get('company', '')),
        user_data.get('location', ''),
        user_data.get('email', ''),
        hireable_text,
        user_data.get('bio', ''),
        user_data.get('public_repos', 0),
        user_data.get('followers', 0),
        user_data.get('following', 0),
        user_data.get('created_at', '')
    ]

# Fetch repository details for each user
# Function to fetch repositories for a user, paginated to 500
def get_user_repo_details(loginid):
    """Collect up to 500 repository rows for one user.

    Requests the user's repositories 100 per page and stops when 500 rows
    have been gathered, a page comes back empty or short, or a request
    does not return HTTP 200 (the status code is printed in that case).

    Args:
        loginid: GitHub login whose repositories are listed.

    Returns:
        A list of at most 500 rows, each
        [login, full_name, created_at, stargazers_count, watchers_count,
        language, has_projects, has_wiki, license_key], where the two
        ``has_*`` values are lower-cased strings and ``license_key`` is
        None when the repository has no license entry.
    """
    max_repos = 500
    page_size = 100

    def to_row(repo):
        # The license is a nested object (or null); only its 'key' is kept.
        licence = repo.get('license')
        return [
            loginid,
            repo.get('full_name', ''),
            repo.get('created_at', ''),
            repo.get('stargazers_count', 0),
            repo.get('watchers_count', 0),
            repo.get('language', ''),
            str(repo.get('has_projects', '')).lower(),
            str(repo.get('has_wiki', '')).lower(),
            licence['key'] if licence else None,
        ]

    rows = []
    page_number = 1

    while len(rows) < max_repos:
        response = requests.get(
            f'https://api.github.com/users/{loginid}/repos',
            headers=HEADERS,
            params={'per_page': page_size, 'page': page_number},
        )

        if response.status_code != 200:
            print(f"Failed to fetch repos for {loginid}: {response.status_code}")
            break

        batch = response.json()
        if not batch:
            break  # Empty page: nothing left to list

        rows.extend(to_row(repo) for repo in batch)
        page_number += 1

        if len(batch) < page_size:
            break  # Short page: this was the last one

    return rows[:max_repos]


# Main function
def main():
    """Scrape Sydney users with more than 100 followers into two CSV files.

    For every user returned by ``fetch_users('Sydney', 100)`` this gathers
    the profile row and the repository rows, then writes ``users.csv`` and
    ``repositories.csv`` in the current working directory.
    """
    user_rows = []
    repo_rows = []

    for account in fetch_users('Sydney', 100):
        handle = account.get('login')

        # Profile lookup returns None on failure; such users are skipped.
        profile = get_user_details(handle)
        if profile:
            user_rows.append(profile)

        # Repository lookup always returns a list (possibly empty).
        repo_rows.extend(get_user_repo_details(handle))

    write_to_csv(
        user_rows,
        'users.csv',
        [
            'login', 'name', 'company', 'location', 'email', 'hireable',
            'bio', 'public_repos', 'followers', 'following', 'created_at',
        ],
    )

    write_to_csv(
        repo_rows,
        'repositories.csv',
        [
            'login', 'full_name', 'created_at', 'stargazers_count',
            'watchers_count', 'language', 'has_projects', 'has_wiki',
            'license_name',
        ],
    )

# Run the scrape only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()


{'total_count': 371, 'incomplete_results': False, 'items': [{'login': 'nicknochnack', 'id': 5948934, 'node_id': 'MDQ6VXNlcjU5NDg5MzQ=', 'avatar_url': 'https://avatars.githubusercontent.com/u/5948934?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/nicknochnack', 'html_url': 'https://github.com/nicknochnack', 'followers_url': 'https://api.github.com/users/nicknochnack/followers', 'following_url': 'https://api.github.com/users/nicknochnack/following{/other_user}', 'gists_url': 'https://api.github.com/users/nicknochnack/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/nicknochnack/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/nicknochnack/subscriptions', 'organizations_url': 'https://api.github.com/users/nicknochnack/orgs', 'repos_url': 'https://api.github.com/users/nicknochnack/repos', 'events_url': 'https://api.github.com/users/nicknochnack/events{/privacy}', 'received_events_url': 'https://api.github.com/users/nicknochnack/receiv