<a href="https://colab.research.google.com/github/sidg75/tds-project1/blob/main/TDSGitHubGraphQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import requests
import time
import os


# GitHub GraphQL API endpoint; every query in this file is POSTed here.
GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

# GitHub personal access token. Prefer supplying it through the GITHUB_TOKEN
# environment variable so the secret never has to be written into the source.
# If the variable is unset or empty, the placeholder below is used and must be
# replaced with a real token (GitHub profile > Settings) before running.
TOKEN = os.environ.get("GITHUB_TOKEN") or 'refer github profile>settings for token'


# HTTP headers sent with every GraphQL request: bearer-token auth and a JSON body.
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json"
}

# GraphQL query that pages through a GitHub user search, 50 results per page.
# Despite its name, the $location variable carries the complete search string
# (e.g. 'location:"Toronto" followers:>100'), not just a place name; $after is
# the pagination cursor (null for the first page).
# publicRepositories is restricted to public repositories the user owns, so the
# count matches its alias and agrees with the ownerAffiliations: OWNER filter
# used by USER_REPOS_QUERY.
USER_SEARCH_QUERY = """
query($location: String!, $after: String) {
  search(query: $location, type: USER, first: 50, after: $after) {
    userCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on User {
          login
          name
          company
          location
          email
          isHireable
          bio
          publicRepositories: repositories(privacy: PUBLIC, ownerAffiliations: OWNER) {
            totalCount
          }
          followers {
            totalCount
          }
          following {
            totalCount
          }
          createdAt
        }
      }
    }
  }
}
"""

# GraphQL query that fetches one page (up to 100) of the repositories owned by
# the user $login; $after is the pagination cursor (null for the first page).
# The query itself has no overall limit - the 500-repository cap is applied by
# the caller, get_repositories_for_user, which pages through the results.
USER_REPOS_QUERY = """
query($login: String!, $after: String) {
  user(login: $login) {
    repositories(first: 100, after: $after, ownerAffiliations: OWNER) {
      pageInfo {
        endCursor
        hasNextPage
      }
      nodes {
        name
        createdAt
        stargazerCount
        watchers {
          totalCount
        }
        primaryLanguage {
          name
        }
        hasProjectsEnabled
        hasWikiEnabled
        licenseInfo {
          key
        }
      }
    }
  }
}
"""

def run_query(query, variables, max_retries=10, timeout=30):
    """Run a GraphQL query against the GitHub API, retrying on rate limits.

    Args:
        query: GraphQL query document.
        variables: Dict of variables referenced by the query.
        max_retries: How many times a rate-limited (HTTP 403/429) request is
            retried before giving up. Previously the loop was unbounded, so a
            403 that was not a rate limit would spin forever.
        timeout: Seconds to wait for the server; without it ``requests`` can
            block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the first HTTP 200 response. The body may
        still contain an ``errors`` key, which callers are expected to check.

    Raises:
        Exception: On any non-retryable status code, or when the request is
            still rejected after ``max_retries`` retries.
    """
    attempt = 0
    while True:
        response = requests.post(
            GITHUB_GRAPHQL_URL,
            json={"query": query, "variables": variables},
            headers=HEADERS,
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json()

        # Only rate-limit style responses are worth retrying; everything else
        # (and a retry budget that is used up) is reported to the caller.
        if response.status_code not in (403, 429) or attempt >= max_retries:
            raise Exception(f"Query failed with status code {response.status_code}: {response.text}")
        attempt += 1

        retry_after = response.headers.get("Retry-After")
        if retry_after is not None and retry_after.isdigit():
            # Secondary rate limit: the server states how long to back off.
            wait_time = int(retry_after)
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
        elif response.headers.get("X-RateLimit-Remaining") == "0":
            # Primary rate limit: wait until the advertised reset time
            # (epoch seconds); never sleep a negative amount.
            reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
            wait_time = max(0, reset_time - time.time())
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
        else:
            # No rate-limit hint in the headers: pause briefly and retry.
            wait_time = 5
        time.sleep(wait_time)


def clean_company_name(company):
    """Normalise a company string for comparison.

    Surrounding whitespace is trimmed, any leading '@' characters are then
    dropped, and the remainder is upper-cased. A falsy input (``None`` or an
    empty string) yields ``None``.
    """
    if not company:
        return None
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()


def get_users_in_sydney(min_followers=100, location="Toronto", output_path='users.csv'):
    """Search GitHub for users in a city and save them to a CSV file.

    The function name is historical: the city actually searched is given by
    ``location``, whose default preserves the previously hard-coded "Toronto".

    Args:
        min_followers: Only users with strictly more followers than this are
            requested (search qualifier ``followers:>N``).
        location: Value used for the ``location:`` search qualifier.
        output_path: Path of the CSV file the collected users are written to.

    Returns:
        A list of dicts, one per user, keyed by the CSV column names. If the
        API reports errors or an unexpected payload part-way through, paging
        stops and the users collected so far are still written and returned.
    """
    fieldnames = ['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at']
    users = []
    after_cursor = None
    search_query = f'location:"{location}" followers:>{min_followers}'

    while True:
        # USER_SEARCH_QUERY names its search-string variable "location".
        variables = {"location": search_query, "after": after_cursor}
        result = run_query(USER_SEARCH_QUERY, variables)

        if 'errors' in result:
            print(f"Error: {result['errors']}")
            break

        # Tolerate a null/missing "data" or "search" instead of raising.
        search = (result.get('data') or {}).get('search') or {}
        if 'edges' not in search:
            print("No users found or unexpected data structure.")
            break

        for edge in search['edges'] or []:
            user_node = (edge or {}).get('node') or {}
            print(user_node)
            # Nodes without a login (e.g. results that are not of type User,
            # for which the inline fragment yields no fields) are skipped.
            if 'login' not in user_node:
                print(f"Skipping user due to missing 'login' field: {user_node}")
                continue

            # The company is stored exactly as returned by the API; no
            # clean-up is applied here.
            users.append({
                "login": user_node.get('login'),
                "name": user_node.get('name'),
                "company": user_node.get('company'),
                "location": user_node.get('location'),
                "email": user_node.get('email'),
                "hireable": user_node.get('isHireable'),
                "bio": user_node.get('bio'),
                "public_repos": user_node['publicRepositories']['totalCount'],
                "followers": user_node['followers']['totalCount'],
                "following": user_node['following']['totalCount'],
                "created_at": user_node['createdAt']
            })

        page_info = search['pageInfo']
        if not page_info['hasNextPage']:
            break
        after_cursor = page_info['endCursor']

    # Write users data to CSV
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(users)
    return users


def get_repositories_for_user(login, max_repos=500):
    """Retrieve up to ``max_repos`` repositories owned by a user.

    Pages through USER_REPOS_QUERY until the user has no more repositories or
    the cap is reached.

    Args:
        login: GitHub login of the user whose repositories are fetched.
        max_repos: Maximum number of repositories to return (default 500,
            the previously hard-coded limit).

    Returns:
        A list of at most ``max_repos`` dicts, one per repository. The list
        is empty or partial if the API response lacks user data, in which
        case a diagnostic line is printed and paging stops.
    """
    repos = []
    after_cursor = None
    while len(repos) < max_repos:
        variables = {"login": login, "after": after_cursor}
        result = run_query(USER_REPOS_QUERY, variables)

        # "data" or "user" can be missing or null (e.g. unknown login);
        # report it and return whatever was collected so far.
        user_data = (result.get('data') or {}).get('user')
        if not user_data:
            print(f"Skipping repositories for {login} due to missing data. Result: {result}")
            break

        connection = user_data['repositories']
        for repo in connection['nodes'] or []:
            if repo is None:
                # Skip a null entry rather than crash on subscripting it.
                continue
            language = repo['primaryLanguage']
            license_info = repo['licenseInfo']
            repos.append({
                "login": login,
                # NOTE(review): this is the bare repository name, not an
                # "owner/name" path, even though the column is "full_name".
                "full_name": repo['name'],
                "created_at": repo['createdAt'],
                "stargazers_count": repo['stargazerCount'],
                "watchers_count": repo['watchers']['totalCount'],
                "language": language['name'] if language else None,
                "has_projects": repo['hasProjectsEnabled'],
                "has_wiki": repo['hasWikiEnabled'],
                "license_name": license_info['key'] if license_info else None
            })

        page_info = connection['pageInfo']
        if not page_info['hasNextPage']:
            break
        after_cursor = page_info['endCursor']

    # A page may overshoot the cap when max_repos is not a multiple of the
    # page size, so trim to the requested maximum.
    return repos[:max_repos]


def _write_repositories_csv(users, path='repositories.csv'):
    """Fetch every user's repositories and stream them into a single CSV file."""
    fieldnames = ['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name']
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for user in users:
            print(f"Getting repositories for {user['login']} (Followers: {user['followers']})...")
            repos = get_repositories_for_user(user['login'])

            for repo in repos:
                writer.writerow(repo)
                print(f"- {repo['full_name']} (Stars: {repo['stargazers_count']}, Watchers: {repo['watchers_count']})")
            print()


def main(fetch_repositories=False):
    """Collect the matching users and, optionally, their repositories.

    Args:
        fetch_repositories: When true, also fetch each user's repositories
            and write them to repositories.csv. This step used to live in a
            disabled (string-quoted) block, so it stays off by default.
    """
    min_followers = 100
    users = get_users_in_sydney(min_followers=min_followers)
    # get_users_in_sydney searches location:"Toronto" by default (its name is
    # stale), so report that city rather than Sydney.
    print(f"Found {len(users)} users in Toronto with more than {min_followers} followers.\n")

    if fetch_repositories:
        _write_repositories_csv(users)


if __name__ == "__main__":
    main()
