In [64]:
import os
import pathlib
import re
import sys
from getpass import getpass
from json import load, dump
from os.path import join, exists, basename
from typing import Set

import requests
import yaml
from tqdm import tqdm

In [16]:
# Prompt for the GitHub token without echoing it, so the secret does not end up
# in the notebook's saved cell output.
TOKEN = getpass('github token')

In [72]:
# Load the YAML data file. The encoding is given explicitly so that parsing
# does not depend on the platform's default text encoding.
with open('./resources/supernova.yaml', 'r', encoding='utf-8') as fp:
    data = yaml.safe_load(fp)

In [73]:
# Derive an "owner/repo" slug from every entry that has a GitHub URL.
#   repo_names    -- slugs in file order, without duplicates
#   repo_name_map -- slug -> the entry's 'name'
repo_names = []
repo_name_map = {}

for item in data:
    # A missing key, None and '' all mean "no GitHub repository": skip the entry.
    url = (item.get('githubURL') or '').strip()

    # Normalise the URL tail so that ".../owner/repo/" and ".../owner/repo.git"
    # yield "owner/repo" instead of "repo/" or "owner/repo.git".
    url = url.rstrip('/')
    if url.endswith('.git'):
        url = url[:-len('.git')]
    if not url:
        continue

    parts = url.split('/')
    cur_repo_name = '/'.join(parts[-2:])

    # Keep each slug only once in the query list; as before, the last entry
    # that points at a repository decides its mapped name.
    if cur_repo_name not in repo_name_map:
        repo_names.append(cur_repo_name)
    repo_name_map[cur_repo_name] = item['name']


In [70]:
def get_star_response(search_string):
    """Run one GitHub GraphQL repository search and return the HTTP response.

    For every matching repository the query asks for the owner login, the
    repository name, the stargazer count and, on the default branch, the most
    recent commit date together with the history's total commit count and end
    cursor. At most 100 repositories are requested per call.

    Args:
        search_string: GitHub search query, e.g. ``"repo:owner/a repo:owner/b "``.

    Returns:
        The ``requests.Response`` object. A non-OK response is printed but not
        raised, so the caller always receives the response.
    """
    URL = "https://api.github.com/graphql"

    # The search string travels as a GraphQL variable instead of being pasted
    # into the query text, so quotes or backslashes in it cannot break the query.
    query = """
    query ($searchString: String!) {
    rateLimit {
        limit
        cost
        remaining
    }
    search(query: $searchString, type: REPOSITORY, first: 100) {
        edges {
        node {
            ... on Repository {
            owner {
                login
            }
            name
            stargazers {
                totalCount
            }
            defaultBranchRef {
                target {
                    ... on Commit {
                    history(first: 1) {
                        edges {
                        node {
                            committedDate
                        }
                        }
                        totalCount
                        pageInfo {
                            endCursor
                        }
                    }
                    }
                }
            }
            }
        }
        }
    }
    }
    """

    # Same token-based Authorization header that getDate uses; no user name
    # needs to be hard-coded.
    response = requests.post(
        URL,
        json={"query": query, "variables": {"searchString": search_string}},
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=600,
    )

    if not response.ok:
        print("Request error!")
        print(response)
        print(response.text)

    return response

In [74]:
# Query the repositories in batches of `gap`: each search string is a run of
# "repo:owner/name " qualifiers, one per repository in the batch.
gap = 100
star_responses = []

for i in range(0, len(repo_names), gap):
    batch = repo_names[i: i + gap]
    search_string = "".join(f"repo:{name} " for name in batch)

    # Keep only the decoded JSON body of each batch response.
    response = get_star_response(search_string)
    star_responses.append(response.json())

In [77]:
# Results keyed by "owner/repo":
#   star_counts   -- stargazer count
#   cursor_info   -- latest commit date, total commit count, history end cursor
#   repo_date_map -- filled by a later cell
star_counts = {}
cursor_info = {}
repo_date_map = {}

def parse_star_response(star_response):
    """Fold one decoded search response into ``star_counts`` and ``cursor_info``.

    Args:
        star_response: Decoded JSON body of a repository search; must contain
            ``data.search.edges``.

    Side effects:
        For every repository in the response, sets ``star_counts[owner/name]``
        and ``cursor_info[owner/name]``. A repository whose ``defaultBranchRef``
        is null (no commits yet) gets
        ``{'committedDate': None, 'totalCount': 0, 'endCursor': None}`` instead
        of raising.

    Raises:
        KeyError: If the response lacks the expected ``data``/``search`` keys.
    """
    data = star_response["data"]

    for edge in data["search"]["edges"]:
        node = edge["node"]

        # Identify the repository and record its star count
        owner = node["owner"]["login"]
        name = node["name"]
        repo_name = f"{owner}/{name}"
        star_counts[repo_name] = node["stargazers"]["totalCount"]

        # No default branch means there is no commit history to read.
        branch = node.get("defaultBranchRef")
        if branch is None:
            cursor_info[repo_name] = {
                'committedDate': None,
                'totalCount': 0,
                'endCursor': None,
            }
            continue

        # Commit history details for this repository
        history = branch["target"]["history"]
        commits = history["edges"]
        cursor_info[repo_name] = {
            'committedDate': commits[0]["node"]["committedDate"] if commits else None,
            'totalCount': history["totalCount"],
            'endCursor': history["pageInfo"]["endCursor"],
        }

# Fold every batch of search results into star_counts / cursor_info.
for batch_result in star_responses:
    parse_star_response(batch_result)

In [81]:
# Get the first commit date
#
# GraphQL template: `%s` is replaced by an already-quoted `after` cursor, and
# the repository is selected through the $name / $owner variables.
query = """
query ($name: String!, $owner: String!){
  repository(name: $name, owner: $owner) {
    defaultBranchRef {
      target {
        ... on Commit {
          history(first: 1, after: %s) {
            nodes {
              message
              committedDate
              authoredDate
              oid
              author {
                email
                name
              }
            }
            totalCount
            pageInfo {
              endCursor
            }
          }
        }
      }
    }
  }
}
"""

def getDate(cursor, repo_owner=None, repo_name=None):
    """Return the ``committedDate`` of the single commit that follows ``cursor``.

    Args:
        cursor: The ``after`` cursor, already wrapped in double quotes because
            it is substituted verbatim into the query text.
        repo_owner: Owner login of the repository. When omitted, the
            notebook-level ``owner`` variable is used (older call style).
        repo_name: Repository name. When omitted, the notebook-level ``name``
            variable is used (older call style).

    Returns:
        The ``committedDate`` value of the first returned history node.

    Raises:
        requests.HTTPError: If GitHub answers with an HTTP error status.
    """
    # Older calls passed only the cursor and relied on the loop's globals.
    if repo_owner is None:
        repo_owner = owner
    if repo_name is None:
        repo_name = name

    r = requests.post(
        "https://api.github.com/graphql",
        headers={
            "Authorization": f"Bearer {TOKEN}"
        },
        json={
            "query": query % cursor,
            "variables": {
                "name": repo_name,
                "owner": repo_owner,
            }
        },
        # Same timeout as the star query, so a stalled connection cannot hang
        # the whole loop.
        timeout=600,
    )
    # Surface HTTP failures directly instead of failing later on a missing key.
    r.raise_for_status()

    history = r.json()["data"]["repository"]["defaultBranchRef"]["target"]["history"]
    return history["nodes"][0]["committedDate"]

for key in tqdm(cursor_info):
    # Dates already resolved (e.g. by an earlier run of this cell) are kept.
    if key in repo_date_map:
        continue

    info = cursor_info[key]
    owner, name = key.split('/')
    totalCount = info['totalCount']

    if totalCount > 1:
        # The end cursor is split on a space and its second field replaced by
        # totalCount - 2. Presumably the cursor has the form "<sha> <offset>",
        # so the one commit after it is the oldest -- TODO confirm.
        cursor = info['endCursor'].split(' ')
        cursor[1] = str(totalCount - 2)
        date = getDate(f"\"{' '.join(cursor)}\"", owner, name)
        repo_date_map[key] = date
    else:
        # With at most one commit, the latest commit date is also the first one.
        repo_date_map[key] = info['committedDate']

100%|██████████| 135/135 [00:08<00:00, 15.11it/s]


In [85]:
# Re-key the collected numbers by each entry's data-source name instead of
# its "owner/repo" slug.
star_info = {}

for key, star in star_counts.items():
    name = repo_name_map[key]
    star_info[name] = {'star': star, 'date': repo_date_map[key]}

In [87]:
star_info

# Write through a context manager so the file is flushed and closed even if
# serialisation fails part-way.
with open('./resources/star_info.json', 'w') as fp:
    dump(star_info, fp)