From c0c155038bb8aeac6036e3236e0187e7490f8275 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 3 Feb 2022 21:21:33 -0800 Subject: [PATCH 1/5] Rm click (for now). --- issues.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/issues.py b/issues.py index 8809c3b..4deb636 100644 --- a/issues.py +++ b/issues.py @@ -1,7 +1,6 @@ import os import requests import json -import click token = os.environ["GRAPH_API_KEY"] endpoint = r"https://api.github.com/graphql" @@ -278,11 +277,6 @@ def dump(self, outfile): json.dump(self.raw_data, outf) -@click.command() -def cli(): - pass - - if __name__ == "__main__": grabber = GithubIssueGrabber('query_examples/issue_activity_since_date.gql') grabber.get() From d8cd711787c9527f47a1ffb83c72e218a88b69d0 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 3 Feb 2022 21:24:06 -0800 Subject: [PATCH 2/5] Remove issue-specific helper functions. --- issues.py | 136 ------------------------------------------------------ 1 file changed, 136 deletions(-) diff --git a/issues.py b/issues.py index 4deb636..a1c5a5e 100644 --- a/issues.py +++ b/issues.py @@ -89,142 +89,6 @@ def parse_single_issue_query(data): raise e return data, last_cursor, total_num_issues -def to_ndata(data): - """ - Parse the raw json returned by a GitHub GraphQL query into an issue - dictionary. - - Parameters - ---------- - data : dict - The result of `get_all_responses` or `parse_single_issue_query` - - Returns - ------- - ndata : dict - A dictionary of issues where the keys are the issue numbers. - This is the data format expected by subsequent filtering/summarizing - functions. - """ - # Dict & list comps to decompose json - ndata = { - n['node']['number'] : { - 'title' : n['node']['title'], - 'url' : n['node']['url'], - 'numrefs' : n['node']['timelineItems']['totalCount'], - 'labels' : [lbl['node']['name'] for lbl in n['node']['labels']['edges']] - } for n in data - } - return ndata - -def filter_issues_by_label(ndata, labels_to_filter=("Triaged",)): - """ - Remove nodes from parsed node data if the node's labels contain any of - the labels in labels_to_filter - - Parameters - ---------- - ndata : dict - Dictionary of node data parsed from query response. - Result of to_ndata(query_response) - - labels_to_filter : tuple - Tuple of strings containing the names of labels to filter by - - Returns - ------- - filtered_ndata : dict - Dictionary of node data with specified nodes filtered out. - """ - if type(labels_to_filter) is not tuple: - raise TypeError('labels_to_filter must be a tuple of strings') - lbls = set(labels_to_filter) - - return { - k : v for k, v in ndata.items() if len(lbls.intersection(set(v['labels']))) == 0 - } - -def filter_issues_apply_blacklist(ndata, blacklist): - """ - Remove nodes from parsed node data if the node's number (i.e. the issue - number) is on the blacklist. - - Parameters - ---------- - ndata : dict - Dictionary of node data parsed from query response. - Result of to_ndata(query_response) - - blacklist: tuple - Tuple of ints containing the issue IDs to filter. - - Returns - ------- - filtered_ndata : dict - Dictionary of node data with specified nodes filtered out. - """ - if type(blacklist) is not tuple: - raise TypeError('blacklist bust be a tuple of integers') - blacklist = set(blacklist) - - return { - k : v for k, v in ndata.items() if k not in blacklist - } - -def generate_table(ndata, idx, num_issues=10): - """ - Generate a markdown-formatted table from the first `num_issues` nodes - in `idx`. - """ - # Initialize table - mdtable = '|Iss. \#| xrefs | Issue |\n' - mdtable += '|:-----:|:------|:------|\n' - - for issue_num in idx[:num_issues]: - node = ndata[issue_num] - mdtable += '|{}|{}|[{}]({})\n'.format( - issue_num, - node['numrefs'], - node['title'], - node['url'] - ) - return mdtable - - -def generate_top_issues_summary(ndata, num_issues=10): - """ - Generate a markdown-formatted table of GitHub issues sorted by - cross-reference count. - - Parameters - ---------- - num_issues : int - Number of issues to include in the summary table. Default = 10. - ndata : dict - Dictionary of node data parsed from query repsonse. - Result of to_ndata(query_response) - - Returns - ------- - table : str - A string containing a markdown-formatted table with the issue number, - number of cross-references, and issue title/url. - """ - - # Initialize table - mdtable = '|Iss. \#| xrefs | Issue |\n' - mdtable += '|:-----:|:------|:------|\n' - - # Sort data by num xrefs and generate summary - for node in sorted(ndata.items(), key=lambda x: x[1]['numrefs'], reverse=True)[:num_issues]: - mdtable += '|{}|{}|[{}]({})\n'.format( - node[0], - node[1]['numrefs'], - node[1]['title'], - node[1]['url'] - ) - return mdtable - class GithubIssueGrabber: """ From d44388db2b064b7635115f3fd58b58f6baa8bead Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 3 Feb 2022 21:55:43 -0800 Subject: [PATCH 3/5] Pull in pr query from other branch. --- query_examples/pr_data_query.gql | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 query_examples/pr_data_query.gql diff --git a/query_examples/pr_data_query.gql b/query_examples/pr_data_query.gql new file mode 100644 index 0000000..55274db --- /dev/null +++ b/query_examples/pr_data_query.gql @@ -0,0 +1,32 @@ +query { + repository(owner:"_REPO_OWNER_", name:"_REPO_NAME_") { + pullRequests(first:100) { + totalCount + edges { + cursor + node{ + number + state + title + createdAt + baseRefName + mergeable + author{ + login + } + authorAssociation + mergedBy{ + login + } + mergedAt + reviews(states:APPROVED){ + totalCount + } + participants(first:100){ + totalCount + } + } + } + } + } +} From 9dfc0708b4d0c7f50352f8672afce41c2ced47d3 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 3 Feb 2022 21:56:51 -0800 Subject: [PATCH 4/5] Support issue and PR queries with the same interface. Needs a lot more work. --- issues.py | 107 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 27 deletions(-) diff --git a/issues.py b/issues.py index a1c5a5e..8c18b2f 100644 --- a/issues.py +++ b/issues.py @@ -43,59 +43,94 @@ def load_query_from_file(fname, repo_owner="numpy", repo_name="numpy"): return query -def send_query(query, cursor=None): +def send_query(query, query_type, cursor=None): """ - Helper function to use the graphql query example in `query_examples` - to retrieve open numpy issues and all cross references + Send a GraphQL query via requests.post + + No validation is done on the query before sending. GitHub GraphQL is + supported with the `cursor` argument. + + Parameters + ---------- + query : str + The GraphQL query to be sent + query_type : {"issues", "pullRequests"} + The object being queried according to the GitHub GraphQL schema. + Currently only issues and pullRequests are supported + cursor : str, optional + If given, then the cursor is injected into the query to support + GitHub's GraphQL pagination. + + Returns + ------- + dict + The result of the query (json) parsed by `json.loads` + + Notes + ----- + This is intended mostly for internal use within `get_all_responses`. """ - # Modifications to request template - # TODO: Unhack this - # WARNING: This hack relies on specific structure of issues query + # TODO: Expand this, either by parsing the query type from the query + # directly or manually adding more query_types to the set + if query_type not in {"issues", "pullRequests"}: + raise ValueError( + "Only 'issues' and 'pullRequests' queries are currently supported" + ) + # TODO: Generalize this + # WARNING: The cursor injection depends on the specific structure of the + # query, this is the main reason why query types are limited to issues/PRs if cursor is not None: - cursor_ind = query.find("issues(") + len("issues(") + cursor_insertion_key = query_type + "(" + cursor_ind = query.find(cursor_insertion_key) + len(cursor_insertion_key) query = query[:cursor_ind] + f'after:"{cursor}", ' + query[cursor_ind:] # Build request payload payload = {'query' : ''.join(query.split('\n'))} response = requests.post(endpoint, json=payload, headers=headers) return json.loads(response.content) -def get_all_responses(query): +def get_all_responses(query, query_type): """ Helper function to bypass GitHub GraphQL API node limit. """ # Get data from a single response - initial_data = send_query(query) - data, last_cursor, total_num_issues = parse_single_issue_query(initial_data) - print("Retrieving {} out of {} values...".format(len(data), total_num_issues)) - # Continue requesting issues (with pagination) until all are acquired - while len(data) < total_num_issues: - rdata = send_query(query, cursor=last_cursor) - pdata, last_cursor, _ = parse_single_issue_query(rdata) + initial_data = send_query(query, query_type) + data, last_cursor, total_count = parse_single_query(initial_data, query_type) + print(f"Retrieving {len(data)} out of {total_count} values...") + # Continue requesting data (with pagination) until all are acquired + while len(data) < total_count: + rdata = send_query(query, query_type, cursor=last_cursor) + pdata, last_cursor, _ = parse_single_query(rdata, query_type) data.extend(pdata) - print("Retrieving {} out of {} values...".format(len(data), total_num_issues)) + print(f"Retrieving {len(data)} out of {total_count} values...") print("Done.") return data -def parse_single_issue_query(data): +def parse_single_query(data, query_type): """ - Parse the raw json returned by get_open_numpy_issues_with_crossrefs. + Parse the data returned by `send_query` + + .. warning:: + + Like `send_query`, the logic here depends on the specific structure + of the query (e.g. it must be an issue or PR query, and must have a + total count). """ try: - total_num_issues = data['data']['repository']['issues']['totalCount'] - data = data['data']['repository']['issues']['edges'] + total_count = data['data']['repository'][query_type]['totalCount'] + data = data['data']['repository'][query_type]['edges'] last_cursor = data[-1]['cursor'] except KeyError as e: print(data) raise e - return data, last_cursor, total_num_issues + return data, last_cursor, total_count -class GithubIssueGrabber: +class GithubGrabber: """ Pull down data via the GitHub APIv.4 given a valid GraphQL query. """ - def __init__(self, query_fname, repo_owner="numpy", repo_name="numpy"): + def __init__(self, query_fname, query_type, repo_owner="numpy", repo_name="numpy"): """ Create an object to send/recv queries related to the issue tracker for the given repository via the GitHub API v.4. @@ -108,12 +143,16 @@ def __init__(self, query_fname, repo_owner="numpy", repo_name="numpy"): query_fname : str Path to a valid GraphQL query conforming to the GitHub GraphQL schema + query_type : {"issues", "pullRequests"} + Type of object that is being queried according to the GitHub GraphQL + schema. Currently only "issues" and "pullRequests" are supported. repo_owner : str Repository owner. Default is "numpy" repo_name : str Repository name. Default is "numpy" """ self.query_fname = query_fname + self.query_type = query_type # TODO: Parse this directly from query self.repo_owner = repo_owner self.repo_name = repo_name self.raw_data = None @@ -128,7 +167,7 @@ def get(self): """ Get JSON-formatted raw data from the query. """ - self.raw_data = get_all_responses(self.query) + self.raw_data = get_all_responses(self.query, self.query_type) def dump(self, outfile): """ @@ -142,6 +181,20 @@ def dump(self, outfile): if __name__ == "__main__": - grabber = GithubIssueGrabber('query_examples/issue_activity_since_date.gql') - grabber.get() - grabber.dump("_data/issues.json") + repo = "networkx" + issues = GithubGrabber( + 'query_examples/issue_activity_since_date.gql', + 'issues', + repo_owner=repo, + repo_name=repo, + ) + issues.get() + issues.dump(f"_data/{repo}_issues.json") + prs = GithubGrabber( + 'query_examples/pr_data_query.gql', + 'pullRequests', + repo_owner=repo, + repo_name=repo, + ) + prs.get() + prs.dump(f"_data/{repo}_prs.json") From 11ec045c7295a832f3ca92a4b27aadb0e1d7b3cb Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 3 Feb 2022 21:57:29 -0800 Subject: [PATCH 5/5] Update filename to reflect more general nature. --- issues.py => query.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename issues.py => query.py (100%) diff --git a/issues.py b/query.py similarity index 100% rename from issues.py rename to query.py