# Purpose
This is a notebook for grabbing all the GitHub event data that the ecosystem dashboard has for our "spec and improvement proposal" repos.

This is useful if you want to analyze GitHub actions beyond opening PRs and issues.  It will also show comments, PR/issue closing, etc.

It was used as part of the PL EngRes summit to populate the "Network Native Development" slide: https://docs.google.com/presentation/d/1dRgEgEpR2htMgyIVXG0fwhBMVwnAsEtXNfvrmzHTqfI/edit#slide=id.g14b7a7f445c_0_476

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import datetime
import ecosystem_dashboard_utils

In [None]:
from datetime import date

# Earliest day to include in the analysis; adjust for how far back you want to look.
analysis_start_date = date(2022, 1, 1)

# Day the report is generated, plus its YYYY-MM-DD form for use in strings.
report_date = date.today()
report_date_str = report_date.isoformat()

# Size of the look-back window, in whole days.
number_of_days = (report_date - analysis_start_date).days

In [None]:
# Ecosystem dashboard URLs are generated from these entries.
# Each tuple is (ecosystem, org, repo) and is expanded into a dict with those keys.
repo_configs = [
    {"ecosystem": ecosystem_name, "org": org_name, "repo": repo_name}
    for ecosystem_name, org_name, repo_name in (
        ("ipfs", "ipfs", "specs"),
        ("ipfs", "libp2p", "specs"),
        ("filecoin", "filecoin-project", "FIPs"),
    )
]

In [None]:
# The "payload" field adds a bunch of data that we don't need, so we strip it out to make the JSON more wieldy to consume.
def payload_filter(x):
    """Remove the "payload" entry from an event record.

    The record is modified in place and the same object is returned.
    Records that have no "payload" key are returned unchanged rather than
    raising KeyError.

    Args:
        x: A mutable mapping describing one event.

    Returns:
        The same mapping, without a "payload" key.
    """
    # pop() with a default tolerates events that never carried a payload.
    x.pop("payload", None)
    return x

# For every configured repo, hand its dashboard events URL to dump_api together
# with a date-stamped output path and the payload-stripping filter.
for repo_config in repo_configs:
    ecosystem, org, repo = (repo_config[key] for key in ("ecosystem", "org", "repo"))
    events_path = f"{org}-{repo}-events-{report_date_str}.json"
    # NOTE(review): the URL deliberately ends in "&"; presumably dump_api appends
    # its own paging parameters to it. Confirm against ecosystem_dashboard_utils.
    unpaginated_url = f"https://{ecosystem}.ecosystem-dashboard.com/events.json?range={number_of_days}&repo_full_name={org}%2F{repo}&"
    ecosystem_dashboard_utils.dump_api(
        unpaginated_url=unpaginated_url,
        output_path=events_path,
        filter=payload_filter,
        page_size=200,
    )

In [None]:
# Read everything back in that was persisted to disk.
#
# To load the files from an earlier run, override the date stamp first, e.g.:
# report_date_str = "2022-05-19"

# Collect one frame per repo and concatenate once at the end, instead of
# re-copying a growing DataFrame on every iteration.
event_frames = []
for repo_config in repo_configs:
    org = repo_config["org"]
    repo = repo_config["repo"]
    events_path = f"{org}-{repo}-events-{report_date_str}.json"
    repo_events = pd.read_json(events_path, orient='records')
    event_frames.append(repo_events.set_index("github_id"))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# repos are configured.
df = pd.concat(event_frames) if event_frames else pd.DataFrame()

In [None]:
# Combine the event and action columns for analysis later.
# Rows with no action (null or empty string) keep just the event type.
# Doing this column-wise avoids failing on NaN actions (NaN is truthy, so a
# plain truthiness check would try to add a float to a string) and also works
# when the frame has no rows.
action_text = df["action"].fillna("").astype(str)
has_action = action_text != ""
df["event/action"] = (df["event_type"] + "/" + action_text).where(has_action, df["event_type"])
df

In [None]:
# Classify each event/action by the kind of behavior it signals.

# Listed per classification for readability, then inverted into the
# event/action -> classification lookup used for the mapping.
event_actions_by_classification = {
    "issue_engagement": [
        "IssuesEvent/opened",
        "IssueCommentEvent/created",
    ],
    "code_review_engagement": [
        "PullRequestReviewCommentEvent/created",
        "PullRequestReviewEvent/created",
    ],
    "code_creation": [
        "PullRequestEvent/opened",
        "PushEvent",
    ],
}
event_action_classifications = {
    event_action: classification
    for classification, event_actions in event_actions_by_classification.items()
    for event_action in event_actions
}

# Any event/action that is not listed above ends up as NaN in the new column.
df["event_action_classification"] = df["event/action"].map(event_action_classifications)
df

In [None]:
# Work out the column order: a fixed set of leading columns first, followed by
# every remaining column in its existing order.
p1_col_names = [
    'org',
    'repository_full_name',
    'actor',
    'event/action',
    "event_action_classification",
]

# Start from all current columns and take out the leading ones; remove() raises
# ValueError if a leading column is not present in the frame.
p2_col_names = list(df.columns)
for leading_col_name in p1_col_names:
    p2_col_names.remove(leading_col_name)

ordered_col_names = p1_col_names + p2_col_names
ordered_col_names

In [None]:
# Rearrange the frame's columns to follow ordered_col_names.
df = df.loc[:, ordered_col_names]
df

In [None]:
# Summarize how many distinct "actors" took part in each type of activity, per repo.
# Unique-count aggregation with pivot_table:
# https://stackoverflow.com/questions/12860421/how-to-aggregate-unique-count-with-pandas-pivot-table
table = df.pivot_table(
    values='actor',
    index=["repository_full_name", "event_action_classification"],
    aggfunc=pd.Series.nunique,
    fill_value=0,
)
table

In [None]:
# Collect stats on PRs opened and closed: count PullRequestEvent rows per repo
# and event/action.
pull_request_events = df[df['event_type'].eq("PullRequestEvent")]
table = pull_request_events.pivot_table(
    values='id',
    index=["repository_full_name", "event/action"],
    aggfunc="count",
    fill_value=0,
)
table

In [None]:
# Debugging aid: how many events there are of each event/action kind.
table = df.pivot_table(
    values='id',
    index=['event/action'],
    aggfunc="count",
    fill_value=0,
)
table

In [None]:
# Inspect how much activity each individual has, from least to most active.
table = df.pivot_table(
    values='id',
    index=['actor'],
    aggfunc="count",
    fill_value=0,
)
table.sort_values(by="id")

In [None]:
# Write the combined, cleaned event data to a date-stamped CSV for additional analysis.
combined_events_path = f"spec-github-activity-events-combined-cleaned-{report_date_str}.csv"
df.to_csv(combined_events_path)

# Output
You now have tabular data for each event, which makes it easy to create pivot tables to summarize how many actions a given user took.