# Purpose
This is a sample notebook for downloading all of the GitHub event data that the ecosystem dashboard holds.

This is useful if you want to analyze GitHub actions beyond opening PRs and issues.  It will also show comments, PR/issue closing, etc.

This was originally put together to help with identifying top contributors on GitHub that should likely be invited to IPFS Camp 2022.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import datetime
import ecosystem_dashboard_utils

In [None]:
# Reporting window: from a fixed analysis start date up to today's date.
from datetime import date

analysis_start_date = date(2022, 9, 11)  # Adjust for how far back you want to look
report_date = datetime.date.today()

# YYYY-MM-DD stamp for the report date.
report_date_str = f"{report_date:%Y-%m-%d}"
# Whole days between the analysis start date and the report date.
number_of_days = (report_date - analysis_start_date).days

In [None]:
# The "payload" field adds a lot of data that we don't need, so we strip it out
# to make the JSON easier to consume.
def payload_filter(x: dict) -> dict:
    """Remove the "payload" entry from an event record and return the record.

    The record is modified in place and the very same object is returned.
    A record that has no "payload" key is returned unchanged instead of
    raising KeyError, so one unusual event cannot abort a whole download.
    """
    x.pop("payload", None)
    return x
# Fetch the events for every (ecosystem, org) pair and store each result in its
# own JSON file named after the ecosystem, the org and the report date.
download_targets = [
    (ecosystem, org)
    for ecosystem in ["ipfs"]  # You could add "filecoin"
    for org in ["ipfs", "ipfs-shipyard"]
]
for ecosystem, org in download_targets:
    events_path = f"{ecosystem}-{org}-events-{report_date_str}.json"
    events_url = f"https://{ecosystem}.ecosystem-dashboard.com/events.json?range={number_of_days}&org={org}&"
    # NOTE(review): the URL ends with "&", presumably so dump_api can append its
    # own paging parameters - confirm against ecosystem_dashboard_utils.
    ecosystem_dashboard_utils.dump_api(
        unpaginated_url=events_url,
        output_path=events_path,
        filter=payload_filter,
        page_size=500,
    )

In [None]:
# To analyze a previously downloaded snapshot, override the date stamp first, e.g.:
# report_date_str = "2022-05-19"

# Load every per-org events file into its own frame, indexed by the event's github_id.
frames = []
for ecosystem in ["ipfs"]:
    for org in ["ipfs", "ipfs-shipyard"]:
        events_path = f"{ecosystem}-{org}-events-{report_date_str}.json"
        ecosystem_df = pd.read_json(events_path, orient='records')
        if ecosystem_df.empty:
            # Skip orgs whose file contains no events.
            continue
        ecosystem_df = ecosystem_df.set_index("github_id")
        frames.append(ecosystem_df)

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# copies all previously accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

# Remove duplicate events.
# This is needed since there are duplicate repositories in the filecoin and ipfs ecosystem dashboards.
# https://stackoverflow.com/questions/13035764/remove-pandas-rows-with-duplicate-indices
df = df[~df.index.duplicated(keep='first')]
df

In [None]:
# Add a combined "event/action" column: "<event_type>/<action>" when the event
# has an action, otherwise just "<event_type>".
def _event_action_label(row):
    """Return the combined event/action label for one event row."""
    action = row["action"]
    # pd.isna catches NaN as well as None. NaN is truthy, so a plain truthiness
    # test would fall through to the string concatenation and raise TypeError.
    if pd.isna(action) or not action:
        return row["event_type"]
    return row["event_type"] + "/" + action


df["event/action"] = df.apply(_event_action_label, axis=1)
df

In [None]:
# Priority-1 columns are shown first, in exactly this order.
p1_col_names = ['org', 'repository_full_name', 'actor', 'event/action']

# Priority-2 columns are all remaining columns, kept in their existing order.
# list.remove raises ValueError if a priority-1 column is absent from df.
p2_col_names = list(df.columns)
for leading_col in p1_col_names:
    p2_col_names.remove(leading_col)

ordered_col_names = [*p1_col_names, *p2_col_names]
ordered_col_names

In [None]:
# Rearrange the columns so the priority columns come first.
df = df.loc[:, ordered_col_names]
df

In [None]:
# Write the combined, de-duplicated events to a CSV file stamped with the report date.
combined_csv_path = f"events-combined-cleaned-{report_date_str}.csv"
df.to_csv(combined_csv_path)

# Output
You now have tabular data for each event, which makes it easy to create pivot tables to summarize how many actions a given user took.