# About
This notebook queries the Ecosystem Dashboard to get a dump of repositories under the various PL GitHub organizations.

The Ecosystem Dashboard is queried because:
1. Avoids getting throttled by GitHub 
2. Doesn't require any API token setup
3. Has additional metadata about our repos

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import datetime
import ecosystem_dashboard_utils

In [None]:
# Date the notebook was run; its YYYY-MM-DD form is embedded in every output filename below.
report_date = datetime.date.today()
report_date_str = report_date.isoformat()

In [None]:
# Fetch repo data from the Ecosystem Dashboard
# Output files are date-stamped so dumps from different days do not overwrite each other.
ipfs_repos_path = f"ipfs-repos-{report_date_str}.json"
filecoin_repos_path = f"filecoin-repos-{report_date_str}.json"

# One (API endpoint, output file) pair per dashboard, processed in the order listed.
dashboard_dumps = [
    ("https://ipfs.ecosystem-dashboard.com/repositories.json?", ipfs_repos_path),
    ("https://filecoin.ecosystem-dashboard.com/repositories.json?", filecoin_repos_path),
]
for dashboard_url, dump_path in dashboard_dumps:
    # NOTE(review): dump_api is a project helper; presumably it pages through the
    # endpoint and writes the combined result to output_path -- confirm in
    # ecosystem_dashboard_utils.
    ecosystem_dashboard_utils.dump_api(unpaginated_url=dashboard_url, output_path=dump_path)

In [None]:
# Read both JSON dumps into Pandas and stack them into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0 (the old call raises AttributeError there).
df = pd.concat(
    [
        pd.read_json(ipfs_repos_path, orient='records'),
        pd.read_json(filecoin_repos_path, orient='records'),
    ],
    ignore_index=True,  # discard the per-file row numbers, as the original append did
)
# Index rows by the "github_id" field of each record (raises KeyError if it is absent).
df = df.set_index("github_id")
df

In [None]:
# Put the most important columns first
p1_col_names = [
    'org',
    'full_name',
    'language',
    'score',
    'stargazers_count',
    'forks_count',
    'subscribers_count',
    'open_issues_count',
    'archived',
    'description',
]

# Fail early with a message that names every missing priority column. The previous
# list.remove() loop raised "list.remove(x): x not in list", which does not say
# which column was absent from the fetched data.
missing_col_names = [col_name for col_name in p1_col_names if col_name not in df.columns]
if missing_col_names:
    raise ValueError(f"Expected columns are missing from the repository data: {missing_col_names}")

# Every remaining column keeps its original relative order after the priority ones.
p2_col_names = [col_name for col_name in df.columns if col_name not in p1_col_names]

ordered_col_names = p1_col_names + p2_col_names
ordered_col_names

In [None]:
# Apply the column order computed above: all rows, columns in ordered_col_names order.
df = df.loc[:, ordered_col_names]
df

In [None]:
# Sort key -> ascending flag, in priority order: group by org and language,
# put the highest score first within each group, and break ties by full_name.
sort_directions = {
    "org": True,
    "language": True,
    "score": False,
    "full_name": True,
}
df = df.sort_values(
    by=list(sort_directions),
    ascending=list(sort_directions.values()),
)
df

In [None]:
# Write the reordered, sorted frame to a date-stamped CSV (the github_id index is included).
cleaned_csv_path = f"pl-repos-cleaned-{report_date_str}.csv"
df.to_csv(cleaned_csv_path)

# Additional analysis example: filter to active JS repos
Filter down to the list of repos that are JavaScript- or TypeScript-based and not archived.

In [None]:
# Rows whose language column is JavaScript or TypeScript.
js_language_mask = df['language'].isin(["JavaScript", "TypeScript"])
# `!= True` is kept (rather than `~`) so rows with a missing `archived` value
# still count as active instead of raising or being dropped.
not_archived_mask = df['archived'] != True
js_df = df[js_language_mask & not_archived_mask]
js_df

In [None]:
# Write the active JS/TS subset to its own date-stamped CSV.
js_csv_path = f"pl-js-active-repos-cleaned-{report_date_str}.csv"
js_df.to_csv(js_csv_path)