# GA4GH GitHub Commit Statistics

Expects `github-stats-summary.tsv` to exist already (see Makefile for generation), along with a `forks` file listing the repos to exclude.

This code counts commits in ga4gh-org repos only. Repos outside the org are not included, and comments and issues are not counted anywhere.

### Stats over time
- [ ] num of repos
- [ ] num committers, per repo
- [ ] num commits, per repo

### Committers
- [ ] contributor longevity
- [ ] commits


## Bugs
- [ ] email addresses are not mapped to unique humans, e.g., reece@harts.net and reecehart@gmail.com are the same person, 102165525+wesleygoar@users.noreply.github.com and wesley.goar@nationwidechildrens.org are the same person, and 49699333+dependabot[bot]@users.noreply.github.com is a bot


## Setup and read data

In [1]:
import datetime
from datetime import datetime

import pandas as pd
import pytz

# Input files, resolved relative to the notebook's working directory.
stats_fn = "github-stats-summary.tsv"  # per-commit summary table (tab-separated)
forks_fn = "forks"                     # whitespace-separated names of repos to exclude

# Timezone-aware "now"; the reference point for the age columns computed later.
now = datetime.now(tz=pytz.UTC)

# Read the fork list inside a context manager so the file handle is closed
# as soon as the names have been loaded.
with open(forks_fn) as forks_file:
    fork_repos = set(forks_file.read().split())

In [7]:
# Load the per-commit table. keep_default_na=False stops pandas from turning
# empty fields into NaN; the converters map blank diffstat fields to 0 so the
# three count columns hold plain ints.
df = pd.read_csv(
    stats_fn,
    delimiter="\t",
    parse_dates=["ts"],
    keep_default_na=False,
    converters={
        col: (lambda x: int(x or 0))
        for col in ("files_changed", "insertions", "deletions")
    },
)

# Exclude forked repos. The explicit copy makes the filtered frame independent
# of the frame it was sliced from, so the column edits below always act on a
# frame we own.
df = df[~df["repo"].isin(fork_repos)].copy()

# Year and year-month bucket labels, inserted at positions 2 and 3 so they sit
# directly to the right of ts (assumes ts is the second column of the TSV).
# NOTE(review): apply() is kept instead of the .dt accessor because ts appears
# to hold mixed UTC offsets in the displayed output — confirm the column dtype
# before switching.
df.insert(2, "ts_Y", df["ts"].apply(lambda ts: ts.strftime("%Y")))
df.insert(3, "ts_YM", df["ts"].apply(lambda ts: ts.strftime("%Y-%m")))

# Remove the hash and committer_email columns from the working frame.
df = df.drop(columns=["hash", "committer_email"])

df

Unnamed: 0,repo,ts,ts_Y,ts_YM,author_email,files_changed,insertions,deletions,subject
0,ga4gh/ADA-M,2019-01-21 12:41:00-05:00,2019,2019-01,mirocupak@gmail.com,1,9,30,Remove released build dependencies
1,ga4gh/ADA-M,2018-11-28 17:04:29-05:00,2018,2018-11,fjeanson@yahoo.com,1,1,1,fixed wagger parser version error
2,ga4gh/ADA-M,2018-11-28 16:57:17-05:00,2018,2018-11,fjeanson@yahoo.com,1,1,1,fixed swagger-core clone version
3,ga4gh/ADA-M,2018-10-03 13:07:08-04:00,2018,2018-10,fjeanson@yahoo.com,1,1,1,updated .travis.yml swagger-core to 2.0.2
4,ga4gh/ADA-M,2018-04-19 16:28:57-04:00,2018,2018-04,mirocupak@gmail.com,1,0,2,Remove mention of Protocol Buffers
...,...,...,...,...,...,...,...,...,...
19883,ga4gh/workflow-execution-service-schemas,2016-04-05 16:55:50-04:00,2016,2016-04,briandoconnor@gmail.com,1,1,1,working on first pass at API
19884,ga4gh/workflow-execution-service-schemas,2016-04-05 16:53:01-04:00,2016,2016-04,briandoconnor@gmail.com,1,1,1,working on first pass at API
19885,ga4gh/workflow-execution-service-schemas,2016-04-05 16:50:44-04:00,2016,2016-04,briandoconnor@gmail.com,1,1,1,working on first pass at API
19886,ga4gh/workflow-execution-service-schemas,2016-04-05 16:31:53-04:00,2016,2016-04,briandoconnor@gmail.com,3,592,1,"initial checkin, a work in progress"


## Overview stats

In [3]:
# Headline totals computed from the commit table: distinct repos, total
# commits (one row per commit), and distinct author email addresses.
stats = pd.Series(
    data=[
        df["repo"].nunique(),
        df.shape[0],
        df["author_email"].nunique(),
    ],
    index=[
        "number of repos",
        "number of commits",
        "number of unique authors",
    ],
)
stats

number of repos              105
number of commits           9401
number of unique authors     278
dtype: int64

## Per-Repo Stats

In [4]:
# One row per repo: earliest commit timestamp, number of commits, and number
# of distinct author emails.
# Aggregations are given by name ("min", "size", "nunique") rather than by
# passing the builtin min/len callables; recent pandas warns when builtins
# such as min are passed to agg.
df_repo = df.groupby(["repo"], as_index=False).agg(
    min_ts=pd.NamedAgg(column="ts", aggfunc="min"),
    num_commits=pd.NamedAgg(column="repo", aggfunc="size"),
    num_uniq_authors=pd.NamedAgg(column="author_email", aggfunc="nunique"),
)

# Time elapsed between each repo's first commit and `now`, inserted at
# position 2 so it sits directly after min_ts.
df_repo.insert(2, "age", now - df_repo["min_ts"])
df_repo

Unnamed: 0,repo,min_ts,age,num_commits,num_uniq_authors
0,ga4gh/ADA-M,2017-05-15 14:29:11-07:00,2273 days 22:57:35.065614,21,4
1,ga4gh/Get-Started-with-GA4GH-APIs,2022-02-09 09:26:47-05:00,543 days 05:59:59.065614,142,8
2,ga4gh/Strategic-Refresh,2022-11-30 11:17:47-05:00,249 days 04:08:59.065614,6,2
3,ga4gh/TASC,2020-01-16 23:40:40+02:00,1297 days 22:46:06.065614,35,10
4,ga4gh/approval-tracker,2018-07-05 13:32:31+01:00,1858 days 07:54:15.065614,15,2
...,...,...,...,...,...
100,ga4gh/vrsatile,2021-02-17 10:23:33-05:00,900 days 05:03:13.065614,111,3
101,ga4gh/vrsatile-pydantic,2021-08-25 16:11:03-04:00,711 days 00:15:43.065614,44,4
102,ga4gh/wiki,2017-08-03 11:48:15-07:00,2194 days 01:38:31.065614,2,1
103,ga4gh/workflow-execution-server,2016-03-21 16:26:44-07:00,2693 days 21:00:02.065614,1,1


### Top repos, by commits

In [5]:
# Rank repos from most to fewest commits and show the first 20.
(
    df_repo
    .sort_values("num_commits", ascending=False)
    .head(20)
)

Unnamed: 0,repo,min_ts,age,num_commits,num_uniq_authors
39,ga4gh/ga4gh-server,2014-08-12 08:31:49-07:00,3281 days 04:54:57.065614,1057,54
93,ga4gh/vrs,2016-08-30 22:12:32-07:00,2531 days 15:14:14.065614,886,18
38,ga4gh/ga4gh-schemas,2014-03-19 17:13:40-07:00,3426 days 20:13:06.065614,752,57
98,ga4gh/vrs-python,2017-12-28 12:10:27-08:00,2047 days 00:16:19.065614,516,15
73,ga4gh/refget-compliance,2018-09-18 17:40:20+01:00,1783 days 03:46:26.065614,441,5
15,ga4gh/data-repository-service-schemas,2017-02-26 19:52:33-08:00,2351 days 16:34:13.065614,388,21
16,ga4gh/data-security,2019-03-05 09:15:41+00:00,1615 days 11:11:05.065614,370,15
12,ga4gh/compliance,2014-08-19 15:22:15-07:00,3273 days 22:04:31.065614,304,22
63,ga4gh/htsget-refserver,2019-09-10 12:09:33-04:00,1426 days 04:17:13.065614,289,7
9,ga4gh/cloud-interop-testing,2018-02-08 12:47:46-08:00,2004 days 23:39:00.065614,287,11


### Top repos, by unique authors

In [6]:
# Rank repos from most to fewest distinct authors and show the first 20.
(
    df_repo
    .sort_values("num_uniq_authors", ascending=False)
    .head(20)
)

Unnamed: 0,repo,min_ts,age,num_commits,num_uniq_authors
38,ga4gh/ga4gh-schemas,2014-03-19 17:13:40-07:00,3426 days 20:13:06.065614,752,57
39,ga4gh/ga4gh-server,2014-08-12 08:31:49-07:00,3281 days 04:54:57.065614,1057,54
104,ga4gh/workflow-execution-service-schemas,2016-04-05 14:59:37-04:00,2679 days 01:27:09.065614,236,22
12,ga4gh/compliance,2014-08-19 15:22:15-07:00,3273 days 22:04:31.065614,304,22
15,ga4gh/data-repository-service-schemas,2017-02-26 19:52:33-08:00,2351 days 16:34:13.065614,388,21
89,ga4gh/tool-registry-service-schemas,2015-12-15 10:39:20-05:00,2791 days 04:47:26.065614,178,19
93,ga4gh/vrs,2016-08-30 22:12:32-07:00,2531 days 15:14:14.065614,886,18
84,ga4gh/task-execution-schemas,2016-03-29 09:55:07-07:00,2686 days 03:31:39.065614,179,17
68,ga4gh/mme-apis,2014-11-13 12:09:59-05:00,3188 days 03:16:47.065614,140,16
16,ga4gh/data-security,2019-03-05 09:15:41+00:00,1615 days 11:11:05.065614,370,15


# Committer Stats

In [10]:
# One row per author email: earliest commit timestamp, number of commits, and
# number of distinct repos committed to.
# Aggregations are given by name ("min", "size", "nunique") rather than by
# passing the builtin min/len callables; recent pandas warns when builtins
# such as min are passed to agg.
df_author = df.groupby(["author_email"], as_index=False).agg(
    min_ts=pd.NamedAgg(column="ts", aggfunc="min"),
    num_commits=pd.NamedAgg(column="author_email", aggfunc="size"),
    num_uniq_repos=pd.NamedAgg(column="repo", aggfunc="nunique"),
)

# Time elapsed between each author's first commit and `now`, inserted at
# position 2 so it sits directly after min_ts.
df_author.insert(2, "age", now - df_author["min_ts"])
df_author

Unnamed: 0,author_email,min_ts,age,num_commits,num_uniq_repos
0,102165525+wesleygoar@users.noreply.github.com,2022-04-18 10:04:39-05:00,475 days 05:22:07.065614,5,1
1,133602264+bea-amos@users.noreply.github.com,2023-08-02 12:46:33+01:00,4 days 08:40:13.065614,1,1
2,25033562+helensch@users.noreply.github.com,2019-06-09 19:26:19+01:00,1519 days 02:00:27.065614,10,1
3,27856297+dependabot-preview[bot]@users.noreply...,2019-07-11 11:00:30-04:00,1487 days 05:26:16.065614,7,1
4,31826250+david-xliu@users.noreply.github.com,2019-09-12 11:57:20-04:00,1424 days 04:29:26.065614,28,1
...,...,...,...,...,...
273,wesley.goar@nationwidechildrens.org,2022-03-28 15:21:27-05:00,496 days 00:05:19.065614,18,1
274,yasasvini.puligundla@ga4gh.org,2021-09-24 17:00:21-04:00,680 days 23:26:25.065614,170,10
275,yeoldefortran@gmail.com,2014-12-05 15:47:57-05:00,3165 days 23:38:49.065614,9,1
276,ypriverol@gmail.com,2018-11-26 20:28:33+00:00,1713 days 23:58:13.065614,1,1


In [11]:
# Rank author emails from most to fewest commits and show the first 20.
(
    df_author
    .sort_values("num_commits", ascending=False)
    .head(20)
)

Unnamed: 0,author_email,min_ts,age,num_commits,num_uniq_repos
127,jeremy.adams@ga4gh.org,2019-07-11 15:44:04-04:00,1487 days 00:42:42.065614,933,31
221,reecehart@gmail.com,2015-10-26 16:23:59-07:00,2840 days 21:02:47.065614,823,5
110,ian.fore@nih.gov,2020-08-11 09:27:24-04:00,1090 days 06:59:22.065614,560,4
14,Alex.Wagner@nationwidechildrens.org,2020-08-18 17:08:09-04:00,1082 days 23:18:37.065614,433,12
73,david@resium.com,2015-12-09 15:35:57-08:00,2796 days 20:50:49.065614,432,6
263,travis@travis-ci.org,2018-09-28 09:33:02+00:00,1773 days 10:53:44.065614,422,1
119,james.a.eddy@gmail.com,2018-02-08 12:47:46-08:00,2004 days 23:39:00.065614,392,6
72,danny.colligan@gmail.com,2015-01-15 09:36:58-08:00,3125 days 02:49:48.065614,355,6
130,jk@well.ox.ac.uk,2014-08-12 17:23:51+01:00,3281 days 04:02:55.065614,268,3
51,briandoconnor@gmail.com,2016-04-05 14:59:37-04:00,2679 days 01:27:09.065614,202,8
