In [1]:
import pandas as pd
import boost_histogram as bh
import numpy as np

In [2]:
skhep = pd.read_csv("scikit-hep-20180101-20250101.csv", usecols=["timestamp", "file_project", "details_python"], parse_dates=["timestamp"], dtype={"file_project": "category", "details_python": "category"})
skhep.dropna(inplace=True) # Drop 11 NA's for Python version

## Classic method

In [3]:
def compute_pandas(skhep, *projects: str, year: int):
    start=f"{year}-01-01"
    end=f"{year}-12-31"
    results = {}
    for project in projects:
        val = skhep[skhep.file_project == project]
        val = val[(val.timestamp > start) & (val.timestamp < end)]
        vers = val.details_python.str[0]
        results[project] = (sum(vers == '2'), sum(vers == '3'))
    return results

In [4]:
%%time
results2020 = compute_pandas(skhep, *set(skhep.file_project), year=2020)
results2019 = compute_pandas(skhep, *set(skhep.file_project), year=2019)
results2018 = compute_pandas(skhep, *set(skhep.file_project), year=2018)

CPU times: user 2.83 s, sys: 283 ms, total: 3.11 s
Wall time: 3.12 s


## Boost Histogram method

In [5]:
def compute_bh(hist, *projects: str, year: int):
    results = {}
    for project in projects:
        ver2 = hist[bh.loc(project), bh.loc(year), bh.loc(2)]
        ver3 = hist[bh.loc(project), bh.loc(year), bh.loc(3)]
        results[project] = (ver2, ver3)
    return results

In [6]:
hist = bh.Histogram(
    bh.axis.StrCategory(list(set(skhep.file_project))),
    bh.axis.Integer(2018, 2021, underflow=False, overflow=False),
    bh.axis.Integer(2, 4, underflow=False, overflow=False),
    storage=bh.storage.Int64()
)

In [7]:
%%time
_ = hist.fill(np.asarray(skhep.file_project, dtype=str),
          np.asarray(skhep.timestamp.dt.year),
          np.asarray(skhep.details_python.str[0].astype(int)))

CPU times: user 1.57 s, sys: 186 ms, total: 1.76 s
Wall time: 1.76 s


In [8]:
%%time
results2020 = compute_bh(hist, *set(skhep.file_project), year=2020)
results2019 = compute_bh(hist, *set(skhep.file_project), year=2019)
results2018 = compute_bh(hist, *set(skhep.file_project), year=2018)

CPU times: user 441 ms, sys: 34.1 ms, total: 475 ms
Wall time: 474 ms


## Make table (from either result set)

In [9]:
def show(*args):
    results = args[0]
    keys = filter(lambda x: sum(results[x]) > 100, results)
    keys = sorted(keys, key=lambda x: results[x][0]/sum(results[x]))
    for project in keys:
        print(f"| {project:20} |", end="")
        for extra in args:
            res = ""
            if project in extra:
                v2, v3 = extra[project]
                tot = (v2 + v3) / 1000
                if tot >= 0.1:
                    frac = v2 / (v2 + v3)
                    x = 1 if tot < 10 else 0
                    y = 1 if frac < 0.1 else 0
                    res = f"{frac:5.{y}%} of {tot:4.{x}f}k"
            print(f" {res:14} |", end="")
        print("")

In [10]:
print("| Package name         | 2020 (partial) | 2019           | 2018           |")
print("|----------------------|----------------|----------------|----------------|")
show(results2020, results2019, results2018)

| Package name         | 2020 (partial) | 2019           | 2018           |
|----------------------|----------------|----------------|----------------|
| scikit-optimize      |  0.2% of 2812k |  2.5% of 1349k |   13% of   85k |
| mplhep               |  0.2% of  111k |  6.2% of  4.7k |                |
| formulate            |  0.9% of  1.2k |   18% of  0.5k |                |
| pyhf                 |  1.1% of  6.8k |   13% of  1.6k |                |
| scikit-hep-testdata  |  1.4% of  4.6k |  2.4% of  3.5k |                |
| pyjet                |  1.5% of  4.6k |   23% of  2.1k |   44% of  0.2k |
| vegascope            |  2.8% of  0.7k |  1.3% of  5.7k |   48% of  0.1k |
| awkward1             |  2.8% of  4.4k |   15% of  0.6k |                |
| reana-client         |  4.0% of  1.4k |   42% of  0.2k |   48% of  0.1k |
| aghast               |  6.3% of  0.1k |   26% of  0.4k |                |
| boost-histogram      |  7.4% of  1.3k |   19% of  0.3k |                |
| iminuit   