---
title: Software Peer Review Over Time -- pyOpenSci
subtitle: pyOpenSci Peer Review Summary Stats
license:
  code: MIT
---

In [60]:
import warnings
from datetime import datetime

import altair as alt
import pandas as pd
from IPython.display import HTML, display
from pyosmeta import ProcessIssues
from pyosmeta.github_api import GitHubAPI




pd.options.mode.chained_assignment = None
pd.options.future.infer_string = True

# Suppress all warnings
warnings.filterwarnings("ignore")


def load_poppins_font():
    """Load the Poppins font from Google Fonts."""
    display(HTML('<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@400;700&display=swap" rel="stylesheet">'))

def poppins_theme():
    """Define the custom Poppins theme for Altair."""
    return {
        'config': {
            'title': {
                'fontSize': 20,
                'font': 'Poppins',
                'anchor': 'start',
                'color': 'black',
                'anchor': 'middle', # centered
                'fontWeight': 400,
                'dy': -20,
                'dx': 10
            },
            'axis': {
                'labelFontSize': 12,
                'titleFontSize': 14,
                'titleFont': 'Poppins',
                'labelFont': 'Poppins',
                'labelFontSize': 14, 
            },
            'axisX': {  # Configuration specifically for the x-axis
                'labelAngle': 0},
            'legend': {
                'labelFontSize': 12,
                'titleFontSize': 14,
                'titleFont': 'Poppins',
                'labelFont': 'Poppins'
            },
            'bar': {
                'color': 'purple',
                'fill': 'purple'
            },
            'line': {
                'color': 'purple'
            },
            'view': {
                'height': 300,
                'width': 600,  # Default chart width
                'padding': {"top": 190, "left": 10, "right": 10, "bottom": 90}
            }
        }
    }

def register_and_enable_poppins_theme():
    """Register and enable the Poppins theme in Altair."""
    alt.themes.register('poppins_theme', poppins_theme)
    alt.themes.enable('poppins_theme')



# https://github.com/ryantam626/jupyterlab_code_formatter

In [61]:
label_map = {
    "pre-review": ["New Submission!", "0/pre-review-checks"],
    "seeking editor": ["0/seeking-editor"],
    "under-review": [
        "1/editor-assigned",
        "2/seeking-reviewers",
        "3/reviewers-assigned",
        "4/reviews-in-awaiting-changes",
        "5/awaiting-reviewer-response",
    ],
    "pyos-accepted": ["6/pyos-approved"],
    "joss-accepted": ["9/joss-approved"],
}

issue_map = {
    "New Submission!": "pre-review",
    "0/pre-review-checks": "pre-review",
    "0/seeking-editor": "seeking editor",
    "1/editor-assigned": "under-review",
    "2/seeking-reviewers": "under-review",
    "3/reviewers-assigned": "under-review",
    "4/reviews-in-awaiting-changes": "under-review",
    "5/awaiting-reviewer-response": "under-review",
    "6/pyOS-approved": "pyos-accepted",
    "9/joss-approved": "joss-accepted",
}

In [62]:
# Register the dashboard altair theme

# Load the Poppins font
load_poppins_font()

# Register and enable the Poppins theme
register_and_enable_poppins_theme()


# TODO: This should end up in pyos meta!!
def set_review_status(labels, issue_map=issue_map):
    highest_label = None
    highest_value = -1

    # Check for special conditions
    if "currently-out-of-scope" in labels:
        return "out of scope"
    elif any(
        label in labels for label in ["⌛ pending-maintainer-response", "on-hold"]
    ):
        return "on hold"

    for i, label in enumerate(labels):
        if "/" not in label:
            continue

        value = int(label.split("/")[0])

        if value > highest_value:
            highest_label = labels[i]

    # If highest_label is set, map it; otherwise, default to 'pre-review'
    return issue_map.get(highest_label)


current_date = datetime.today().date()
today = current_date.strftime("%d %B %Y")  # Format: YYYY-MM-DD

Last updated: **{eval}`today`**

In [63]:
# Get a list of reviews submitted to us
# This potentially doesn't include issues that were deemed out of scope...
github_api = GitHubAPI(
    org="pyopensci",
    repo="software-submission",
    labels=[
        "0/seeking-editor",
        "0/pre-review-checks",
        "1/editor-assigned",
        "2/seeking-reviewers",
        "3/reviewers-assigned",
        "4/reviews-in-awaiting-changes",
        "5/awaiting-reviewer-response",
        "6/pyOS-approved",
        "7/under-joss-review",
        "8/joss-review-complete",
        "9/joss-approved",
        "New Submission!",
    ],
)
process_review = ProcessIssues(github_api)
issues = process_review.get_issues()
reviews, errors = process_review.parse_issues(issues)

In [64]:
# Get presubmissions
github_api_pre = GitHubAPI(
    org="pyopensci",
    repo="software-submission",
    labels=[
        "presubmission",
    ],
)
process_review_pre = ProcessIssues(github_api_pre)
issues_pre = process_review_pre.get_issues()
reviews_pre, errors = process_review_pre.parse_issues(issues_pre)

In [65]:
# All presubmissions over time
pre_submission_table = [
    {
        "package_name": name,
        "date_opened": review.created_at,
        "date_closed": review.closed_at,
        "labels": review.labels,
    }
    for name, review in reviews_pre.items()
]

pre_submission_reviews_df = pd.DataFrame(pre_submission_table)
total_presubmissions = len(pre_submission_reviews_df)

In [66]:
# All full reviews
review_table = [
    {
        "package_name": name,
        "date_opened": review.created_at,
        "date_closed": review.closed_at,
        "labels": review.labels,
    }
    for name, review in reviews.items()
]

reviews_df = pd.DataFrame(review_table)
total_submissions = len(reviews_df)

In [67]:
# TODO: check presubmission counts!
presub_count = pre_submission_reviews_df[
    pre_submission_reviews_df["date_closed"].isna()
]
presub_open_count = len(presub_count)



In [68]:
# TODO: 1 package is missing from under-review - that package is on hold 
# for index, row in reviews_df.iterrows():
#     print(f"PACKAGE: {row['package_name']}")  # Print the current label and row index
#     status = set_review_status(row['labels'])  # Call your function with the label
#     labels = row["labels"]
#     print(f"LABELS: {labels}")
#     print(f"STATUS: {status}")  # Print the resulting status
#     reviews_df.at[index, 'status'] = status  
# Calculate status on all reviews including accepted ones

reviews_df["status"] = reviews_df["labels"].apply(set_review_status)
review_status_ct = reviews_df["status"].value_counts().reset_index()
review_status_ct.rename(columns={"labels": "status"}, inplace=True)

review_status_ct

Unnamed: 0,status,count
0,pyos-accepted,18
1,joss-accepted,17
2,on hold,14
3,under-review,11
4,seeking editor,3
5,out of scope,2
6,pre-review,1


In [69]:
# This is less accurate because it grabs on hold issues
active_open_reviews = reviews_df[reviews_df["status"].isin(["seeking editor", 
                                                            "under-review", 
                                                            "pre-review"])]
open_reviews = active_open_reviews[active_open_reviews["date_closed"].isna()]
open_count = len(open_reviews)

## Current review status 

Below is an overview of Python package reviews that are currently open in the 
pyOpenSci peer review process. 

In [70]:
# TODO: double check counts
seeking_editor = open_reviews[
    open_reviews["labels"].apply(lambda x: "0/seeking-editor" in x)
]
seeking_editor_count = len(seeking_editor)

data = pd.DataFrame(
    {
        "Category": ["Open Reviews", "Pre-submission Inquiries"],
        "Count": [open_count, presub_open_count],
    }
)

bar_chart = (
    alt.Chart(data)
    .mark_bar(color="purple")
    .encode(
        x=alt.X("Category:N", title="Category", axis=alt.Axis(labelAngle=0)),
        y=alt.Y("Count:Q", title="Count"),
        tooltip=["Category", "Count"],
    )
    .properties(
        title=alt.TitleParams(text="pyOpenSci: Current Open Reviews"), width=600
    )
    .interactive()
)

bar_chart.show()

We currently have {eval}`open_count` packages in that are actively engaged 
our open software peer review process and 
{eval}`presub_open_count` open Python package peer review pre-submission 
requests.

In [71]:
# Set review status

## Current review status 

The plot below shows the status of our current open and closed reviews. 

A few of these numbers are a bit off (or out of sync with our project board)
But the numbers right now are close!

In [72]:
chart = (
    alt.Chart(review_status_ct)
    .mark_bar()
    .encode(
        x=alt.X(
            "status",
            title="Review Status",
            sort=[
                "pre-review",
                "seeking editor",
                "under-review",
                "pyos-accepted",
                "joss-accepted",
                "on-hold",
                "out-of-scope",
            ],
        ),
        y=alt.Y(
            "count",
            axis=alt.Axis(tickCount=5),
            title="Count",
            scale=alt.Scale(domain=[0, 20]),
        ),
        tooltip=[
            alt.Tooltip("status:N", title="Status"),
            alt.Tooltip("count:Q", title="Count"),
        ],
    )
    .properties(title="Count of Packages by Status", width="container")
)

# Display the chart
chart.show()

In [73]:
# This calculates status on currently open reviews
open_reviews["status"] = open_reviews["labels"].apply(set_review_status)
status_counts = open_reviews["status"].value_counts().reset_index()

In [74]:
chart = (
    alt.Chart(status_counts)
    .mark_bar()
    .encode(
        x=alt.X("status"),
        y="count",
        tooltip=[
            alt.Tooltip("status:N", title="Status"),
            alt.Tooltip("count:Q", title="Count"),
        ],
    )
    .properties(title="Review Status for Current Reviews", width="container")
)

# Display the chart
chart.show()

In [75]:
# Should be 71 reviews and 56 presubmissions?
# Total presubmissions - get the total number of pre-submission inquiries (all time)
# Get presubmissions separately

# Total Presubmissions

Here we removed all issues that were help-wanted or issus with our templates that were not related to a software-review submission. As of today we have had 

* pyOpenSci has had **{eval}`total_submissions` total review submissions**


Below is a summary of total submissions per month since pyOpenSci started it's 
peer review process in 2019. We halted peer review in the fall 2022 given our 
executive director had funding to spend full time effort on the organization!

We reopened in the winter of 2023. That is where you see the increase in new 
submissions begin.


In [76]:
# Extract year and quarter from the 'date_opened' column
reviews_df.loc[:, "year_quarter"] = reviews_df["date_opened"].dt.to_period("Q")
quarterly_counts = reviews_df["year_quarter"].value_counts().sort_index().reset_index()

quarterly_counts["year_quarter"] = quarterly_counts["year_quarter"].astype(str)

In [77]:
axis_labels = """datum.label == '2019Q1' ? '2019 Q1' : 
                datum.label == '2020Q1' ? '2020 Q1' : 
                datum.label == '2021Q1' ? '2021 Q1' : 
                datum.label == '2022Q1' ? '2022 Q1' :
                datum.label == '2023Q1' ? '2023 Q1' :
                datum.label == '2024Q1' ? '2024 Q1' :
                '' """
chart = (
    alt.Chart(quarterly_counts)
    .mark_bar(color="purple")
    .encode(
        x=alt.X(
            "year_quarter:O",
            title="Year-Quarter",
            axis=alt.Axis(
                labelAngle=0,
                labelExpr=axis_labels,
            ),
        ),
        y=alt.Y(
            "count:Q",
            title="Number of Submissions",
            axis=alt.Axis(tickCount=5),
        ),
        tooltip=[
            alt.Tooltip("year_quarter:O", title="Quarter"),
            alt.Tooltip("count:Q", title="Number of Issues"),
        ],
    )
    .properties(
        title="Number of Submissions by Quarter per Year",
        width="container",
        height=400,
    )
)

chart.show()

In [78]:
# Group issues by year and get counts
annual_issues = reviews_df.copy()

# Create a new column 'year' by extracting the year from the 'date_opened' column
annual_issues.loc[:, "year"] = annual_issues["date_opened"].dt.year

In [79]:
# Add year / month
annual_issues["year_month"] = annual_issues["date_opened"].dt.to_period("M")
counts_month_year = annual_issues.groupby("year_month").size().reset_index(name="count")

In [80]:
# Create a complete range of year_month periods
# Note i use this below - don't have to recalculate
all_month_years = pd.period_range(
    start=counts_month_year["year_month"].min(),
    end=counts_month_year["year_month"].max(),
    freq="M",
)

issues_by_year = (
    annual_issues.groupby("year")
    .size()
    .reset_index(name="count")
    .sort_values(by="year", ascending=False)
    .reset_index(drop=True)
)

Similarly you can see the growth in issues submitted to pyOpenSci thanks to 
both Sloan and CZI funding in the chart below. As of August 2024, we have 
almost surpassed total peer review submissions submitted in 2023. 

In [81]:
# Create an Altair bar chart
chart = (
    alt.Chart(issues_by_year)
    .mark_bar(color="purple")
    .encode(
        x=alt.X(
            "year:O",
            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=18),
            sort=alt.EncodingSortField(field="year", order="ascending"),
        ),
        y=alt.Y(
            "count:Q",
            axis=alt.Axis(labelFontSize=14, titleFontSize=18),
        ),
        tooltip=["year", "count"],
    )
    .properties(
        title=alt.TitleParams(
            text="pyOpenSci -- Number of Issues by Year", fontSize=24
        ),
        width=600,
    )
)

chart.show()

In [82]:
# Get fill in months with no issues with a value of 0
month_year_counts = (
    counts_month_year.set_index("year_month")
    .reindex(all_month_years, fill_value=0)
    .rename_axis("year_month")
    .reset_index()
)

# Summary: issues by month / year

Below you can see scientific Python peer review issues submitted by month since 2019. 

In [83]:
# Split year_month into separate year and month columns
month_year_counts["year"] = month_year_counts["year_month"].dt.year
month_year_counts["month"] = month_year_counts["year_month"].dt.strftime("%b")
month_year_counts["month_cat"] = pd.Categorical(
    month_year_counts["month"],
    categories=[
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    ],
    ordered=True,
)
month_year_counts = month_year_counts.drop(columns=["year_month"])

## Peer review cadence -- slower months 

Historically since 2019 July and December  have been slower months. These could be ideal times for us to take a peer review breather, not accept new reviews, and catch up on business and documentation items. 

In [84]:
month_year_counts

# Summarize total counts per month
monthly_counts = (
    month_year_counts.groupby("month")["count"]
    .sum()
    .reindex(
        [
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ],
        fill_value=0,
    )
    .reset_index()
)

# Create the Altair plot
chart = (
    alt.Chart(monthly_counts)
    .mark_bar()
    .encode(
        x=alt.X(
            "month",
            sort=[
                "Jan",
                "Feb",
                "Mar",
                "Apr",
                "May",
                "Jun",
                "Jul",
                "Aug",
                "Sep",
                "Oct",
                "Nov",
                "Dec",
            ],
        ),
        y="count",
        tooltip=["month", "count"],
    )
    .properties(title="Total Counts per Month (2019-2024)", width="container", height=400)
    .configure_axis(labelAngle=0)
)

chart.show()

# Issues opened by month / year

# Number of Issues per Month Since 2019

Below is a cumulative sum representation of all of our peer review issues submitted to date. You can see that there is a significant uptick of issues submitted that began when we were able to utilize our funding and have a full time staff person (the Executive Director) onboard. 

In [85]:
# Set 'date_opened' column as index / add month and year cols for grouping
monthly_issues = reviews_df.copy()
monthly_issues["month"] = monthly_issues["date_opened"].dt.month
monthly_issues["year"] = monthly_issues["date_opened"].dt.year
# Get monthly counts
monthly_issues_index = monthly_issues.copy()

monthly_issues_index.set_index(
    monthly_issues_index["date_opened"].dt.to_period("M").dt.strftime("%Y-%m"),
    inplace=True,
)

# Group by the new index (month-year) and count the number of issues for each month-year
monthly_counts = monthly_issues_index.groupby(level=0).size()

In [86]:
# Create a df for every month/year combo in our dataset - this ensures a date for every
# month even if some months are missing
all_month_years = pd.date_range(
    start=monthly_issues.date_opened.min().strftime("%Y-%m"),
    end=monthly_issues.date_opened.max().strftime("%Y-%m"),
    freq="MS",
).to_period("M")

In [87]:
final_monthly = monthly_counts.copy()
# Ensure the index is of type periodIndex to support reindexing
final_monthly.index = pd.PeriodIndex(final_monthly.index, freq="M")
final_monthly = final_monthly.reindex(all_month_years, fill_value=0).to_frame(
    name="issue_count"
)

# Calculate cumulative sum of issue count
final_monthly["cumulative_count"] = final_monthly["issue_count"].cumsum()
final_monthly.reset_index(inplace=True, names="date")
final_monthly["date"] = final_monthly["date"].dt.to_timestamp()

In [88]:
# Create an Altair line plot
chart = (
    alt.Chart(final_monthly)
    .mark_line(color="purple", strokeWidth=8)
    .encode(
        x=alt.X(
            "date:T",
            axis=alt.Axis(
                title="Month",
                format="%b-%Y",
                tickCount="year",
            ),
        ),
        y=alt.Y(
            "cumulative_count:Q",
            axis=alt.Axis(
                title="Number of Issues",
                tickMinStep=50,
            ),
        ),
        tooltip=[
            alt.Tooltip("date:T", title="Month"),
            alt.Tooltip("cumulative_count:Q", title="Number of Issues"),
        ],
    )
    .properties(
        title=alt.TitleParams(
            text="Cumulative Review Submissions Over Time",
        ),
        width=600,
        height=400,
    )
)

label = (
    alt.Chart(
        pd.DataFrame(
            {
                "date": [
                    pd.Timestamp("2023-01-01")
                ],  # Specific x-axis location (January 2023)
                "cumulative_count": [16],
                "label": ["Full Time Funding"],
            }
        )
    )
    .mark_text(align="left", dx=5, dy=-10, color="black", fontSize=12, font="Poppins")
    .encode(x="date:T", y="cumulative_count:Q", text="label:N")
)

# Adding an arrow using mark_rule and mark_point
arrow = (
    alt.Chart(
        pd.DataFrame(
            {
                "date": [pd.Timestamp("2023-01-01")],
                "cumulative_count": [
                    20
                ],  # Adjust y-axis location to position the arrow
            }
        )
    )
    .mark_point(shape="triangle", angle=0, size=50, color="black")
    .encode(x="date:T", y="cumulative_count:Q")
)

# Combine the chart and the label
final_chart = chart + label + arrow

# Show the final chart
final_chart.show()

# Show the chart
# chart.show()