---
title: Software Peer Review Over Time -- pyOpenSci
subtitle: pyOpenSci Peer Review Summary Stats
license:
  code: MIT
---



In [9]:
import os
import warnings
from datetime import datetime

import altair as alt
import pandas as pd
from IPython.display import display, HTML

from pyosmeta import ProcessIssues
from pyosmeta.github_api import GitHubAPI

pd.options.mode.chained_assignment = None
# pyArrow needs to be installed
pd.options.future.infer_string = True

# Suppress all warnings
warnings.filterwarnings('ignore')

# https://github.com/ryantam626/jupyterlab_code_formatter

In [10]:
# Load the Poppins font from Google Fonts (once) so that the chart theme and
# text marks that request the 'Poppins' font can render with it.
display(HTML('<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@400;700&display=swap" rel="stylesheet">'))

# Run date formatted as "<day> <month name> <year>"; the "Last updated" text
# in the notebook's markdown evaluates this variable.
today = datetime.today().date().strftime("%d %B %Y")

# Define the custom theme
def poppins_theme():
    """Return the Altair theme configuration used by this notebook's charts.

    Returns
    -------
    dict
        A dictionary with a single ``'config'`` key that holds the title,
        axis, legend, bar, line and view settings (Poppins font, purple
        marks, 600 x 300 default view).
    """
    return {
        'config': {
            # https://altair-viz.github.io/user_guide/configuration.html#config-title
            'title': {
                'fontSize': 20,
                'font': 'Poppins',
                # The literal used to list 'anchor' twice ('start', then
                # 'middle'); only the last value took effect, so keep that one.
                'anchor': 'middle',  # centered
                'color': 'black',
                'fontWeight': 400,
                'dy': -20,
                'dx': 10
            },
            'axis': {
                # 'labelFontSize' was also listed twice (12, then 14); 14 is
                # the value that was actually applied.
                'labelFontSize': 14,
                'titleFontSize': 14,
                'titleFont': 'Poppins',
                'labelFont': 'Poppins',
            },
            'legend': {
                'labelFontSize': 12,
                'titleFontSize': 14,
                'titleFont': 'Poppins',
                'labelFont': 'Poppins'
            },
            'bar': {
                'color': 'purple',
                'fill': 'purple'
            },
            'line': {
                'color': 'purple'
            },
            'view': {
                'height': 300,
                'width': 600,  # Default chart width
                # NOTE(review): confirm that 'padding' has any effect when
                # nested under 'view' rather than at the top level of 'config'.
                'padding': {"top": 190, "left": 10, "right": 10, "bottom": 90}
            }
        }
    }

# Register the theme function with Altair under a chosen name, then enable it.
theme_name = 'poppins_theme'
alt.themes.register(theme_name, poppins_theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('poppins_theme')

Last updated: **{eval}`today`**

In [12]:
# Get a list of reviews submitted to us.
# This potentially doesn't include issues that were deemed out of scope...
review_labels = [
    "0/seeking-editor",
    "0/pre-review-checks",
    "1/editor-assigned",
    "2/seeking-reviewers",
    "3/reviewers-assigned",
    "4/reviews-in-awaiting-changes",
    "5/awaiting-reviewer-response",
    "6/pyOS-approved",
    "7/under-joss-review",
    "8/joss-review-complete",
    "9/joss-approved",
    "New Submission!",
]
github_api = GitHubAPI(
    org="pyopensci", repo="software-submission", labels=review_labels
)

# Fetch the matching issues, then parse them into `reviews`; `errors` is the
# second value returned by the parser.
process_review = ProcessIssues(github_api)
issues = process_review.get_issues()
reviews, errors = process_review.parse_issues(issues)

In [13]:
# Get presubmissions: same org/repo as the full reviews, but filtered on the
# single "presubmission" label.
presubmission_labels = ["presubmission"]
github_api_pre = GitHubAPI(
    org="pyopensci", repo="software-submission", labels=presubmission_labels
)

process_review_pre = ProcessIssues(github_api_pre)
issues_pre = process_review_pre.get_issues()
# NOTE(review): this rebinds `errors`, replacing the value returned when the
# full review issues were parsed earlier in the notebook.
reviews_pre, errors = process_review_pre.parse_issues(issues_pre)

In [14]:
# All presubmissions over time: one record per entry in `reviews_pre`, keeping
# the name, open/close dates and labels.
pre_submission_table = [
    dict(
        package_name=pkg_name,
        date_opened=inquiry.created_at,
        date_closed=inquiry.closed_at,
        labels=inquiry.labels,
    )
    for pkg_name, inquiry in reviews_pre.items()
]

pre_submission_reviews_df = pd.DataFrame(pre_submission_table)
# Row count == number of presubmission inquiries (open and closed).
total_presubmissions = pre_submission_reviews_df.shape[0]

In [15]:
# Full reviews: one record per entry in `reviews`, keeping the name,
# open/close dates and labels.
review_table = [
    dict(
        package_name=pkg_name,
        date_opened=submission.created_at,
        date_closed=submission.closed_at,
        labels=submission.labels,
    )
    for pkg_name, submission in reviews.items()
]

reviews_df = pd.DataFrame(review_table)
# Row count == number of review submissions (open and closed).
total_submissions = reviews_df.shape[0]

In [16]:
# An issue with no close date is treated as still open.
presub_is_open = pre_submission_reviews_df["date_closed"].isna()
presub_count = pre_submission_reviews_df[presub_is_open]  # rows, not a number
presub_open_count = presub_count.shape[0]

review_is_open = reviews_df["date_closed"].isna()
open_reviews = reviews_df[review_is_open]
open_count = open_reviews.shape[0]

In [17]:
# Open reviews whose label collection contains "0/seeking-editor".
# NOTE(review): the original author's note here reads "This should be 3 not 2".
has_seeking_editor_label = open_reviews["labels"].apply(
    lambda issue_labels: "0/seeking-editor" in issue_labels
)
seeking_editor = open_reviews[has_seeking_editor_label]
seeking_editor_count = len(seeking_editor)

# Step 1: two-row frame feeding the current-submissions bar plot
data = pd.DataFrame(
    {
        "Category": ["Open Reviews", "Pre-submission Inquiries"],
        "Count": [open_count, presub_open_count],
    }
)

# Step 2: build the bar plot from named encodings
overview_x = alt.X("Category:N", title="Category", axis=alt.Axis(labelAngle=0))
overview_y = alt.Y("Count:Q", title="Count")
bar_chart = (
    alt.Chart(data)
    .mark_bar(color="purple")
    .encode(x=overview_x, y=overview_y, tooltip=["Category", "Count"])
    .properties(
        title=alt.TitleParams(text="Overview of Issues and Submissions"),
        width=600,
    )
    .interactive()  # Makes the chart interactive
)

# Step 3: display the chart
bar_chart.show()

We currently have {eval}`open_count` open reviews underway and {eval}`presub_open_count` open presubmissions.

## Other stats

* {eval}`seeking_editor_count` reviews are currently seeking an editor. This means that these Python packages have passed preliminary pre-review checks and are moving on to the review process. Finding an editor can sometimes take time when our editorial team is busy. 
* Of these, XXX are on hold (**TODO:** query this next).

In [18]:
# Should be 71 reviews and 56 presubmissions?
# Total presubmissions - get the total number of pre-submission inquiries (all time)
# Get presubmissions separately


# Total Presubmissions

Here we removed all issues that were help-wanted, or issues with our templates, that were not related to a software-review submission. As of today:

* pyOpenSci has had **{eval}`total_submissions` total review submissions**


Below is a summary of total submissions per month since pyOpenSci started its 
peer review process in 2019. We halted peer review in the fall of 2022, given that our 
executive director had funding to spend full-time effort on the organization!

We reopened in the winter of 2023. That is where you see the increase in new 
submissions begin.


In [19]:
# Tag every review with the calendar quarter in which it was opened.
opened_quarter = reviews_df["date_opened"].dt.to_period("Q")
reviews_df.loc[:, "year_quarter"] = opened_quarter

# Count reviews per quarter, in chronological order, as a two-column frame.
quarter_tally = reviews_df["year_quarter"].value_counts().sort_index()
quarterly_counts = quarter_tally.reset_index()

# Store the quarter as text (e.g. "2019Q1").
quarterly_counts = quarterly_counts.assign(
    year_quarter=quarterly_counts["year_quarter"].astype(str)
)

In [20]:
# Vega expression for the x-axis tick labels: show "<year> Q1" on the first
# quarter of every year and leave all other ticks blank. Tick labels have the
# form "<yyyy>Q<n>" (e.g. "2019Q1"), so characters 0-3 are the year and the
# remainder is the quarter. Written generically so that years after 2024 are
# labelled too, without editing a hard-coded list.
axis_labels = (
    "substring(datum.label, 4) == 'Q1' "
    "? substring(datum.label, 0, 4) + ' Q1' "
    ": ''"
)

# Bar chart of the number of submissions opened in each quarter.
chart = (
    alt.Chart(quarterly_counts)
    .mark_bar(color="purple")
    .encode(
        x=alt.X(
            "year_quarter:O",
            title="Year-Quarter",
            axis=alt.Axis(
                labelAngle=0,
                labelExpr=axis_labels,
            ),
        ),
        y=alt.Y(
            "count:Q",
            title="Number of Submissions",
            axis=alt.Axis(tickCount=5),
        ),
        tooltip=[
            alt.Tooltip("year_quarter:O", title="Quarter"),
            alt.Tooltip("count:Q", title="Number of Issues"),
        ],
    )
    .properties(
        title="Number of Submissions by Quarter per Year",
        width=600,
        height=400,
    )
)

chart.show()

In [21]:
# Work on a copy of the reviews that carries an extra 'year' column holding
# the year taken from 'date_opened' (used for the per-year grouping).
annual_issues = reviews_df.assign(year=reviews_df["date_opened"].dt.year)

In [22]:
# Add the year-month period in which each issue was opened ...
opened_month = annual_issues["date_opened"].dt.to_period("M")
annual_issues["year_month"] = opened_month

# ... and count the issues in each of those periods.
issues_per_month = annual_issues.groupby("year_month").size()
counts_month_year = issues_per_month.reset_index(name="count")

In [23]:
# Every monthly period from the first to the last month that has an issue.
# This is reused when zero-filling the monthly counts further down.
# NOTE(review): `all_month_years` is assigned again later in the notebook
# (built there with pd.date_range).
observed_months = counts_month_year["year_month"]
all_month_years = pd.period_range(
    start=observed_months.min(), end=observed_months.max(), freq="M"
)

# Issues per year, newest year first, with a fresh 0..n-1 index.
yearly_sizes = annual_issues.groupby("year").size().reset_index(name="count")
issues_by_year = yearly_sizes.sort_values(by="year", ascending=False).reset_index(
    drop=True
)

Similarly you can see the growth in issues submitted to pyOpenSci thanks to 
both Sloan and CZI funding in the chart below. As of August 2024, we have 
almost surpassed total peer review submissions submitted in 2023. 

In [24]:
# Bar chart of issues per year; the encodings are built first, then assembled.
yearly_x = alt.X(
    "year:O",
    axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=18),
    sort=alt.EncodingSortField(field="year", order="ascending"),
)
yearly_y = alt.Y(
    "count:Q",
    axis=alt.Axis(labelFontSize=14, titleFontSize=18),
)
yearly_title = alt.TitleParams(
    text="pyOpenSci -- Number of Issues by Year", fontSize=24
)

chart = (
    alt.Chart(issues_by_year)
    .mark_bar(color="purple")
    .encode(x=yearly_x, y=yearly_y, tooltip=["year", "count"])
    .properties(title=yearly_title, width=600)
)

chart.show()

In [25]:
# Fill in months that have no issues with a count of 0 by reindexing the
# observed counts onto the complete month range.
counts_by_period = counts_month_year.set_index("year_month")
month_year_counts = (
    counts_by_period.reindex(all_month_years, fill_value=0)
    .rename_axis("year_month")
    .reset_index()
)

# Summary: issues by month / year

Below you can see scientific Python peer review issues submitted by month since 2019. 

In [26]:
# Calendar order of the abbreviated month names, used as the category order.
month_abbreviations = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# Replace the 'year_month' period with separate 'year' and 'month' columns,
# plus 'month_cat', an ordered categorical version of 'month'.
month_year_counts = month_year_counts.assign(
    year=lambda frame: frame["year_month"].dt.year,
    month=lambda frame: frame["year_month"].dt.strftime("%b"),
    month_cat=lambda frame: pd.Categorical(
        frame["month"],
        categories=month_abbreviations,
        ordered=True,
    ),
).drop(columns=["year_month"])

## Peer review cadence -- slower months 

Historically (since 2019), July and December have been slower months. These could be ideal times for us to take a peer review breather, not accept new reviews, and catch up on business and documentation items. 

In [27]:
# Calendar order of the month abbreviations; defined once and shared by the
# reindex (row order) and the chart's x-axis sort.
month_order = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]

# Summarize total counts per calendar month across all years; months that
# never appear in the data get a count of 0.
monthly_counts = (
    month_year_counts.groupby("month")["count"]
    .sum()
    .reindex(month_order, fill_value=0)
    .reset_index()
)

# Take the year span for the title from the data so it stays correct when the
# notebook is re-run in later years.
first_year = month_year_counts["year"].min()
last_year = month_year_counts["year"].max()

# Create the Altair plot
chart = (
    alt.Chart(monthly_counts)
    .mark_bar(color="purple")
    .encode(
        x=alt.X("month", sort=month_order),
        y="count",
        tooltip=["month", "count"],
    )
    .properties(
        title=f"Total Counts per Month ({first_year}-{last_year})",
        width=600,
        height=400,
    )
    .configure_axis(labelAngle=0)
)

chart.show()

# Issues opened by month / year

# Number of Issues per Month Since 2019

Below is a cumulative sum representation of all of our peer review issues submitted to date. You can see that there is a significant uptick of issues submitted that began when we were able to utilize our funding and have a full-time staff person (the Executive Director) onboard. 

In [28]:
# Copy of the reviews with numeric 'month' and 'year' columns derived from
# 'date_opened'.
opened_at = reviews_df["date_opened"]
monthly_issues = reviews_df.assign(
    month=opened_at.dt.month,
    year=opened_at.dt.year,
)

# Index a second copy by the "YYYY-MM" string of the month each issue was opened.
month_key = monthly_issues["date_opened"].dt.to_period("M").dt.strftime("%Y-%m")
monthly_issues_index = monthly_issues.set_index(month_key)

# Group by that month-year index and count the number of issues in each group.
monthly_counts = monthly_issues_index.groupby(level=0).size()

In [29]:
# Build a period for every month between the earliest and latest 'date_opened'
# so that each month is present even when no issue was opened in it.
opened_dates = monthly_issues["date_opened"]
range_start = opened_dates.min().strftime("%Y-%m")
range_end = opened_dates.max().strftime("%Y-%m")

all_month_years = pd.date_range(
    start=range_start, end=range_end, freq="MS"
).to_period("M")

In [30]:
# Re-key the monthly counts with a PeriodIndex so they can be reindexed onto
# the complete month range.
counts_by_period = monthly_counts.copy()
counts_by_period.index = pd.PeriodIndex(counts_by_period.index, freq="M")

# Zero-fill missing months, add the running total, and expose the period
# index as a 'date' column.
final_monthly = (
    counts_by_period.reindex(all_month_years, fill_value=0)
    .to_frame(name="issue_count")
    .assign(cumulative_count=lambda frame: frame["issue_count"].cumsum())
    .reset_index(names="date")
)

# Convert the monthly periods to timestamps.
final_monthly["date"] = final_monthly["date"].dt.to_timestamp()

In [41]:
# Line plot of the cumulative number of review submissions over time.
cumulative_x = alt.X(
    "date:T",
    axis=alt.Axis(
        title="Month",
        format="%b-%Y",
        tickCount="year",
    ),
)
cumulative_y = alt.Y(
    "cumulative_count:Q",
    axis=alt.Axis(
        title="Number of Issues",
        tickMinStep=50,
    ),
)
cumulative_tooltip = [
    alt.Tooltip("date:T", title="Month"),
    alt.Tooltip("cumulative_count:Q", title="Number of Issues"),
]

chart = (
    alt.Chart(final_monthly)
    .mark_line(color="purple", strokeWidth=8)
    .encode(x=cumulative_x, y=cumulative_y, tooltip=cumulative_tooltip)
    .properties(
        title=alt.TitleParams(
            text="Cumulative Review Submissions Over Time",
        ),
        width=600,
        height=400,
    )
)

# Text annotation anchored at January 2023 on the x-axis.
label_data = pd.DataFrame(
    {
        "date": [pd.Timestamp("2023-01-01")],  # Specific x-axis location
        "cumulative_count": [16],
        "label": ["Full Time Funding"],
    }
)
label = (
    alt.Chart(label_data)
    .mark_text(align="left", dx=5, dy=-10, color="black", fontSize=12, font="Poppins")
    .encode(x="date:T", y="cumulative_count:Q", text="label:N")
)

# Arrow head drawn as a single triangle point mark at the same date.
arrow_data = pd.DataFrame(
    {
        "date": [pd.Timestamp("2023-01-01")],
        "cumulative_count": [20],  # Adjust y-axis location to position the arrow
    }
)
arrow = (
    alt.Chart(arrow_data)
    .mark_point(shape="triangle", angle=0, size=50, color="black")
    .encode(x="date:T", y="cumulative_count:Q")
)

# Layer the line, the label and the arrow, then show the result.
final_chart = chart + label + arrow
final_chart.show()