## Rate of change

### Last modified?



In [11]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths, chart_dir

# Load the PGP documents CSV; its location is looked up under the
# "documents" key of pgp_csv_paths (imported from utils).
documents = pd.read_csv(
    filepath_or_buffer=pgp_csv_paths["documents"],
)

In [12]:
# Convert the last_modified column to pandas datetimes (parsed as ISO 8601)
# so that documents can be grouped by time period.
documents["last_modified"] = pd.to_datetime(
    arg=documents["last_modified"],
    format="ISO8601",
)

In [13]:
# Number of documents last modified in each week: rows are binned with the
# Monday-anchored weekly frequency ("W-MON") and the count of non-null pgpid
# values in each bin is exposed as `total`.
docs_modified = (
    documents
    .groupby(pd.Grouper(key="last_modified", freq="W-MON"))[["pgpid"]]
    .count()
    .rename(columns={"pgpid": "total"})
    .reset_index()
)
docs_modified.head()

Unnamed: 0,last_modified,total
0,2021-05-31 00:00:00+00:00,8034
1,2021-06-07 00:00:00+00:00,3
2,2021-06-14 00:00:00+00:00,49
3,2021-06-21 00:00:00+00:00,20
4,2021-06-28 00:00:00+00:00,44


In [14]:
# Area chart of the weekly count of documents by last-modified date.
# TODO rename total/cumulative total so they make sense here
docs_modified_chart = (
    alt.Chart(docs_modified)
    .mark_area()
    .encode(
        # the y domain is fixed at 0-800 with clamping, so weeks with larger
        # totals are drawn at the cap instead of stretching the axis
        y=alt.Y(
            "total",
            title="Documents",
            scale=alt.Scale(domain=[0, 800], clamp=True),
        ),
        # one tick per year, labelled with the four-digit year
        x=alt.X("last_modified", title="Last Modified").axis(
            format="%Y", tickCount="year"
        ),
        # colouring by status is currently disabled:
        # color=alt.Color("status", title="").scale(scheme="tableau20"),
    )
    .properties(width=900, height=200)
)

# write a PDF copy to the chart directory, then display the chart inline
docs_modified_chart.save(f'{chart_dir}/docs_last_modified.pdf')
docs_modified_chart

### Stats from GitHub versions

PGP metadata is backed up to a GitHub repository. We can use the repository history to assess the rate of change.


In [15]:
# Load the dataset history CSV (columns: date, type, count) and preview it.
dataset_stats = pd.read_csv(
    filepath_or_buffer="../dataset-history/pgp-dataset-history.csv",
)

dataset_stats.head()

Unnamed: 0,date,type,count
0,2025-04-10,documents,35090
1,2025-04-09,documents,35090
2,2025-04-08,documents,35078
3,2025-04-07,documents,35079
4,2025-04-06,documents,35076


In [16]:
# Keep only the history rows for documents, fragments and fragment images.
doc_frag_totals = dataset_stats.loc[
    dataset_stats["type"].isin(["documents", "fragments", "fragment_images"])
]

# Line chart of those totals over time, one line per record type.
# Legend styling (stroke, fill, padding, position) is deliberately not
# configured on this chart: it can't be configured here if we want to
# combine it with another chart, so it is applied to the combined chart.
totals_by_time = (
    alt.Chart(doc_frag_totals)
    .mark_line()
    .encode(
        # fixed y domain of 15,000-40,000
        y=alt.Y("count", title="Total", scale=alt.Scale(domain=[15000, 40000])),
        # dates parsed as temporal; one tick per year, labelled with the year
        x=alt.X("date:T", title="Date").axis(format="%Y", tickCount="year"),
        color=alt.Color("type", title="", scale=alt.Scale(scheme="tableau10")),
    )
    .properties(width=900, height=200)
)
totals_by_time

In [17]:
# Keep only the history rows for transcriptions and translations.
text_totals = dataset_stats.loc[
    dataset_stats["type"].isin(["transcriptions", "translations"])
]

# Line chart of those totals over time, one line per record type.
# Uses the default colour scale; an explicit (reversed) tableau10 scheme was
# tried and is left disabled. Legend styling is not configured on this chart;
# it is applied to the combined chart instead.
text_totals_by_time = (
    alt.Chart(text_totals)
    .mark_line()
    .encode(
        y=alt.Y("count", title="Total"),
        # dates parsed as temporal; one tick per year, labelled with the year
        x=alt.X("date:T", title="Date").axis(format="%Y", tickCount="year"),
        color=alt.Color("type", title=""),
    )
    .properties(width=900, height=170)
)
text_totals_by_time

In [18]:
# Stack the two totals charts vertically and apply the legend styling here,
# on the top-level combined chart.
combined_totals = alt.vconcat(totals_by_time, text_totals_by_time).configure_legend(
    strokeColor="gray",
    fillColor="white",  # alternative considered: #EEEEEE
    padding=15,
    cornerRadius=5,
    # orient "none" (rather than e.g. "top-left") so the legend is positioned
    # by the explicit legendX / legendY values below
    orient="none",
    legendX=750,
    legendY=50,
)

# write a PDF copy to the chart directory, then display the chart inline
combined_totals.save(f'{chart_dir}/combined_totals_historic.pdf')
combined_totals