# PGP Dataset essay

This notebook generates charts and numbers based on PGP datasets for the essay about the data.

In [None]:
# Base location of the PGP metadata CSV files. For convenience, a copy of the
# 1.0 data is included in this repository; the path is relative to the notebook file.
data_path = "../data/"

# To run against the latest PGP metadata instead, load by URL from the pgp-metadata repo:
#data_path = "https://github.com/princetongenizalab/pgp-metadata/raw/main/data/"

# Map each dataset name to the location of its CSV file.
pgp_csvs = {
    dataset: f"{data_path}{dataset}.csv"
    for dataset in ("documents", "fragments", "sources", "footnotes")
}

## Documents and Fragments

In [None]:
import pandas as pd

documents = pd.read_csv(pgp_csvs["documents"])

In [None]:
# create a dict to collect numbers for inclusion in the dataset essay
essay_numbers = {}

In [None]:
# number of rows in the documents dataframe, recorded for the essay and echoed here
total_documents = len(documents)
total_documents_label = f"{total_documents:,}"
essay_numbers["totalDocuments"] = total_documents_label
print(f"{total_documents_label} total documents")

### Documents by number of fragments

How many fragments are typically used to compose a single document?

The `shelfmark` field in the data is a composite field; a " + " is used to indicate multiple shelfmarks.

NOTE: this does not account for shelfmark overrides, e.g. ranges of shelfmarks.

In [None]:
# Work on a copy of the documents with two derived columns:
# - fragments: the composite shelfmark split on the " + " delimiter
# - num_fragments: how many shelfmarks that split produced
documents_fragments = documents.assign(
    fragments=lambda df: df.shelfmark.apply(lambda shelfmark: shelfmark.split(' + ')),
    num_fragments=lambda df: df.fragments.apply(len),
)
documents_fragments.num_fragments.value_counts().head(10)

What percent of documents occur on a single fragment?

In [None]:
# count the documents whose shelfmark lists exactly one fragment
is_single_fragment = documents_fragments.num_fragments == 1
single_frag_docs = len(documents_fragments[is_single_fragment])
essay_numbers["singleFragmentDocuments"] = f"{single_frag_docs:,}"
single_frag_docs

In [None]:
# a "join" here is any document whose shelfmark lists more than one fragment
is_join = documents_fragments.num_fragments > 1
joins = len(documents_fragments[is_join])
joins_label = f"{joins:,}"
essay_numbers["totalJoins"] = joins_label
print(f"{joins_label} total joins")

What are some examples of documents that span more than 6 fragments?

In [None]:
documents_fragments[documents_fragments.num_fragments > 6][['pgpid', 'shelfmark', 'type', 'description', ]].head(10)

In [None]:
# tally documents for each distinct fragment count, for graphing
docs_per_num_fragments = (
    documents_fragments[['num_fragments', 'pgpid']]
    .groupby('num_fragments')
    .count()
    .reset_index()
    .rename(columns={"pgpid": "count"})
)
# express each tally as a formatted percentage of all documents
docs_per_num_fragments["percent"] = [
    f"{(doc_count / total_documents):.2%}" for doc_count in docs_per_num_fragments["count"]
]

docs_per_num_fragments.head(10)

What does the long tail look like, of documents spanning a large number of fragments?

In [None]:
# TODO: think about whether this can be condensed into a meaningful table to display alongside the chart
# show the last ten rows of the per-fragment-count table (the long tail)
docs_per_num_fragments.tail(10)

In [None]:
# group for display in a table
# - keep one row per fragment count for documents on fewer than 7 fragments
docs_per_nfrags = docs_per_num_fragments[docs_per_num_fragments.num_fragments < 7]
# - aggregate values for 7+ into a single row; the comparison must be >= 7,
#   otherwise documents on exactly 7 fragments appear in neither group
docs_manyfrags = docs_per_num_fragments[docs_per_num_fragments.num_fragments >= 7]
total_docs_7plusfrags = docs_manyfrags["count"].sum()
pct_docs_7plusfrags = f"{(total_docs_7plusfrags / total_documents):.2%}"

# summary row: label is the min-max range of fragment counts that were aggregated
agg_values = {
    "num_fragments": f"{docs_manyfrags.num_fragments.min()}-{docs_manyfrags.num_fragments.max()}",
    "count": total_docs_7plusfrags,
    "percent": pct_docs_7plusfrags,
}
# use the public pd.concat rather than the private DataFrame._append
docs_per_nfrags = pd.concat([docs_per_nfrags, pd.DataFrame([agg_values])], ignore_index=True)
docs_per_nfrags

In [None]:
print(docs_per_nfrags.to_latex(index=False))

In [None]:
import altair as alt

# chart directory, relative to notebook file
chart_dir = "../charts"

docs_frags_chart = alt.Chart(docs_per_num_fragments).mark_bar().encode(
    x=alt.X('num_fragments', title='Number of fragments'),
    y=alt.Y('count', title='Documents').scale(type="symlog")
).properties(
    # title='Documents by number of fragments',
    width=400
)
docs_frags_chart.save(f'{chart_dir}/documents_per_num_frags.pdf')
docs_frags_chart

### Fragments by number of documents

If we use fragments as our starting point, what is the distribution? How many documents are typically written on a fragment, and are there similar outliers?

In [None]:
fragments = pd.read_csv(pgp_csvs["fragments"])
total_fragments = fragments.shape[0]

print(f"{total_fragments:,} fragments")

This data export has duplicates! How many?

In [None]:
print(f"{fragments[fragments.duplicated()].shape[0]:,} duplicates")

In [None]:
# drop duplicates for now (but should fix in the data export)

uniq_fragments = fragments.drop_duplicates()
total_uniq_fragments = uniq_fragments.shape[0]

print(f"{total_uniq_fragments:,} fragments")

The fragments dataset includes a delimited list of associated PGPIDs; we can use that to determine how many documents are associated with a fragment.

In [None]:
# Count the documents associated with each fragment: pgpids is a
# semicolon-delimited list, so the number of parts is the number of documents.
# Build the column with assign (which returns a new dataframe) instead of
# setting it on the result of drop_duplicates, which can raise
# SettingWithCopyWarning.
uniq_fragments = uniq_fragments.assign(
    num_documents=uniq_fragments.pgpids.apply(lambda x: len(x.split(';')))
)
uniq_fragments.num_documents.value_counts()

What are some examples of fragments with a large number of documents?

In [None]:
uniq_fragments[uniq_fragments.num_documents > 6][['shelfmark', 'pgpids', 'num_documents']].head(10)

In [None]:
# group by number of documents to aggregate for plotting
frags_per_num_documents = (
    uniq_fragments[['num_documents', 'shelfmark']]
    .groupby('num_documents')
    .count()
    .reset_index()
    .rename(columns={"shelfmark": "fragments"})
)
# calculate percentage of total fragments; the counts come from the
# deduplicated fragments, so the denominator must be the deduplicated total
# (total_fragments still includes the duplicate rows)
frags_per_num_documents["percent"] = frags_per_num_documents.fragments.apply(lambda x : (x / total_uniq_fragments) * 100)
frags_per_num_documents

In [None]:
# chart with log scale
frags_docs_chart = alt.Chart(frags_per_num_documents).mark_bar().encode(
    x=alt.X('num_documents', title='Number of Documents', scale=alt.Scale(zero=False)),
    y=alt.Y('fragments', title='Fragments', scale=alt.Scale(type="symlog"))
).properties(
#    title="Fragments by number of documents",
    width=400
)
frags_docs_chart.save(f'{chart_dir}/fragments_with_num_docs.pdf')
frags_docs_chart

In [None]:

# combine horizontally for display in pdf
(docs_frags_chart | frags_docs_chart).save(f'{chart_dir}/docs_frags_2up.pdf')
(docs_frags_chart | frags_docs_chart)

## Number of documents over time

In [None]:
# parse the initial entry values into datetimes
documents['initial_entry'] = pd.to_datetime(documents['initial_entry'], format='mixed')

# count documents grouped by the year of their initial entry
entry_year = documents['initial_entry'].dt.to_period('Y')
newdocs_per_year = (
    documents['pgpid']
    .groupby(entry_year)
    .count()
    .reset_index()
    .rename(columns={'pgpid': 'total'})
)
# convert the yearly periods back to timestamps
newdocs_per_year["initial_entry"] = newdocs_per_year["initial_entry"].dt.to_timestamp()
newdocs_per_year.head(10)

In [None]:
# calculate cumulative total
newdocs_per_year['cumulative_total'] = newdocs_per_year['total'].cumsum()
newdocs_per_year.head()

In [None]:
# use melt to revise so we can plot both together
newdocs_per_year_melted = pd.melt(newdocs_per_year.rename(columns={'initial_entry': 'date', 'total': 'New documents', 'cumulative_total': 'Cumulative total'}), id_vars=['date'], value_vars=['New documents', 'Cumulative total'])
newdocs_per_year_melted = newdocs_per_year_melted.rename(columns={"variable": "status", "value": "Documents"})
newdocs_per_year_melted.head()

In [None]:
# TODO rename total/cumulative total so they make sense here
# Area chart of documents over time, colored by status
# (new documents per year vs. cumulative total).
docs_overtime_chart = alt.Chart(newdocs_per_year_melted).mark_area().encode(
    y=alt.Y('Documents'),
    # the x channel must be defined with alt.X (this was previously alt.Y)
    x=alt.X('date', title="Year"),
    color=alt.Color("status", title="").scale(scheme="tableau20"),
).properties(
#    title="Documents in PGP per year with cumulative total",
    width=800,
    height=250

).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=15,
    cornerRadius=5,
    orient='top-left'
)
docs_overtime_chart.save(f'{chart_dir}/docs_over_time.pdf')

docs_overtime_chart

## Metadata fields available

To what extent are metadata fields filled out?


In [None]:
# calculate what percent of records have which field (code adapted from S&co dataset notebook)
def percent_known(df, field):
  """Report how many rows of a dataframe have a value for a field.

  Prints the count of non-null values for ``field`` along with the
  percentage of all rows that count represents.

  Args:
    df: pandas DataFrame to inspect.
    field: name of the column to check for non-null values.

  Returns:
    int: the number of rows where ``field`` is not null (the count,
    not the percentage).
  """
  total = len(df)
  with_field = int(df[field].notna().sum())
  # an empty dataframe has nothing to report; avoid dividing by zero
  percent = (with_field / total) * 100 if total else 0
  print('%d with %s : %.00f%%' % (with_field, field, percent))
  return with_field

In [None]:
# metadata fields to check, in the order they should be reported
metadata_fields = ['type', 'description', 'tags', 'languages_primary', 'languages_secondary', 'doc_date_original', 'doc_date_standard']

# number of documents with a value for each field (percent_known also prints a summary line)
total_known = {field: percent_known(documents, field) for field in metadata_fields}

print(total_known)

In [None]:
# are there any records where secondary language is known but primary is not?
documents[documents.languages_primary.isna() & documents.languages_secondary.notna()]

In [None]:
# specify keys as index to preserve field order as specified above
known = pd.DataFrame({'total': total_known}, index=total_known.keys())
known['percent'] = (known['total'] / total_documents*100)
# known['order'] = known.loc.apply(lambda x: metadata_fields.index(x))
known.percent = known.percent.round(2)
known

In [None]:

total_documents = documents.shape[0]
# for each metadata field: how many documents have it populated, and how many do not
known_unknown = pd.DataFrame({
    'known': known['total'],
    'unknown': total_documents - known['total'],
})
known_unknown

In [None]:
# reshape for plotting
known_unknown_melted = known_unknown.reset_index().melt(id_vars="index", value_vars=["known", "unknown"])

known_unknown_melted = known_unknown_melted.rename(columns={"index": "field", "variable": "status", "value": "documents"})
known_unknown_melted.head(10)

In [None]:
field_order = ["type", "description", "tags", "languages_primary", "languages_secondary", "doc_date_original", "doc_date_standard"]

known_melted = known_unknown_melted[known_unknown_melted.status == "known"]

metadata_chart = alt.Chart(known_melted).mark_bar().encode(
    x=alt.X("documents", title="Documents with data populated").scale(domain=[0, total_documents]),
    y=alt.Y("field", title="Metadata field").scale(domain=field_order),
    # color=alt.Color("status").scale(domain=["known", "unknown"], range=["#7bac7b", "lightgray"]),
).properties(width=650, height=200)

metadata_chart

In [None]:
# Label for each bar: the formatted document count plus its share of all documents.
# known_melted is a filtered slice of another dataframe, so add the column with
# assign (which returns a new dataframe) rather than setting it in place, which
# can raise SettingWithCopyWarning.
known_melted = known_melted.assign(
    label=known_melted.documents.apply(lambda x: f'{x:,} ({x/total_documents:.0%})')
)

# text labels for metadata chart; bars above the 5000-document threshold get a
# white, right-aligned label shifted left, others a black, left-aligned label shifted right
text = alt.Chart(known_melted).mark_text(
     dy=0,
     color=alt.expr(alt.expr.if_(alt.datum.documents > 5000, "white", "black")),
     align=alt.expr(alt.expr.if_(alt.datum.documents > 5000, "right", "left")),
     dx=alt.expr(alt.expr.if_(alt.datum.documents > 5000, -10, 5))
  ).encode(
    x=alt.X('documents').scale(domain=[0, total_documents]),
    y=alt.Y("field", title="Metadata field").scale(domain=field_order),
    text=alt.Text('label')
)

(metadata_chart + text).save(f"{chart_dir}/metadata_available.pdf")
metadata_chart + text

## Document types

In [None]:
# copy dataframe and replace unset type with "Unknown" so we can include it in reported type totals
docs_with_unknowns = documents.copy()
docs_with_unknowns["type"] = docs_with_unknowns.type.fillna("Unknown")

In [None]:
doctype_totals = docs_with_unknowns.type.value_counts()
doctype_totals
#35,277 documents in PGP, and the most common types are Letters (11,122) and Legal Documents (8,149). The current dataset includes 4,351 documents with no type, which may me

In [None]:
import re

def texOutputLabel(term):
  """Convert a variable name or string into a label for tex variable output.

  Digits and dashes are removed, then each whitespace-separated word is
  title-cased and the words are joined with no separator.
  """
  without_digits_or_dashes = re.sub(r'[-\d]', '', term)
  words = without_digits_or_dashes.split()
  return ''.join(map(str.title, words))

In [None]:
print(f"{total_documents:,} total documents\n")
for doctype, total in doctype_totals.to_dict().items():
  # add to output numbers for essay
  print(f"{doctype}:\t{total:,}")
  # prefix with total and convert for output
  outputLabel = f"total{texOutputLabel(doctype)}"
  essay_numbers[outputLabel] = f"{total:,}"

In [None]:
doctype_counts = doctype_totals.reset_index()

doctype_chart = alt.Chart(doctype_counts).mark_bar().encode(
    y=alt.Y('type', title="Document Type"),
    x=alt.X('count', title="Number of documents")
).properties(
    # title="Documents by type",
    width=600
)
doctype_chart.save(f"{chart_dir}/document_types.pdf")
doctype_chart

### Document descriptions

What is the typical length of document descriptions?

In [None]:
documents['description_num_chars'] = documents.description.apply(lambda x: len(x.strip()) if pd.notna(x) else 0)
documents['description_num_tokens'] = documents.description.apply(lambda x: len(x.strip().split()) if pd.notna(x) else 0)
documents[['pgpid', 'description', 'description_num_chars', 'description_num_tokens']].head()

In [None]:
documents[documents.description_num_chars == 0]

In [None]:
documents.description_num_chars.describe()

In [None]:
documents.description_num_tokens.describe()

In [None]:
# disable altair max row check
alt.data_transformers.disable_max_rows()

alt.Chart(documents).mark_bar().encode(
    alt.X("description_num_chars").bin(maxbins=12),
    y='count()',
)

In [None]:
alt.Chart(documents).mark_bar().encode(
    alt.X("description_num_tokens").bin(maxbins=12),
    y='count()',
)

In [None]:
# langdetect is not useful here
# could potentially analyze based on unicode script range
# i.e., what percentage have hebrew / arabic terms (usually names?)
# but this is probably not important

## Tags

Summary information about tags. How many tags, how much reuse/variation; how densely or sparsely are documents tagged?

In [None]:
documents.tags

In [None]:
documents['num_tags'] = documents.tags.apply(lambda x: len(x.strip().split(", ")) if pd.notna(x) else 0)
documents[['pgpid', 'tags', 'num_tags']].head(10)

In [None]:
document_tag_counts = documents.num_tags.value_counts()
document_tag_counts

In [None]:
alt.Chart(document_tag_counts.reset_index()).mark_bar().encode(
    alt.X("num_tags", title="# Tags").scale(domain=[0, 17]), # .bin(),
    y=alt.Y('count', title="# Documents"),
).properties(title="Tag frequency by document")

In [None]:
# split the tag list and explode out into document - tag pairs for analysis

documents['tag_list'] = documents.tags.apply(lambda x: x.strip().split(", ") if pd.notna(x) else [])

docs_tags = documents[['pgpid', 'tag_list']].explode('tag_list').rename(columns={'tag_list': 'tag'})
# filter out unset tags
docs_tags = docs_tags[docs_tags.tag.notna()]
docs_tags.head(10)

How many unique tags?

In [None]:
essay_numbers['uniqueTags'] = f"{len(docs_tags.tag.unique()):,}"
print(f"{len(docs_tags.tag.unique()):,} unique tags")

What are the most common tags?

In [None]:
tag_counts = docs_tags.tag.value_counts().reset_index()

tag_counts.head(15)

In [None]:
# add the most common tags to output numbers for inclusion in essay
# NOTE(review): this comment previously said "top 7", but head(6) takes only
# the first six rows of tag_counts — confirm which number is intended
for tagcount in tag_counts.head(6).itertuples():
  # key is "tagged" plus the tag converted by texOutputLabel; the value is
  # stored as the raw count, without thousands-separator formatting
  essay_numbers[f'tagged{texOutputLabel(tagcount.tag)}'] = tagcount.count

How many tags are used only once?

In [None]:
singletons = tag_counts[tag_counts["count"] == 1]
essay_numbers['singletonTags'] = f"{singletons.shape[0]:,}"

singletons.head(10)

In [None]:
print(f"{singletons.shape[0]:,} tags used only once")

## Languages and Scripts



### What languages are most common?

In [None]:
# combine primary and secondary languages into an all-lang field


def all_langs(row):
    """Combine a row's primary and secondary languages into a single value.

    Returns both values joined with a comma when both are set, whichever
    one is set when only one is, and None when neither is set.
    """
    primary = row.languages_primary
    secondary = row.languages_secondary
    has_primary = pd.notna(primary)
    has_secondary = pd.notna(secondary)

    if has_primary and has_secondary:
        return ','.join([primary, secondary])
    if has_primary:
        return primary
    # we can't assume that a secondary language implies a primary one:
    # at least one case has a secondary language without a primary
    if has_secondary:
        return secondary
    return None

documents['languages_all'] = documents.apply(all_langs, axis=1)
documents[['pgpid', 'languages_primary', 'languages_secondary', 'languages_all']].head()

In [None]:
# as we did with tags, split and explode the combined languages into list
documents['langs_all_list'] = documents.languages_all.apply(lambda x: (x.strip().split(",")) if pd.notna(x) else [])

docs_langs = documents[['pgpid', 'langs_all_list']].explode('langs_all_list').rename(columns={"langs_all_list": "language"})
# filter out unset languages
docs_langs = docs_langs[docs_langs.language.notna()]
# ensure whitespace doesn't cause variation
docs_langs['language'] = docs_langs.language.apply(lambda x : x.strip())

docs_langs.head(10)

In [None]:
# total by language, for any occurrence (counting documents multiply)
doc_lang_counts = docs_langs.language.value_counts().reset_index()
doc_lang_counts.head(10)

How many unique languages total?

In [None]:
total_langs = len(doc_lang_counts.language.unique())
essay_numbers['totalLangauges'] = total_langs

In [None]:
# create a filtered set of the most common languages plus any unidentified ones;
# take an explicit copy so the count column can be reformatted below without
# writing to a slice of doc_lang_counts (which raises SettingWithCopyWarning)
is_common = doc_lang_counts['count'] > 300
is_unidentified = doc_lang_counts.language.str.contains("Unidentified")
doc_lang_counts_subset = doc_lang_counts[is_common | is_unidentified].copy()
# format count for output
doc_lang_counts_subset['count'] = doc_lang_counts_subset['count'].apply(lambda x: f"{x:,}")
# rename columns for output
doc_lang_counts_subset = doc_lang_counts_subset.rename(columns={"language": "Language/Script","count": "Documents"})
doc_lang_counts_subset

In [None]:
print(doc_lang_counts_subset.to_latex(index=False))

### What % of documents have more than one language?

(any language - primary or secondary)

In [None]:
# tally the number of languages using the combined list field we created before
documents['langs_all_count'] = documents.langs_all_list.apply(lambda x: len(x))

In [None]:
# add some totals to our data for use in the paper
num_langs = documents['langs_all_count']
docs_no_lang = int((num_langs == 0).sum())
docs_any_lang = int((num_langs != 0).sum())
docs_one_lang = int((num_langs == 1).sum())
docs_multi_lang = int((num_langs > 1).sum())
essay_numbers.update({
    'documentsNoLang': f"{docs_no_lang:,}",
    'documentsAnyLang': f"{docs_any_lang:,}",
    'documentsOneLang': f"{docs_one_lang:,}",
    'documentsMultiLang': f"{docs_multi_lang:,}",
    # what % of documents with any language are multi/mono lingual?
    'percentDocsMultiLang': f"{docs_multi_lang/docs_any_lang:.1%}",
    'percentDocsMonoLang': f"{docs_one_lang/docs_any_lang:.1%}",
})


### Language combinations

### UpSet plot

In [None]:
# limit to languages that occur a sufficient number of times
min_langcount = 300

lang_subset = doc_lang_counts[doc_lang_counts["count"] > min_langcount].language.to_list()
lang_subset

In [None]:
from upsetplot import from_memberships


# limit to documents with any of the languages in our subset
# create a filtered lang field with languages in our subset
documents['subset_langs'] = documents.langs_all_list.apply(
    lambda langs: [lang.strip() for lang in langs if lang.strip() in lang_subset]
)

# Keep documents with at least one subset language. Compare the list length
# explicitly: combining a boolean mask with the integer lengths via `&` is a
# bitwise AND (True & 2 == 0), which can drop documents with an even number
# of subset languages.
has_subset_lang = documents.subset_langs.str.len() > 0
documents_subset_lang = documents[has_subset_lang].copy()

# preview multi-language documents that include Judaeo-Arabic; combine the
# conditions into a single mask instead of chaining two boolean selections
is_multi_lang = documents_subset_lang.langs_all_count > 1
has_judaeo_arabic = documents_subset_lang.languages_all.str.contains("Judaeo-Arabic")
documents_subset_lang[is_multi_lang & has_judaeo_arabic][['pgpid', 'languages_all', 'langs_all_list', 'subset_langs']]

In [None]:
pd.set_option('future.no_silent_downcasting', True)

# use the combined subset language field as the membership set
lang_memberships = documents_subset_lang.subset_langs
lang_sets = from_memberships(lang_memberships)

In [None]:
from upsetplot import plot
from matplotlib import pyplot

figure = pyplot.figure()

upset_plot = plot(lang_sets, fig=figure, subset_size="count", show_counts=True) # , sort_categories_by='-input')

# NOTE: must save _before_ displaying (show clears the current plot)
# pyplot.savefig(f'{chart_dir}/language_upsetplot.pdf')
pyplot.show(upset_plot)


#### UpSet plot of primary languages

In [None]:
lang_subset

In [None]:

# limit to documents with any of the languages in our subset
# split the comma-delimited primary languages into a list (empty when unset)
documents['primary_lang_list'] = documents.languages_primary.apply(
    lambda x: [lang.strip() for lang in x.split(',')] if pd.notna(x) else []
)

# create a filtered lang field with only the primary languages in our subset
documents['subset_primary_langs'] = documents.primary_lang_list.apply(
    lambda langs: [lang.strip() for lang in langs if lang.strip() in lang_subset]
)

# Keep documents with at least one subset primary language. Compare the list
# length explicitly: combining a boolean mask with the integer lengths via `&`
# is a bitwise AND (True & 2 == 0), which can drop documents with an even
# number of subset languages.
has_subset_primary_lang = documents.subset_primary_langs.str.len() > 0
documents_primary_subset_lang = documents[has_subset_primary_lang].copy()

# preview multi-language documents that include Ladino; combine the
# conditions into a single mask instead of chaining two boolean selections
is_multi_lang = documents_primary_subset_lang.langs_all_count > 1
has_ladino = documents_primary_subset_lang.languages_all.str.contains("Ladino")
documents_primary_subset_lang[is_multi_lang & has_ladino][['pgpid', 'languages_primary', 'primary_lang_list', 'subset_primary_langs']].head(10)

In [None]:
# use the combined subset language field as the membership set
primary_lang_memberships = documents_primary_subset_lang.subset_primary_langs
primary_lang_sets = from_memberships(primary_lang_memberships)

In [None]:
from upsetplot import UpSet


# UpSet plot of primary-language combinations, sized by document count and
# sorted by cardinality
upset_plot_primary = UpSet(primary_lang_sets, subset_size="count", show_counts=True, sort_by='cardinality', sort_categories_by='-cardinality').plot()

# save before calling show
pyplot.savefig(f'{chart_dir}/primary_language_upsetplot.pdf')

# NOTE(review): the same primary-language figure is also written to
# language_upsetplot.pdf; the save of the all-languages plot to that filename
# is commented out earlier in this notebook — confirm this second save is intended
pyplot.savefig(f'{chart_dir}/language_upsetplot.pdf')
pyplot.show(upset_plot_primary)

## Dates and Calendars

In [None]:
# limit to documents with standard date OR inferred date information
dated_docs = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()].copy()
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard']]

In [None]:
# collect dates into a single field
dated_docs['date'] = dated_docs.apply(lambda x: x.doc_date_standard.strip() if pd.notna(x.doc_date_standard) else x.inferred_date_standard.strip(), axis=1)
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

In [None]:
# get totals for including in the paper

has_any_date = dated_docs.date.notna()
has_doc_date = dated_docs.doc_date_standard.notna()
# inferred date only (don't recount date on doc)
has_inferred_only = dated_docs.doc_date_standard.isna() & dated_docs.inferred_date_standard.notna()

total_docs_anydate = len(dated_docs[has_any_date])
total_date_on_doc = len(dated_docs[has_doc_date])
total_inferred_date = len(dated_docs[has_inferred_only])

essay_numbers.update(
    totalDatedDocs=f"{total_docs_anydate:,}",
    totalDateOnDoc=f"{total_date_on_doc:,}",
    totalInferredDate=f"{total_inferred_date:,}",
)
essay_numbers

In [None]:
dated_docs[dated_docs.date.str.contains("1217-02-29")][['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

In [None]:
dated_docs[dated_docs.date.str.contains("1139/")][['pgpid', 'doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

In [None]:
# parse with undate
from undate import Undate, UndateInterval
from lark.exceptions import VisitError

def parse_date(datestr):
  """Parse a standardized date string into an undate object.

  A string containing ';' is treated as multiple dates: each part is parsed
  separately and the result is an UndateInterval spanning the earliest and
  latest of them. Otherwise the string is parsed as EDTF, with fallbacks for
  two known data problems: dates ending in "-02-29" that fail to parse are
  retried as "-02-28", and ranges whose two sides are identical
  (e.g. "1139/1139") are parsed as the single value.

  Args:
    datestr: date string to parse.

  Returns:
    The object returned by Undate.parse, an UndateInterval for multiple
    dates, or None when the value cannot be parsed (the problem is printed).
  """
  # potentially multiple values
  if ';' in datestr:
    print(f"multiple inferred dates: {datestr}")
    all_dates = [parse_date(d) for d in datestr.split(';')]
    try:
      # keep the debugging output inside the try: if any part failed to
      # parse it is None here, and that should be reported, not crash
      for d in all_dates:
        print(f"{d}: {d.duration().days} days")
      earliest = min([d.earliest for d in all_dates])
      latest = max([d.latest for d in all_dates])
      print(f"all dates: {all_dates} earliest {earliest} latest {latest}")
      # build the outer interval from the first/last year, month, and day
      date_range = UndateInterval(
        Undate(earliest.year, earliest.month, earliest.day),
        Undate(latest.year, latest.month, latest.day),
      )
      print(f'range {date_range}')
      return date_range
    except Exception as err:
      # catch Exception rather than using a bare except, so that
      # KeyboardInterrupt and SystemExit are not swallowed
      print(f"error sorting to determine outer interval: {err}")
      return None
  else:
    # strip once so parsing and the fallback checks see the same value
    cleaned = datestr.strip()
    try:
      return Undate.parse(cleaned, "EDTF")
    except (ValueError, VisitError) as err:
      # special cases
      if cleaned.endswith("-02-29"):
        print(f"parse error on {datestr}, parsing as --02-28")
        return parse_date(cleaned.replace("-02-29", "-02-28"))
      if "/" in cleaned:
        parts = cleaned.split("/")
        # data entry error: 1139/1139 ; corrected in db, but use first for now
        if parts[0] == parts[1]:
          return parse_date(parts[0])
      print(err)
      return None

dated_docs['undate'] = dated_docs.date.apply(parse_date) # lambda x: Undate.parse(x, 'EDTF'))
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date', 'undate']].head(10)

In [None]:
import numpy as np

def undate_earliest(und):
  """Return the earliest date for a parsed date value.

  For an UndateInterval this is the earliest date of the interval's
  starting Undate; for an Undate it is that date's own earliest value.
  Any other value (such as None for an unparsed date) yields None.
  """
  if isinstance(und, UndateInterval):
    interval_start = und.earliest
    return interval_start.earliest
  if isinstance(und, Undate):
    return und.earliest
  return None

def undate_latest(und):
  """Return the latest date for a parsed date value.

  For an UndateInterval this is the latest date of the interval's ending
  Undate; for an Undate it is that date's own latest value. Any other
  value (such as None for an unparsed date) yields None.
  """
  if isinstance(und, UndateInterval):
    # use the *latest* bound of the interval's end date; this previously
    # returned und.latest.earliest, which understates the end of an interval
    # whose end date has less than day precision
    return und.latest.latest
  elif isinstance(und, Undate):
    return und.latest
  return None


# get earliest, latest, midpoint, and convert to numpy datetime for graphing
# - earliest/latest come from the helper functions above, cast to millisecond datetimes
dated_docs['undate_earliest'] = dated_docs.undate.apply(undate_earliest).astype('datetime64[ms]')
dated_docs['undate_latest'] = dated_docs.undate.apply(undate_latest).astype('datetime64[ms]')
# - midpoint is earliest plus half the span between earliest and latest;
#   rows with no parsed undate get None
dated_docs['undate_midpoint'] = dated_docs.apply(lambda row: (row.undate_earliest + (row.undate_latest - row.undate_earliest) / 2.0) if pd.notna(row.undate) else None, axis=1).astype('datetime64[ms]')
# - label where the date came from: "On document" when doc_date_standard is set, otherwise "Inferred"
dated_docs['date_source'] = dated_docs.apply(lambda row: "On document" if pd.notna(row.doc_date_standard) else "Inferred", axis=1)


dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date', 'undate', 'undate_earliest', 'undate_latest', 'undate_midpoint', 'date_source']].head(10)

In [None]:
bar_chart = alt.Chart(dated_docs[['pgpid', 'undate_earliest', 'undate_latest', 'date_source']].sort_values('date_source')).mark_bar(opacity=0.15).encode(
    x=alt.X('undate_earliest', title="Year"), #, axis=alt.Axis(format="r")),
    x2='undate_latest',
    y=alt.Y('count(pgpid)', title='Documents'),
    color=alt.Color("date_source", title="Dating").scale(domain=['On document', 'Inferred']),
).properties(width=900, height=175)

line_chart = alt.Chart(dated_docs[['pgpid', 'undate_midpoint']]).mark_line(opacity=0.6, color="green", interpolate="monotone").encode(
 x=alt.X('undate_midpoint:T', title="Year"), # axis=alt.Axis(format="r")),
 y=alt.Y('count(pgpid)', title='Documents')
).properties(width=900, height=175)


combined_dating_chart = (line_chart + bar_chart).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=10,
    cornerRadius=5,
    orient='top-left'
)

combined_dating_chart.save(f'{chart_dir}/combined_dating.pdf')
combined_dating_chart

In [None]:
# graph documents with calendars

# limit to documents that have a date on the document
date_docs_cal = dated_docs[dated_docs.doc_date_standard.notna()]

# label documents with no calendar recorded as "Unspecified"
dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})
# graph on the year of the date midpoint
dated_docs_cal['midpoint_year'] = dated_docs_cal.undate_midpoint.apply(lambda x: x.year)

docs_calendars_charts = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(
  x=alt.X('midpoint_year', title="CE Year (Julian/Gregorian)", bin=alt.Bin(maxbins=120), axis=alt.Axis(format="r")),
  y=alt.Y('count(pgpid)', title='Documents'),
  # the color channel must be defined with alt.Color (this was previously alt.Y)
  color=alt.Color("doc_date_calendar", title="Calendar")
).properties(width=900, height=200,
            #  title={
            #      "text": "Documents by date and original calendar",
            #    "subtitle": "(Graphed based on date midpoint for uncertain dates and date ranges)"
            #  }
             ).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=10,
    cornerRadius=5,
    orient='top-left'
)

docs_calendars_charts.save(f"{chart_dir}/dated_docs_by_cal.pdf")
docs_calendars_charts

## Scholarship records

In [None]:
sources = pd.read_csv(pgp_csvs["sources"])
sources.head()

In [None]:
print(f"{sources.shape[0]:,} sources")

In [None]:
sources.source_type.value_counts().reset_index()

In [None]:
source_totals = sources.groupby('source_type').agg(
    count=pd.NamedAgg(column="source_type", aggfunc="count"),
    num_footnotes=pd.NamedAgg(column='num_footnotes', aggfunc='sum')
    ).reset_index()
source_totals = source_totals.sort_values('count', ascending=False)
source_totals = source_totals.rename(columns={"source_type": "Type", "count": "Sources", "num_footnotes": "Footnotes"})
source_totals

In [None]:
print(source_totals.to_latex(index=False))

In [None]:
# how many documents do sources relate to?
# goitein sources - split out into volumes to wrangle the scale
# what does it look like if we leave Goitein out?

sources.num_footnotes.value_counts().reset_index().sort_values("num_footnotes").head(10)

In [None]:
# split the semicolon-delimited authors field into a list (None when unset)
# and explode it into one row per source/author pair
source_authors = (
    sources.assign(
        author_list=sources.authors.apply(lambda x: x.split(';') if pd.notna(x) else None)
    )[['source_type', 'title', 'author_list']]
    .explode("author_list")
    .rename(columns={"author_list": "author"})
)
# filter out unset authors
source_authors = source_authors[source_authors.author.notna()]

# strip whitespace left over from splitting; use assign so the column is not
# set in place on a filtered slice (which raises SettingWithCopyWarning)
source_authors = source_authors.assign(author=source_authors.author.str.strip())
source_authors.head(10)

In [None]:
print(f"{len(source_authors.author.unique()):,} unique authors")

In [None]:
source_authors.author.value_counts().head(12)

## Rate of change

### Last modified?



In [None]:
# parse last modified as date time (values are expected in ISO 8601 format)
documents['last_modified'] = pd.to_datetime(documents['last_modified'], format='ISO8601')

In [None]:
docs_modified = documents.groupby(pd.Grouper(key="last_modified", freq="W-MON"))[['pgpid']].count().reset_index().rename(columns={'pgpid': 'total'})
docs_modified.head()

In [None]:
# TODO rename total/cumulative total so they make sense here
docs_modified_chart = alt.Chart(docs_modified).mark_area().encode(
    y=alt.Y('total', title="Documents").scale(domain=[0, 800], clamp=True),
    x=alt.X('last_modified', title="Last Modified", axis=alt.Axis(format="%Y", tickCount="year")),
    # color=alt.Color("status", title="").scale(scheme="tableau20"),
).properties(
    width=900,
    height=200
)

docs_modified_chart.save(f'{chart_dir}/docs_last_modified.pdf')
docs_modified_chart

### Stats from GitHub versions

PGP metadata is backed up to a GitHub repository. We can use the repository history to assess the rate of change.


In [None]:
dataset_stats = pd.read_csv("../dataset-history/pgp-dataset-history.csv")

dataset_stats.head()

In [None]:
doc_frag_totals = dataset_stats[dataset_stats.type.isin(['documents', 'fragments', 'fragment_images'])]

totals_by_time = alt.Chart(doc_frag_totals).mark_line().encode(
    y=alt.Y('count', title="Total").scale(domain=[15000, 40000]),
    x=alt.X('date:T', title="Date", axis=alt.Axis(format="%Y", tickCount="year")),
    color=alt.Color("type", title="").scale(scheme="tableau10"),
).properties(
    width=900,
    height=200
# can't configure here if we want to combine
# ).configure_legend(
#     strokeColor='gray',
#     fillColor='white', #EEEEEE',
#     padding=15,
#     cornerRadius=5,
#     # orient='top-left',
#     orient='none',
#     legendX=20, legendY=80,
)
totals_by_time

In [None]:
text_totals = dataset_stats[dataset_stats.type.isin(['transcriptions', 'translations'])]

text_totals_by_time = alt.Chart(text_totals).mark_line().encode(
    y=alt.Y('count', title="Total"),
    x=alt.X('date:T', title="Date", axis=alt.Axis(format="%Y", tickCount="year")),
    color=alt.Color("type", title="") # .scale(scheme="tableau10", reverse=True),
).properties(
    width=900,
    height=170
)
# .configure_legend(
#     strokeColor='gray',
#     fillColor='white', #EEEEEE',
#     padding=15,
#     cornerRadius=5,
#     # orient='top-left',
#     orient='none',
#     legendX=20, legendY=80,
# )
text_totals_by_time

In [None]:
combined_totals = (totals_by_time & text_totals_by_time).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=15,
    cornerRadius=5,
    # orient='top-left',
    orient='none',
    legendX=750, legendY=50,
)

combined_totals.save(f'{chart_dir}/combined_totals_historic.pdf')
combined_totals

## Output numbers for use in dataset essay

In [None]:
essay_numbers

In [None]:
# output numbers for the essay as LaTeX defines so they can be referenced in-text
print('%% numbers from data')
for name, value in essay_numbers.items():
  # no trailing space is added after the value [latex seems to swallow the
  # following space in paragraph text]
  # escape percent signs in the output; the replacement is a raw string
  # because '\%' is an invalid escape sequence in a regular string literal
  print(r"\def\%s{%s}" % (name, str(value).replace('%', r'\%')))