## Numbers for use in dataset essay

Collect totals and other numbers and output as LaTeX variables, for use in the dataset essay.

In [2]:
import pandas as pd

from utils import pgp_csv_paths

## Documents & Fragments

In [3]:
documents = pd.read_csv(pgp_csv_paths["documents"])
total_documents = documents.shape[0]

In [4]:
documents_fragments = documents.copy()
documents_fragments["fragments"] = documents_fragments.shelfmark.apply(lambda x : x.split(' + '))
documents_fragments["num_fragments"] = documents_fragments.fragments.apply(lambda x : len(x))
single_frag_docs = documents_fragments[documents_fragments.num_fragments == 1].shape[0]
joins = documents_fragments[documents_fragments.num_fragments > 1].shape[0]

essay_numbers = {
    "totalDocuments": f"{total_documents:,}",
    "singleFragmentDocuments": f"{single_frag_docs:,}",
    "totalJoins": f"{joins:,}",
}

## Document Types

In [5]:
import re

# copy dataframe and replace unset type with "Unknown" so we can include it in reported type totals
docs_with_unknowns = documents.copy()
docs_with_unknowns["type"] = docs_with_unknowns.type.fillna("Unknown")
doctype_totals = docs_with_unknowns.type.value_counts()

def texOutputLabel(term):
  # convert a variable name or string into format for tex variable output
  # remove numbers and dashes
  term = re.sub(r'[-\d]', '', term)
  # remove spaces, and use title/camel case
  return ''.join([t.title() for t in term.split()])

for doctype, total in doctype_totals.to_dict().items():
  # add to output numbers for essay; prefix with total and convert for output
  outputLabel = f"total{texOutputLabel(doctype)}"
  essay_numbers[outputLabel] = f"{total:,}"

## Tags

In [6]:
documents['tag_list'] = documents.tags.apply(lambda x: x.strip().split(", ") if pd.notna(x) else [])
docs_tags = documents[['pgpid', 'tag_list']].explode('tag_list').rename(columns={'tag_list': 'tag'})
# filter out unset tags
docs_tags = docs_tags[docs_tags.tag.notna()]
essay_numbers['uniqueTags'] = f"{len(docs_tags.tag.unique()):,}"

tag_counts = docs_tags.tag.value_counts().reset_index()

# add top 7 most common tags to output numbers for inclusion in essay
for tagcount in tag_counts.head(6).itertuples():
  essay_numbers[f'tagged{texOutputLabel(tagcount.tag)}'] = tagcount.count

singletons = tag_counts[tag_counts["count"] == 1]
essay_numbers['singletonTags'] = f"{singletons.shape[0]:,}"

## Languages

In [7]:
# combine primary and secondary languages into an all-lang field

def all_langs(row):
  # apparently we can't assume that secondary lang implies primary!
    if pd.notna(row.languages_primary):
      if pd.notna(row.languages_secondary):
        return ','.join([row.languages_primary, row.languages_secondary])
      else:
        return row.languages_primary

    # at least one case has a secondary lang without primary; return it
    if pd.notna(row.languages_secondary):
      return row.languages_secondary


documents['languages_all'] = documents.apply(all_langs, axis=1)

# split and explode the combined languages into list
documents['langs_all_list'] = documents.languages_all.apply(lambda x: (x.strip().split(",")) if pd.notna(x) else [])

docs_langs = documents[['pgpid', 'langs_all_list']].explode('langs_all_list').rename(columns={"langs_all_list": "language"})
# filter out unset languages
docs_langs = docs_langs[docs_langs.language.notna()]
# ensure whitespace doesn't cause variation
docs_langs['language'] = docs_langs.language.apply(lambda x : x.strip())

doc_lang_counts = docs_langs.language.value_counts().reset_index()

total_langs = len(doc_lang_counts.language.unique())

# tally the number of languages using the combined list field we created before
documents['langs_all_count'] = documents.langs_all_list.apply(lambda x: len(x))

docs_no_lang = documents[documents['langs_all_count'] == 0].shape[0]
docs_any_lang = documents[documents['langs_all_count'] != 0].shape[0]
docs_one_lang = documents[documents['langs_all_count'] == 1].shape[0]
docs_multi_lang = documents[documents['langs_all_count'] > 1].shape[0]

essay_numbers.update({
    'totalLangauges': f"{len(doc_lang_counts.language.unique()):,}",
    'documentsNoLang': f"{docs_no_lang:,}",
    'documentsAnyLang': f"{docs_any_lang:,}",
    'documentsOneLang': f"{docs_one_lang:,}",
    'documentsMultiLang': f"{docs_multi_lang:,}",
    # what % of documents with any language are multi/mono lingual?
    'percentDocsMultiLang': f"{docs_multi_lang/docs_any_lang:.1%}",
    'percentDocsMonoLang': f"{docs_one_lang/docs_any_lang:.1%}"
})

## Dates

In [9]:
# limit to documents with standard date OR inferred date information
dated_docs = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()].copy()

# collect dates into a single field
dated_docs['date'] = dated_docs.apply(lambda x: x.doc_date_standard.strip() if pd.notna(x.doc_date_standard) else x.inferred_date_standard.strip(), axis=1)


total_docs_anydate = dated_docs[dated_docs.date.notna()].shape[0]
total_date_on_doc = dated_docs[dated_docs.doc_date_standard.notna()].shape[0]
# inferred date only (don't recount date on doc)
total_inferred_date = dated_docs[dated_docs.doc_date_standard.isna() & dated_docs.inferred_date_standard.notna()].shape[0]
essay_numbers.update({
    'totalDatedDocs': f"{total_docs_anydate:,}",
    'percentDatedDocs': f"{total_docs_anydate / total_documents:.1%}",
    'totalDateOnDoc': f"{total_date_on_doc:,}",
    'totalInferredDate': f"{total_inferred_date:,}"
})

In [10]:
essay_numbers

{'totalDocuments': '35,194',
 'singleFragmentDocuments': '33,644',
 'totalJoins': '1,550',
 'totalLetter': '11,212',
 'totalLegalDocument': '7,868',
 'totalListOrTable': '5,415',
 'totalUnknown': '4,175',
 'totalLiteraryText': '2,338',
 'totalStateDocument': '2,096',
 'totalParaliteraryText': '1,130',
 'totalCreditInstrumentOrPrivateReceipt': '554',
 'totalLegalQueryOrResponsum': '401',
 'totalInscription': '5',
 'uniqueTags': '2,674',
 'taggedDimme': 1640,
 'taggedAccount': 759,
 'taggedCommunal': 751,
 'taggedIllness': 691,
 'taggedIllnessLetter': 648,
 'taggedArabicScript': 629,
 'singletonTags': '1,064',
 'totalLangauges': '54',
 'documentsNoLang': '6,161',
 'documentsAnyLang': '29,033',
 'documentsOneLang': '22,443',
 'documentsMultiLang': '6,590',
 'percentDocsMultiLang': '22.7%',
 'percentDocsMonoLang': '77.3%',
 'totalDatedDocs': '4,455',
 'percentDatedDocs': '12.7%',
 'totalDateOnDoc': '4,127',
 'totalInferredDate': '328'}

## People & Places

In [11]:
# how many records in people & places data files?

essay_numbers["totalPeopleRecords"] = pd.read_csv(pgp_csv_paths['people']).shape[0]
essay_numbers["totalPlaceRecords"] = pd.read_csv(pgp_csv_paths['places']).shape[0]

In [12]:
essay_numbers

{'totalDocuments': '35,194',
 'singleFragmentDocuments': '33,644',
 'totalJoins': '1,550',
 'totalLetter': '11,212',
 'totalLegalDocument': '7,868',
 'totalListOrTable': '5,415',
 'totalUnknown': '4,175',
 'totalLiteraryText': '2,338',
 'totalStateDocument': '2,096',
 'totalParaliteraryText': '1,130',
 'totalCreditInstrumentOrPrivateReceipt': '554',
 'totalLegalQueryOrResponsum': '401',
 'totalInscription': '5',
 'uniqueTags': '2,674',
 'taggedDimme': 1640,
 'taggedAccount': 759,
 'taggedCommunal': 751,
 'taggedIllness': 691,
 'taggedIllnessLetter': 648,
 'taggedArabicScript': 629,
 'singletonTags': '1,064',
 'totalLangauges': '54',
 'documentsNoLang': '6,161',
 'documentsAnyLang': '29,033',
 'documentsOneLang': '22,443',
 'documentsMultiLang': '6,590',
 'percentDocsMultiLang': '22.7%',
 'percentDocsMonoLang': '77.3%',
 'totalDatedDocs': '4,455',
 'percentDatedDocs': '12.7%',
 'totalDateOnDoc': '4,127',
 'totalInferredDate': '328',
 'totalPeopleRecords': 1333,
 'totalPlaceRecords': 409

## Output as LaTeX variables

In [13]:
# output numbers for the essay as LaTeX defines so they can be referenced in-text

def numbers_to_latex(essay_numbers):
    # output numbers for the essay as LaTeX defines so they can be referenced in-text
    result = []
    result.append("%% numbers from data")
    for name, value in essay_numbers.items():
        # ... or don't [add a space since latex seems to swallow the following space in paragraph text]
        # escape special chars in the output
        result.append(r"\def\%s{%s}" % (name, str(value).replace("%", r"\%")))
    return "\n".join(result)
    
print(numbers_to_latex(essay_numbers))

%% numbers from data
\def\totalDocuments{35,194}
\def\singleFragmentDocuments{33,644}
\def\totalJoins{1,550}
\def\totalLetter{11,212}
\def\totalLegalDocument{7,868}
\def\totalListOrTable{5,415}
\def\totalUnknown{4,175}
\def\totalLiteraryText{2,338}
\def\totalStateDocument{2,096}
\def\totalParaliteraryText{1,130}
\def\totalCreditInstrumentOrPrivateReceipt{554}
\def\totalLegalQueryOrResponsum{401}
\def\totalInscription{5}
\def\uniqueTags{2,674}
\def\taggedDimme{1640}
\def\taggedAccount{759}
\def\taggedCommunal{751}
\def\taggedIllness{691}
\def\taggedIllnessLetter{648}
\def\taggedArabicScript{629}
\def\singletonTags{1,064}
\def\totalLangauges{54}
\def\documentsNoLang{6,161}
\def\documentsAnyLang{29,033}
\def\documentsOneLang{22,443}
\def\documentsMultiLang{6,590}
\def\percentDocsMultiLang{22.7\%}
\def\percentDocsMonoLang{77.3\%}
\def\totalDatedDocs{4,455}
\def\percentDatedDocs{12.7\%}
\def\totalDateOnDoc{4,127}
\def\totalInferredDate{328}
\def\totalPeopleRecords{1333}
\def\totalPlaceRecor