## Metadata fields available

To what extent are metadata fields filled out?


In [9]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths, chart_dir

documents = pd.read_csv(pgp_csv_paths["documents"])

total_documents = documents.shape[0]

In [10]:
# calculate what percent of records have which field (code adapted from S&co dataset notebook)
def percent_known(df, field):
  total = len(df)
  with_field = len(df[df[field].notnull()])
  print('%d with %s : %.00f%%' % (with_field, field, (with_field/total)*100))
  return with_field

In [11]:
total_known = {}
metadata_fields = ['type', 'description', 'tags', 'languages_primary', 'languages_secondary', 'doc_date_original', 'doc_date_standard']

for field in metadata_fields:
    total_known[field] = percent_known(documents, field)

print(total_known)

31019 with type : 88%
35192 with description : 100%
16661 with tags : 47%
29029 with languages_primary : 82%
1645 with languages_secondary : 5%
3809 with doc_date_original : 11%
4127 with doc_date_standard : 12%
{'type': 31019, 'description': 35192, 'tags': 16661, 'languages_primary': 29029, 'languages_secondary': 1645, 'doc_date_original': 3809, 'doc_date_standard': 4127}


In [12]:
# are there any records where secondary language is known but primary is not?
documents[documents.languages_primary.isna() & documents.languages_secondary.notna()]

Unnamed: 0,pgpid,url,iiif_urls,fragment_urls,shelfmark,multifragment,side,region,type,tags,...,inferred_date_standard,inferred_date_rationale,inferred_date_notes,initial_entry,last_modified,input_by,library,collection,has_transcription,has_translation
9850,12556,https://geniza.princeton.edu/documents/12556/,https://figgy.princeton.edu/concern/scanned_re...,,ENA 3485.4,,,,List or table,list,...,,,,2017-08-01 04:00:00+00:00,2022-08-18 15:13:08.982739+00:00,Amir Ashur ; Marina Rustow,JTS,JTS,N,N
20129,23995,https://geniza.princeton.edu/documents/23995/,,,T-S NS 224.173,,,,,,...,,,,2019-01-01 05:00:00+00:00,2022-03-06 12:46:52.132307+00:00,Amir Ashur ; Marina Rustow,CUL,"CUL, T-S",N,N
21851,25941,https://geniza.princeton.edu/documents/25941/,,,T-S NS 338.53,,,,Legal document,,...,,,,2019-01-01 05:00:00+00:00,2021-12-29 19:19:22.004847+00:00,Alan Elbaum ; Jessica Parker ; Marina Rustow,CUL,"CUL, T-S",N,N
24257,28685,https://geniza.princeton.edu/documents/28685/,https://princetongenizalab.github.io/iiif/jrl/...,https://luna.manchester.ac.uk/luna/servlet/vie...,JRL SERIES B 4501,,,,List or table,FGP stub,...,,,,2019-01-01 05:00:00+00:00,2021-09-25 15:58:48.429095+00:00,Amir Ashur ; Marina Rustow,JRL,JRL,N,N


In [13]:
# specify keys as index to preserve field order as specified above
known = pd.DataFrame({'total': total_known}, index=total_known.keys())
known['percent'] = (known['total'] / total_documents*100)
# known['order'] = known.loc.apply(lambda x: metadata_fields.index(x))
known.percent = known.percent.round(2)
known

Unnamed: 0,total,percent
type,31019,88.14
description,35192,99.99
tags,16661,47.34
languages_primary,29029,82.48
languages_secondary,1645,4.67
doc_date_original,3809,10.82
doc_date_standard,4127,11.73


In [14]:

total_documents = len(documents.index)
known_unknown = pd.DataFrame()
known_unknown['known'] = known['total']
known_unknown['unknown'] = known.total.apply(lambda x: total_documents - x)
known_unknown

Unnamed: 0,known,unknown
type,31019,4175
description,35192,2
tags,16661,18533
languages_primary,29029,6165
languages_secondary,1645,33549
doc_date_original,3809,31385
doc_date_standard,4127,31067


In [15]:
# reshape for plotting
known_unknown_melted = known_unknown.reset_index().melt(id_vars="index", value_vars=["known", "unknown"])

known_unknown_melted = known_unknown_melted.rename(columns={"index": "field", "variable": "status", "value": "documents"})
known_unknown_melted.head(10)

Unnamed: 0,field,status,documents
0,type,known,31019
1,description,known,35192
2,tags,known,16661
3,languages_primary,known,29029
4,languages_secondary,known,1645
5,doc_date_original,known,3809
6,doc_date_standard,known,4127
7,type,unknown,4175
8,description,unknown,2
9,tags,unknown,18533


In [16]:
field_order = ["type", "description", "tags", "languages_primary", "languages_secondary", "doc_date_original", "doc_date_standard"]

known_melted = known_unknown_melted[known_unknown_melted.status == "known"]

metadata_chart = alt.Chart(known_melted).mark_bar().encode(
    x=alt.X("documents", title="Documents with data populated").scale(domain=[0, total_documents]),
    y=alt.Y("field", title="Metadata field").scale(domain=field_order),
    # color=alt.Color("status").scale(domain=["known", "unknown"], range=["#7bac7b", "lightgray"]),
).properties(width=650, height=200)

metadata_chart

In [17]:
known_melted['label'] = known_melted.documents.apply(lambda x: f'{x:,} ({x/total_documents:.0%})')

# text labels for metadata chart
text = alt.Chart(known_melted).mark_text(
     dy=0,
     color=alt.expr(alt.expr.if_(alt.datum.documents > 5000, "white", "black")),
     align=alt.expr(alt.expr.if_(alt.datum.documents > 5000, "right", "left")),
     dx=alt.expr(alt.expr.if_(alt.datum.documents > 5000, -10, 5))
  ).encode(
    x=alt.X('documents').scale(domain=[0, total_documents]),
    y=alt.Y("field", title="Metadata field").scale(domain=field_order),
    text=alt.Text('label')
)

(metadata_chart + text).save(f"{chart_dir}/metadata_available.pdf")
metadata_chart + text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_melted['label'] = known_melted.documents.apply(lambda x: f'{x:,} ({x/total_documents:.0%})')
