## Documents and Fragments

In [36]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths, chart_dir

documents = pd.read_csv(pgp_csv_paths["documents"])

In [37]:
total_documents = documents.shape[0]
print(f"{total_documents:,} total documents")

35,194 total documents


### Documents by number of fragments

How many fragments are typically used to compose a single document?

The `shelfmark` field in the data is a composite field; a " + " is used to indicate multiple shelfmarks.

NOTE: this does not account for shelfmark overrides, e.g. ranges of shelfmarks.

In [38]:
documents_fragments = documents.copy()

documents_fragments["fragments"] = documents_fragments.shelfmark.apply(lambda x : x.split(' + '))
documents_fragments["num_fragments"] = documents_fragments.fragments.apply(lambda x : len(x))
documents_fragments.num_fragments.value_counts().head(10)

num_fragments
1     33644
2      1178
3       183
4        87
5        45
6        24
8        11
7         8
11        2
17        2
Name: count, dtype: int64

What percent of documents occur on a single fragment?

In [39]:
single_frag_docs = documents_fragments[documents_fragments.num_fragments == 1].shape[0]
single_frag_docs

33644

In [40]:
joins = documents_fragments[documents_fragments.num_fragments > 1].shape[0]
print(f"{joins:,} total joins")

1,550 total joins


What are some examples of documents that span more than 6 fragments?

In [41]:
documents_fragments[documents_fragments.num_fragments > 6][['pgpid', 'shelfmark', 'type', 'description', ]].head(10)

Unnamed: 0,pgpid,shelfmark,type,description
3983,5371,T-S NS 184.58 + T-S NS 184.62 + T-S NS 184.50 ...,Legal document,"India Book (IB) I, 34a.\r\nLegal proceedings w..."
4176,5634,Bodl. MS heb. f 56/14 + Bodl. MS heb. f 56/15 ...,Literary text,Copy of a story about the edicts against the J...
6385,8297,T-S Ar.50.197 + T-S Ar.50.154 + T-S G1.80 + T-...,Legal query or responsum,Book of responsa. Joins by Amir Ashur. See FGP...
7433,9813,ENA 330.10 + ENA 330.12 + ENA 330.13 + ENA 330...,List or table,"Donations ledger, ca. 1800 CE. There are sever..."
7491,9938,ENA 1066.1 + ENA 1066.2 + ENA 1066.3 + ENA 106...,List or table,"Ledger of accounts, late, of a Venetian merchant."
7496,9952,ENA NS I.57 + CUL Or.1080 J253 + T-S Ar.35.382...,List or table,Ledger of expenses and incomes of the Mustaʿri...
7681,10150,ENA 3450.14 + ENA 3450.17 + ENA 3450.20 + ENA ...,Literary text,Booklet consisting mostly of a text that seems...
7881,10392,ENA 3938.5 + ENA 3938.6 + ENA 3938.7 + ENA 393...,Literary text,Seven pages of an unidentified long-form work ...
8327,10864,ENA 3980.1 + ENA 3980.2 + ENA 3980.3 + ENA 398...,Literary text,Literary text in Arabic script.
8960,11576,ENA NS 77.7 + ENA NS 77.16 + ENA NS 77.17 + EN...,Legal document,Legal document. Seven fragments joined togethe...


In [42]:
# aggregate documents by the number of fragments they span, for graphing
docs_per_num_fragments = documents_fragments[['num_fragments', 'pgpid']].groupby('num_fragments').count().reset_index().rename(columns={"pgpid": "count"})
# calculate a percentage of the total
docs_per_num_fragments["percent"] = docs_per_num_fragments["count"].apply(lambda x : f"{(x / total_documents):.2%}")

docs_per_num_fragments.head(10)

Unnamed: 0,num_fragments,count,percent
0,1,33644,95.60%
1,2,1178,3.35%
2,3,183,0.52%
3,4,87,0.25%
4,5,45,0.13%
5,6,24,0.07%
6,7,8,0.02%
7,8,11,0.03%
8,9,1,0.00%
9,10,1,0.00%


What does the long tail look like, of documents spanning a large number of fragments?

In [43]:
# TODO: think about whether this can be condensed into a meaniengful table to display alongside the chart
docs_per_num_fragments.tail(10)

Unnamed: 0,num_fragments,count,percent
10,11,2,0.01%
11,13,1,0.00%
12,15,1,0.00%
13,16,1,0.00%
14,17,2,0.01%
15,22,1,0.00%
16,25,1,0.00%
17,32,1,0.00%
18,34,1,0.00%
19,64,1,0.00%


In [44]:
# group for display in a table
# - get values for all documents on fewer than 7 fragments
docs_per_nfrags = docs_per_num_fragments[docs_per_num_fragments.num_fragments < 7]
# - aggregate values for 7+
docs_manyfrags = docs_per_num_fragments[docs_per_num_fragments.num_fragments > 7]
total_docs_7plusfrags = docs_manyfrags["count"].sum()
pct_docs_7plusfrags =  f"{(total_docs_7plusfrags / total_documents):.2%}"

agg_values = {"num_fragments": f"{docs_manyfrags.num_fragments.min()}-{docs_manyfrags.num_fragments.max()}", "count": total_docs_7plusfrags, "percent": pct_docs_7plusfrags}
docs_per_nfrags = docs_per_nfrags._append(agg_values, ignore_index=True)
docs_per_nfrags

Unnamed: 0,num_fragments,count,percent
0,1,33644,95.60%
1,2,1178,3.35%
2,3,183,0.52%
3,4,87,0.25%
4,5,45,0.13%
5,6,24,0.07%
6,8-64,25,0.07%


In [45]:
print(docs_per_nfrags.to_latex(index=False))

\begin{tabular}{lrl}
\toprule
num_fragments & count & percent \\
\midrule
1 & 33644 & 95.60% \\
2 & 1178 & 3.35% \\
3 & 183 & 0.52% \\
4 & 87 & 0.25% \\
5 & 45 & 0.13% \\
6 & 24 & 0.07% \\
8-64 & 25 & 0.07% \\
\bottomrule
\end{tabular}



In [46]:

docs_frags_chart = alt.Chart(docs_per_num_fragments).mark_bar().encode(
    x=alt.X('num_fragments', title='Number of fragments'),
    y=alt.Y('count', title='Documents').scale(type="symlog")
).properties(
    # title='Documents by number of fragments',
    width=400
)
docs_frags_chart.save(f'{chart_dir}/documents_per_num_frags.pdf')
docs_frags_chart

### Fragments by number of documents

If we use fragments as our starting point, what is the distribution? How many documents are typically written on a fragment, and are there similar outliers?

In [47]:
fragments = pd.read_csv(pgp_csv_paths["fragments"])
total_fragments = fragments.shape[0]

print(f"{total_fragments:,} fragments")

35,608 fragments


This data export has duplicates! How many?

In [48]:
print(f"{fragments[fragments.duplicated()].shape[0]:,} duplicates")

0 duplicates


In [49]:
# drop duplicates for now (but should fix in the data export)

uniq_fragments = fragments.drop_duplicates()
total_uniq_fragments = uniq_fragments.shape[0]

print(f"{total_uniq_fragments:,} fragments")

35,608 fragments


The fragments dataset includes a delimited list of associated PGPIDs; we can use that to determine how many documents are associated with a fragment.

In [50]:
uniq_fragments["num_documents"] = uniq_fragments.pgpids.apply(lambda x : len(x.split(';')))
uniq_fragments.num_documents.value_counts()

num_documents
1     33786
2      1657
3        94
4        32
5        28
6         7
7         2
8         1
25        1
Name: count, dtype: int64

What are some examples of fragments with a large number of documents?

In [51]:
uniq_fragments[uniq_fragments.num_documents > 6][['shelfmark', 'pgpids', 'num_documents']].head(10)

Unnamed: 0,shelfmark,pgpids,num_documents
16051,NLI 577.1/54,33643 ; 38522 ; 38523 ; 38524 ; 38525 ; 38526 ...,7
16053,NLI 577.1/56,33645 ; 38528 ; 38528 ; 38529 ; 38529 ; 38530 ...,8
18451,T-S 13J5.1,1156 ; 2121 ; 6178 ; 39563 ; 39564 ; 39565 ; 3...,7
22461,T-S Ar.54.19,9248 ; 39116 ; 39117 ; 39118 ; 39119 ; 39120 ;...,25


In [52]:
# group by number of documents to aggregrate for plotting
frags_per_num_documents = uniq_fragments[['num_documents', 'shelfmark']].groupby('num_documents').count().reset_index().rename(columns={"shelfmark": "fragments"})
# calculate percenatage of total fragements
frags_per_num_documents["percent"] = frags_per_num_documents.fragments.apply(lambda x : (x / total_fragments) * 100)
frags_per_num_documents

Unnamed: 0,num_documents,fragments,percent
0,1,33786,94.883172
1,2,1657,4.653449
2,3,94,0.263986
3,4,32,0.089867
4,5,28,0.078634
5,6,7,0.019659
6,7,2,0.005617
7,8,1,0.002808
8,25,1,0.002808


In [53]:
# chart with log scale
frags_docs_chart = alt.Chart(frags_per_num_documents).mark_bar().encode(
    x=alt.X('num_documents', title='Number of Documents', scale=alt.Scale(zero=False)),
    y=alt.Y('fragments', title='Fragments', scale=alt.Scale(type="symlog"))
).properties(
#    title="Fragments by number of documents",
    width=400
)
frags_docs_chart.save(f'{chart_dir}/fragments_with_num_docs.pdf')
frags_docs_chart

In [54]:

# combine horizontally for display in pdf
(docs_frags_chart | frags_docs_chart).save(f'{chart_dir}/docs_frags_2up.pdf')
(docs_frags_chart | frags_docs_chart)