## Tags

Summary information about tags: how many tags are there, how much are they reused or varied, and how densely or sparsely are documents tagged?

In [9]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths, chart_dir

# Load the documents CSV located via the project's path lookup.
# The `tags` column holds each document's tags as a single comma-separated
# string, and is NaN for documents that have no tags.
documents = pd.read_csv(pgp_csv_paths["documents"])

# Preview the raw tag strings before parsing them.
documents["tags"].head(10)

0                  communal, excommunication
1                                        NaN
2                        betrothal, Marriage
3        Levirate marriage, Marriage, Mamluk
4      Jewish community, physician, medicine
5                            Shehita, kosher
6    trade, Nahray B. Nissim, Marduk b. Musa
7                          Mevorakh b. Natan
8                ketubba, Damascus, Marriage
9                    Tyre, ketubba, Marriage
Name: tags, dtype: object

In [10]:
def count_tags(raw_tags):
    """Return how many tags a raw, comma-separated tag string contains.

    Missing values (NaN/None) count as zero tags. Otherwise the string is
    stripped of surrounding whitespace and split on the literal ", "
    delimiter.
    """
    if pd.isna(raw_tags):
        return 0
    return len(raw_tags.strip().split(", "))


# Per-document tag count, stored alongside the raw tag string.
documents["num_tags"] = documents["tags"].map(count_tags)
documents[["pgpid", "tags", "num_tags"]].head(10)

Unnamed: 0,pgpid,tags,num_tags
0,444,"communal, excommunication",2
1,445,,0
2,446,"betrothal, Marriage",2
3,447,"Levirate marriage, Marriage, Mamluk",3
4,448,"Jewish community, physician, medicine",3
5,449,"Shehita, kosher",2
6,451,"trade, Nahray B. Nissim, Marduk b. Musa",3
7,453,Mevorakh b. Natan,1
8,454,"ketubba, Damascus, Marriage",3
9,455,"Tyre, ketubba, Marriage",3


In [11]:
# Distribution of documents by how many tags they carry.
# value_counts() orders rows by frequency, which scrambles the tail of the
# distribution (e.g. 14 is listed before 13); sort by the number of tags so
# the table reads as an ordered histogram. The index stays `num_tags` and the
# values stay `count`, so downstream use via reset_index() is unaffected.
document_tag_counts = documents["num_tags"].value_counts().sort_index()
document_tag_counts

num_tags
0     18533
1      8174
2      4157
3      1993
4       978
5       559
6       325
7       163
8       114
9        74
10       45
11       31
12       14
14       12
13       11
15        5
16        2
19        2
17        2
Name: count, dtype: int64

In [12]:
# Take the x-axis extent from the data instead of hard-coding it: a fixed
# upper bound silently leaves the most heavily tagged documents outside the
# axis range whenever the maximum tag count exceeds it.
max_num_tags = int(document_tag_counts.index.max())

# Bar chart: number of documents (y) for each tag count (x).
alt.Chart(document_tag_counts.reset_index()).mark_bar().encode(
    alt.X("num_tags", title="# Tags").scale(domain=[0, max_num_tags]),
    y=alt.Y("count", title="# Documents"),
).properties(title="Tag frequency by document")

In [13]:
# Turn each document's tag string into a list, then reshape to one row per
# (document, tag) pair so tags can be analysed individually.


def split_tags(raw_tags):
    """Split a raw comma-separated tag string into a list of tags.

    Missing values (NaN/None) yield an empty list. Otherwise the string is
    stripped of surrounding whitespace and split on the literal ", "
    delimiter.
    """
    if pd.isna(raw_tags):
        return []
    return raw_tags.strip().split(", ")


documents["tag_list"] = documents["tags"].map(split_tags)

docs_tags = (
    documents[["pgpid", "tag_list"]]
    .explode("tag_list")
    .rename(columns={"tag_list": "tag"})
    # exploding an empty list leaves a NaN placeholder row; drop those
    .dropna(subset=["tag"])
)
docs_tags.head(10)

Unnamed: 0,pgpid,tag
0,444,communal
0,444,excommunication
2,446,betrothal
2,446,Marriage
3,447,Levirate marriage
3,447,Marriage
3,447,Mamluk
4,448,Jewish community
4,448,physician
4,448,medicine


How many unique tags?

In [14]:
# Tags are compared as exact strings, so variants that differ only in case
# or spelling are counted as separate tags.
unique_tag_count = docs_tags["tag"].nunique(dropna=False)
print(f"{unique_tag_count:,} unique tags")

2,674 unique tags


What are the most common tags?

In [15]:
# Count how many documents use each tag, most frequent first.
tag_usage = docs_tags["tag"].value_counts()

# Move the tag labels out of the index so the result is a plain two-column
# frame (`tag`, `count`) that is easy to filter.
tag_counts = tag_usage.reset_index()

tag_counts.head(15)

Unnamed: 0,tag,count
0,DIMME,1640
1,account,759
2,communal,751
3,Illness,691
4,illness letter 969-1517,648
5,Arabic script,629
6,ketubba,551
7,India,538
8,FGP stub,520
9,Halfon b. Menashshe,515


How many tags are used only once?

In [16]:
# Tags that are attached to exactly one document.
used_once = tag_counts["count"].eq(1)
singletons = tag_counts.loc[used_once]
singletons.head(10)

Unnamed: 0,tag,count
1610,ya'aqov castro,1
1611,al-Ustul,1
1612,pen,1
1613,water lily,1
1614,china,1
1615,al-Razi,1
1616,tibet,1
1617,Epicureanism,1
1618,sama',1
1619,late Heb literary,1


In [17]:
# Report the number of single-use tags with a thousands separator.
singleton_total = len(singletons)
print(f"{singleton_total:,} tags used only once")

1,064 tags used only once
