## Scholarship records

In [12]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths

# Read the "sources" CSV (path looked up in the project's pgp_csv_paths mapping)
# and preview the first rows.
sources = pd.read_csv(filepath_or_buffer=pgp_csv_paths["sources"])
sources.head(5)

Unnamed: 0,source_type,authors,title,journal_book,volume,issue,year,place_published,publisher,edition,other_info,page_range,languages,url,notes,citation,num_footnotes
0,Unpublished,"Masback, Grace",,,,,2020.0,,,,,,,,Created from PGPID 17049,"Grace Masback, (2020).",1
1,Unpublished,"Perez, Idan",,,,,,,,,,,,,Created from PGPID 31295,Idan Perez.,3
2,Unpublished,"Diem, Werner",,,,,,,,,,,German,,Created from PGPID 31584,Werner Diem.,5
3,Unpublished,"Cobb, Paul ; Rustow, Marina",,,,,,,,,,,,,Created from PGPID 30837,Paul Cobb and Marina Rustow.,1
4,Unpublished,"Khan, Geoffrey ; Rustow, Marina",,,,,,,,,,,,,Created from PGPID 30838,Geoffrey Khan and Marina Rustow.,2


In [13]:
# Total number of source records (rows), with thousands separators.
print(f"{len(sources):,} sources")

679 sources


In [14]:
# Number of source records per source type, most frequent first.
(
    sources["source_type"]
    .value_counts()
    .reset_index()
)

Unnamed: 0,source_type,count
0,Unpublished,264
1,Article,245
2,Book,94
3,Book Section,55
4,Dissertation,20
5,Blog,1


In [15]:
# Per source type: how many source records there are and the total of their
# num_footnotes values, ordered by number of sources (largest first) and
# relabelled with presentation-friendly column names.
source_totals = (
    sources.groupby("source_type")
    .agg(
        count=("source_type", "count"),
        num_footnotes=("num_footnotes", "sum"),
    )
    .reset_index()
    .sort_values("count", ascending=False)
    .rename(
        columns={
            "source_type": "Type",
            "count": "Sources",
            "num_footnotes": "Footnotes",
        }
    )
)
source_totals

Unnamed: 0,Type,Sources,Footnotes
5,Unpublished,264,14908
0,Article,245,665
2,Book,94,6834
3,Book Section,55,217
4,Dissertation,20,934
1,Blog,1,1


In [16]:
# Emit the summary table as LaTeX (without the DataFrame index) for copy/paste.
source_totals_latex = source_totals.to_latex(index=False)
print(source_totals_latex)

\begin{tabular}{lrr}
\toprule
Type & Sources & Footnotes \\
\midrule
Unpublished & 264 & 14908 \\
Article & 245 & 665 \\
Book & 94 & 6834 \\
Book Section & 55 & 217 \\
Dissertation & 20 & 934 \\
Blog & 1 & 1 \\
\bottomrule
\end{tabular}



In [17]:
# Open questions:
#   - How many documents do sources relate to?
#   - Goitein sources: split out into volumes to wrangle the scale.
#   - What does it look like if we leave Goitein out?

# Distribution of footnotes per source: for the ten smallest num_footnotes
# values, how many sources have exactly that many footnotes.
(
    sources["num_footnotes"]
    .value_counts()
    .reset_index()
    .sort_values("num_footnotes")
    .head(10)
)

Unnamed: 0,num_footnotes,count
4,0,22
0,1,231
1,2,115
2,3,47
3,4,33
5,5,21
6,6,16
8,7,10
9,8,9
11,9,7


In [18]:
# Build a long-format table with one row per (source, author) pair.
# `authors` stores all names for a source in one string, separated by ";".
#
# Everything is done in a single chain on a fresh frame, so no column is ever
# assigned on a filtered slice (which is what triggers SettingWithCopyWarning),
# and re-running the cell always gives the same result.
source_authors = (
    sources[["source_type", "title", "authors"]]
    # split "A ; B" into ["A ", " B"]; sources without authors stay NaN
    .assign(author=lambda df: df["authors"].str.split(";"))
    .drop(columns="authors")
    # one row per list element; the original source index is repeated
    .explode("author")
    # remove the whitespace that surrounded the ";" separators
    .assign(author=lambda df: df["author"].str.strip())
    # filter out unset authors, including blank fragments left by a trailing
    # or doubled ";" which would otherwise be counted as an author
    .loc[lambda df: df["author"].notna() & df["author"].ne("")]
)
source_authors.head(10)

Unnamed: 0,source_type,title,author
0,Unpublished,,"Masback, Grace"
1,Unpublished,,"Perez, Idan"
2,Unpublished,,"Diem, Werner"
3,Unpublished,,"Cobb, Paul"
3,Unpublished,,"Rustow, Marina"
4,Unpublished,,"Khan, Geoffrey"
4,Unpublished,,"Rustow, Marina"
5,Unpublished,,"Zinger, Oded"
6,Unpublished,,"Marglin, J."
7,Unpublished,,"Margariti, R. E."


In [19]:
# Number of distinct author names (dropna=False keeps this identical to
# counting the entries returned by .unique()).
unique_author_count = source_authors["author"].nunique(dropna=False)
print(f"{unique_author_count:,} unique authors")

183 unique authors


In [20]:
# The twelve authors attached to the most source records.
author_counts = source_authors["author"].value_counts()
author_counts.head(12)

author
Goitein, S. D.               197
Elbaum, Alan                  64
Rustow, Marina                61
Umrethwala, Yusuf             42
Friedman, Mordechai Akiva     41
Gil, Moshe                    19
Ashur, Amir                   18
Khan, Geoffrey                16
Vanthieghem, Naïm             14
Zinger, Oded                  13
David, Avraham                12
Cohen, Mark                   11
Name: count, dtype: int64