In [1]:
import pandas as pd
import numpy as np
import nltk
import requests
import re
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
tab_colors_list = list(mcolors.TABLEAU_COLORS.keys())
import matplotlib.lines as mlines
import seaborn as sns

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

In [42]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:
conf = sddk.configure()

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
occupations = gc.open_by_url("https://docs.google.com/spreadsheets/d/1nONTEwp42CVnq3iCiONrFbJedIcYtBV-l4Bil5mU7Eo/edit?usp=sharing")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


# Loading datasets

In [3]:
# read edh dataset
publicfolder = "b6b6afdb969d378b70929e86e58ad975"
EDH = sddk.read_file("EDH_text_cleaned_2021-01-21.json", "df", publicfolder)
EDH.head(5)

reading file located in a public folder


Unnamed: 0,responsible_individual,type_of_inscription,letter_size,not_after,literature,work_status,height,diplomatic_text,people,depth,...,clean_text_conservative,clean_text_interpretive_word,clean_text_interpretive_sentence,findspot,year_of_find,present_location,religion,geography,social_economic_legal_history,military
0,Feraudi,epitaph,3.2-2 cm,130,"AE 1983, 0192.; M. Annecchino, Puteoli 4/5, 19...",provisional,33 cm,D M / NONIAE P F OPTATAE / ET C IVLIO ARTEMONI...,"[{'name': 'Noniae P.f. Optatae', 'cognomen': '...",2.7 cm,...,D M Noniae P f Optatae et C Iulio Artemoni par...,Dis Manibus Noniae Publi filiae Optatae et Cai...,Dis Manibus Noniae Publi filiae Optatae et Cai...,,,,,,,
1,Feraudi,epitaph,4 cm,200,"AE 1983, 0080. (A); A. Ferrua, RAL 36, 1981, 1...",no image,28 cm,C SEXTIVS PARIS / QVI VIXIT / ANNIS LXX,"[{'nomen': 'Sextius', 'praenomen': 'C.', 'pers...",,...,C Sextius Paris qui vixit annis LXX,Caius Sextius Paris qui vixit annis LXX,Caius Sextius Paris qui vixit annis LXX,"Via Nomentana, S. Alessandro, Kirche",1937,,,,,
2,Feraudi,honorific inscription,4.5-3 cm,170,"AE 1983, 0518. (B); J. González, ZPE 52, 1983,...",provisional,(37) cm,[ ]VMMIO [ ] / [ ]ISENNA[ ] / [ ] XV[ ] / [ ] / [,"[{'nomen': 'Mummius+', 'gender': 'male', 'prae...",(12) cm,...,ummio isenna Xv,Publio Mummio Publi filio Galeria Sisennae Rut...,Publio Mummio Publi filio Galeria Sisennae Rut...,,before 1975,"Sevilla, Privatbesitz",,,,
3,Gräf,votive inscription,2.5 cm,200,"AE 1983, 0533. (B); A.U. Stylow, Gerión 1, 198...",checked with photo,(39) cm,[ ]AVS[ ]LLA / M PORCI NIGRI SER / DOMINAE VEN...,[{'name': '[---?]AV(?)S(?)[---]L(?)L(?)A M. Po...,18 cm,...,AVSLLA M Porci Nigri ser dominae Veneri aram p...,AVS LLA Marci Porci Nigri serva dominae Veneri...,AVS LLA Marci Porci Nigri serva dominae Veneri...,,before 1979,"Carcabuey, Grupo Escolar",names of pagan deities,,,
4,Feraudi,epitaph,,200,"AE 1983, 0078. (B); A. Ferrua, RAL 36, 1981, 1...",no image,,[ ] L SVCCESSVS / [ ] L L IRENAEVS / [ ] C L T...,"[{'person_id': '1', 'name': '[---]l. Successus...",,...,l Successus L l Irenaeus C l Tyches unt renti f,libertus Successus Luci libertus Irenaeus Cai ...,libertus Successus Luci libertus Irenaeus Cai ...,Via Cupa (ehem. Vigna Nardi),,,,,,


In [4]:
# read edcs dataset
publicfolder = "1f5f56d09903fe259c0906add8b3a55e"
EDCS = sddk.read_file("EDCS_text_cleaned_2021-01-21.json", "df", publicfolder)
EDCS.head(5)

reading file located in a public folder


Unnamed: 0,EDCS-ID,publication,province,place,notes_dating,notes_references,notes_comments,inscription,inscription_stripped14,clean_text_interpretive_word,Links,language,dating from,dating to,status,Latitude,Longitude,photo,Material,Comment
0,EDCS-31400030,"CIL 03, 12297",Achaia,?,,,,Leius,Leius,Leius,,,,,,,,,,
1,EDCS-24700151,"CIL 01, 02650 (p 1097) = IG-05-01, 00741 = ILL...",Achaia,Afesou,,,,// D(ecimi) Leivei D(ecimi) Leivei salve,// D(ecimi) Leivei D(ecimi) Leivei salve,Decimi Leivei Decimi Leivei salve,http://db.edcs.eu/epigr/partner.php?s_language...,GR,,,,,,,,
2,EDCS-24900077,"CIL 01, 00746 (p 944) = D 00867 = ILLRP 00374 ...",Achaia,Agia Triada / Merbaka / Midea,,,,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Quinto Caecilio Cai filio Metello imperatori I...,http://db.edcs.eu/epigr/partner.php?s_language...,,-68.0,-68.0,officium/professio; ordo senatorius; tituli ho...,37.6381128,22.8052991,http://db.edcs.eu/epigr/bilder.php?bilder.php?...,,
3,EDCS-03700724,"ZPE-108-159 = Thesprotia 00001 = AE 1993, 0140...",Achaia,Agios Athanasios / Photike,,,,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et Piissimo Caesari domino nostro G...,http://db.edcs.eu/epigr/partner.php?s_language...,,309.0,313.0,Augusti/Augustae; litterae erasae; ordo equest...,39.4512182,20.7667673,http://db.edcs.eu/epigr/bilder.php?bilder.php?...,,
4,EDCS-55701593,"AE 2009, 01286a",Achaia,Agios Donatos / Photike,,,,Cn(aeus) Atei(us),Cn(aeus) Atei(us),Cnaeus Ateius,,,,,sigilla impressa; tituli fabricationis,39.4759759,20.5069076,,opus figlinae,


# Function to extract occupations

In [10]:
def extract_occup_v1(inscription_text):
    occups_found = []
    for occup in occups_declined_dict.keys():
        for occup_morph in occups_declined_dict[occup]:
            try:
                occup_morph_N = inscription_text.lower().count(occup_morph)
                if occup_morph_N > 0:
                    occups_found.extend([occup] * occup_morph_N)
                    inscription_text = inscription_text.replace(occup_morph, "")
                
            except: pass
    return occups_found

In [11]:
def extract_occup(inscription_text):
    occups_found = []
    try:
        inscription_text = inscription_text.lower()
    except:
        inscription_text = ""
    for occup in occups_declined_dict.keys():
        for occup_morph in occups_declined_dict[occup]:
            try:
                if occup_morph in inscription_text: # first check it this way, otherwise skip
                    occup_morph_N = len(re.findall("(\W|^)" + occup_morph + "(\W|$)", inscription_text))
                    occups_found.extend([occup] * occup_morph_N)
                    inscription_text = inscription_text.replace(occup_morph, "")
            except: pass
    return occups_found

In [43]:
extract_occup("fdfsdf. Lotori,")

[]

# Occupations - extraction

In [44]:
# read declined occupations
with open("../data/occups_declined_dict.json", "r") as fp:
    occups_declined_dict = json.load(fp)

In [45]:
# check that our occupations are properly arranged (from the longest...)
list(occups_declined_dict.keys())[:20]

['exactor auri argenti et aeris',
 'inclusor auri et gemmarum',
 'tesserarius lignarius',
 'refector pectinarius',
 'instructor parietum',
 'tribor argentarius',
 'carrocarpentarius',
 'dactilidiogliphus',
 'manceps salinarum',
 'manceps thermarum',
 'aequator monetae',
 'conchyliolegulus',
 'orchestopolarius',
 'rhypararographus',
 'calciamentarius',
 'chorographarius',
 'diffusor oleari',
 'domnicomontanus',
 'faber tignuarii',
 'medicamentarius']

## Occupations - test on a sample

In [46]:
# sample for testing
EDH_sample = EDH[:1000]
len(EDH_sample)

1000

In [47]:
%%time
EDH_sample["occups"] = EDH_sample["clean_text_interpretive_word"].apply(extract_occup)
EDH_occups_list = [el for sublist in EDH_sample["occups"].tolist() for el in sublist]
len(EDH_occups_list) 
# v1: 3.03s, 946 found
# once we remove after first match, then only 636
# but once we are sensitive to repeated occurences, we get 742
# when looking for within a lower case inscription, the number increases to 902

CPU times: user 849 ms, sys: 6.36 ms, total: 855 ms
Wall time: 851 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


61

In [48]:
EDH_sample["occups_N"] = EDH_sample["occups"].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [49]:
len(EDH_sample[EDH_sample["occups_N"] > 0])

46

In [51]:
# create sample and export it to gsheet
EDH_occups_sample = EDH_sample[EDH_sample["occups_N"] > 0][["id", "clean_text_interpretive_word", "occups", "type_of_inscription_clean"]]
set_with_dataframe(occupations.add_worksheet("EDH_occups_sample", 1, 1), EDH_occups_sample)

## Occupations - application on the whole dataset

In [52]:
%%time
EDH["occups"] = EDH["clean_text_interpretive_word"].apply(extract_occup)

CPU times: user 50.4 s, sys: 103 ms, total: 50.5 s
Wall time: 50.5 s


In [53]:
EDH_occups_list = [el for sublist in EDH["occups"].tolist() for el in sublist]
print(len(EDH_occups_list))
print(nltk.FreqDist(EDH_occups_list).most_common(30))

2745
[('faber', 407), ('curator', 321), ('medicus', 243), ('scriba', 110), ('aerarius', 107), ('agricola', 89), ('pollio', 81), ('negotiator', 79), ('centonarius', 67), ('vexillarius', 59), ('mercator', 45), ('argentarius', 42), ('mensor', 39), ('cerdo', 36), ('cornicen', 28), ('arcarius', 26), ('conditor', 26), ('architectus', 23), ('nauta', 22), ('dendrophorus', 19), ('tignarius', 18), ('structor', 16), ('ornatrix', 16), ('lapidarius', 16), ('vestiarius', 15), ('sutor', 15), ('venator', 15), ('forensis', 14), ('scutarius', 14), ('coactor', 13)]


first results (for comparison):

```[('curator', 2833), ('ustor', 2092), ('plastes', 1076), ('psaltes', 866), ('cerdo', 730), ('uctor', 592), ('faber', 458), ('promus', 320), ('sagittarius', 236), ('erarius', 217), ('scriba', 194), ('negotiator', 191), ('medicus', 137), ('tector', 136), ('centonarius', 132), ('emporus', 113), ('gladiator', 112), ('aerarius', 104), ('marmorarienses', 102), ('conditor', 76), ('vexillarius', 74), ('lanius', 72), ('scaenicus', 71), ('tesserarius', 70), ('argentarius', 65), ('mensor', 62), ('scriptor', 61), ('figlus', 56), ('nauta', 53), ('cellio', 52)]```

In [54]:
EDH["occups_N"] = EDH["occups"].apply(len)

In [55]:
%%time
EDCS["occups"] = EDCS["clean_text_interpretive_word"].apply(extract_occup)

CPU times: user 3min 43s, sys: 169 ms, total: 3min 43s
Wall time: 3min 43s


In [56]:
EDCS_occups_list = [el for sublist in EDCS["occups"].tolist() for el in sublist]
print(len(EDCS_occups_list))
print(nltk.FreqDist(EDCS_occups_list).most_common(30))

10085
[('faber', 1152), ('curator', 915), ('medicus', 668), ('aerarius', 513), ('scriba', 392), ('cocus', 289), ('vexillarius', 260), ('cerdo', 254), ('pollio', 245), ('agricola', 225), ('mercator', 217), ('centonarius', 217), ('argentarius', 170), ('negotiator', 159), ('figulus', 138), ('mensor', 136), ('vestiarius', 104), ('dendrophorus', 91), ('lanius', 89), ('conditor', 89), ('copo', 86), ('architectus', 81), ('arcarius', 79), ('nauta', 68), ('vinarius', 59), ('structor', 58), ('ornatrix', 56), ('fullo', 52), ('pistor', 52), ('sutor', 48)]


previous version
```40509
[('plastes', 5934), ('curator', 4368), ('figlus', 4251), ('cerdo', 2905), ('ustor', 2863), ('psaltes', 2205), ('faber', 2162), ('pollio', 907), ('medicus', 723), ('uctor', 671), ('scriba', 558), ('mercator', 546), ('aerarius', 517), ('promus', 485), ('cocus', 437), ('figulus', 419), ('sagittarius', 376), ('marmorarienses', 360), ('lanius', 334), ('emporus', 316), ('agricola', 310), ('fullo', 279), ('vexillarius', 273), ('gladiator', 248), ('restio', 246), ('aedifex', 245), ('negotiator', 244), ('vitor', 242), ('tector', 230), ('centonarius', 220)]
```

In [57]:
EDCS["occups_N"] = EDCS["occups"].apply(len)

## Occupations - overview

In [58]:
print("EDH - number of occupation occurances: " + str(EDH["occups_N"].sum()))
print("EDH - number of inscriptions with at least one occupation mentioned: " + str(len(EDH[EDH["occups_N"] > 0])))
print("EDH - number of dated inscriptions with at least one occupation mentioned: "+ str(len(EDH[(EDH["not_before"].notnull()) & (EDH["occups_N"] > 0)])))
print("EDH - number of occupation occurances in dated inscriptions: " + str(EDH[EDH["not_before"].notnull()]["occups_N"].sum()))

EDH - number of occupation occurances: 2745
EDH - number of inscriptions with at least one occupation mentioned: 2336
EDH - number of dated inscriptions with at least one occupation mentioned: 1529
EDH - number of occupation occurances in dated inscriptions: 1788


In [59]:
print("EDCS - number of occupation occurances: " + str(EDCS["occups_N"].sum()))
print("EDCS - number of inscriptions with at least one occupation mentioned: " + str(len(EDCS[EDCS["occups_N"] > 0])))
print("EDCS - number of dated inscriptions with at least one occupation mentioned: "+ str(len(EDCS[(EDCS["dating to"].notnull()) & (EDCS["occups_N"] > 0)])))
print("EDCS - number of occupation occurances in dated inscriptions: " + str(EDCS[EDCS["dating to"].notnull()]["occups_N"].sum()))

EDCS - number of occupation occurances: 10085
EDCS - number of inscriptions with at least one occupation mentioned: 8513
EDCS - number of dated inscriptions with at least one occupation mentioned: 4771
EDCS - number of occupation occurances in dated inscriptions: 5748


## Saving to Sciencedata

In [61]:
# login to our project folder, owned by my AU account 648597@au.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/


In [63]:
sddk.write_file("SDAM_data/social_diversity/EDH_occups_2021-02-19.json", EDH, conf)
sddk.write_file("SDAM_data/social_diversity/EDCS_occups_2021-02-19.json", EDCS, conf)

A file with the same name ("EDH_occups_2021-02-19.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/social_diversity/EDH_occups_2021-02-19.json"
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/social_diversity/EDCS_occups_2021-02-19.json"


# Organizations - extraction

In [22]:
# read declined occupations
with open("../data/organizations_declined_dict.json", "r") as fp:
    occups_declined_dict = json.load(fp)

In [23]:
# check that our occupations are properly arranged (from the longest...)
list(occups_declined_dict.keys())[:15]

['collegatarius',
 'collegiarius',
 'collegiatus',
 'corporatus',
 'sodalicium',
 'collegium',
 'collegius',
 'concilium',
 'conlegium',
 'sodalitas',
 'colegium',
 'sociatio',
 'societas',
 'collega',
 'corpus']

## Organizations - test on a sample data

In [37]:
# sample for testing
EDH_sample = EDH[:1000]
len(EDH_sample)

1000

In [24]:
%%time
EDH_sample["organizations"] = EDH_sample["clean_text_interpretive_word"].apply(extract_occup)
EDH_organizations_list = [el for sublist in EDH_sample["organizations"].tolist() for el in sublist]
len(EDH_organizations_list) 
# v1: 3.03s, 946 found
# once we remove after first match, then only 636
# but once we are sensitive to repeated occurences, we get 742
# when looking for within a lower case inscription, the number increases to 902

CPU times: user 31.8 ms, sys: 597 µs, total: 32.4 ms
Wall time: 66.3 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


52

In [27]:
EDH_sample["organizations_N"] = EDH_sample["organizations"].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
len(EDH_sample[EDH_sample["organizations_N"] > 0])

20

In [29]:
# create sample and export it to gsheet
EDH_occups_sample = EDH_sample[EDH_sample["organizations_N"] > 0][["id", "clean_text_interpretive_word", "organizations", "type_of_inscription_clean"]]
set_with_dataframe(occupations.add_worksheet("EDH_orgs_sample", 1, 1), EDH_occups_sample)

## Organizations - application to full datasets

In [30]:
%%time
EDH["organizations"] = EDH["clean_text_interpretive_word"].apply(extract_occup)

CPU times: user 1.28 s, sys: 54.7 ms, total: 1.33 s
Wall time: 1.33 s


In [31]:
EDH_organizations_list = [el for sublist in EDH["organizations"].tolist() for el in sublist]
print(len(EDH_organizations_list))
print(nltk.FreqDist(EDH_organizations_list).most_common(30))

934
[('collegium', 480), ('corpus', 294), ('collega', 66), ('societas', 30), ('conlegium', 15), ('collegiatus', 13), ('concilium', 12), ('collegius', 10), ('corporatus', 8), ('sodalicium', 5), ('colegium', 1)]


In [32]:
EDH["organizations_N"] = EDH["organizations"].apply(len)

In [33]:
%%time
EDCS["organizations"] = EDCS["clean_text_interpretive_word"].apply(extract_occup)

CPU times: user 5.42 s, sys: 233 ms, total: 5.65 s
Wall time: 5.65 s


In [34]:
EDCS_organizations_list = [el for sublist in EDCS["organizations"].tolist() for el in sublist]
print(len(EDCS_organizations_list))
print(nltk.FreqDist(EDCS_organizations_list).most_common(30))

3184
[('collegium', 1633), ('corpus', 984), ('collega', 208), ('colegium', 84), ('societas', 76), ('corporatus', 62), ('concilium', 42), ('collegius', 34), ('sodalicium', 27), ('collegiatus', 23), ('conlegium', 9), ('sodalitas', 2)]


In [35]:
EDCS["organizations_N"] = EDCS["organizations"].apply(len)

## Organizations - overview

In [38]:
print("EDH - number of organization occurances: " + str(EDH["organizations_N"].sum()))
print("EDH - number of inscriptions with at least one organization mentioned: " + str(len(EDH[EDH["organizations_N"] > 0])))
print("EDH - number of dated inscriptions with at least one organization mentioned: "+ str(len(EDH[(EDH["not_before"].notnull()) & (EDH["organizations_N"] > 0)])))
print("EDH - number of organization occurances in dated inscriptions: " + str(EDH[EDH["not_before"].notnull()]["organizations_N"].sum()))

EDH - number of organization occurances: 934
EDH - number of inscriptions with at least one organization mentioned: 778
EDH - number of dated inscriptions with at least one organization mentioned: 565
EDH - number of organization occurances in dated inscriptions: 676


In [39]:
print("EDCS - number of organizations occurances: " + str(EDCS["organizations_N"].sum()))
print("EDCS - number of inscriptions with at least one organizations mentioned: " + str(len(EDCS[EDCS["organizations_N"] > 0])))
print("EDCS - number of dated inscriptions with at least one organizations mentioned: "+ str(len(EDCS[(EDCS["dating to"].notnull()) & (EDCS["organizations_N"] > 0)])))
print("EDCS - number of organizations occurances in dated inscriptions: " + str(EDCS[EDCS["dating to"].notnull()]["organizations_N"].sum()))

EDCS - number of organizations occurances: 3184
EDCS - number of inscriptions with at least one organizations mentioned: 2418
EDCS - number of dated inscriptions with at least one organizations mentioned: 1611
EDCS - number of organizations occurances in dated inscriptions: 2254


## Saving to Sciencedata

In [40]:
# login to our project folder, owned by my AU account 648597@au.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/


In [41]:
sddk.write_file("SDAM_data/social_diversity/EDH_organizations_2021-02-19.json", EDH, conf)
sddk.write_file("SDAM_data/social_diversity/EDCS_organizations_2021-02-19.json", EDCS, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/social_diversity/EDH_organizations_2021-02-19.json"
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/social_diversity/EDCS_organizations_2021-02-19.json"
