#  Selection of inscriptions containing road-related terms

**Research idea & domain expertise:** Petra Hermankova, Aarhus University

**Script & technical solution:** Vojtech Kase, Aarhus University

Source: https://github.com/sdam-au/social_diversity/

In [1]:
!pip install nltk gspread sddk gspread_dataframe kaleido
import pandas as pd
import numpy as np
import nltk
import requests
import re
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
tab_colors_list = list(mcolors.TABLEAU_COLORS.keys())
import matplotlib.lines as mlines
import seaborn as sns

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk



In [2]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:
conf = sddk.configure()

# (1) read the file and parse its content
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)
# (5) establish connection with spreadsheets specified by their url
terms = gc.open_by_url("https://docs.google.com/spreadsheets/d/1tdtjPCoHY61FSZB0CxAdZXN9xDgl76KU-ObMp4uNG2A/edit#gid=0")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# read edh dataset
publicfolder = "b6b6afdb969d378b70929e86e58ad975"
EDH = sddk.read_file("EDH_text_cleaned_2021-01-21.json", "df", publicfolder)
EDH.head(5)

reading file located in a public folder


Unnamed: 0,responsible_individual,type_of_inscription,letter_size,not_after,literature,work_status,height,diplomatic_text,people,depth,...,clean_text_conservative,clean_text_interpretive_word,clean_text_interpretive_sentence,findspot,year_of_find,present_location,religion,geography,social_economic_legal_history,military
0,Feraudi,epitaph,3.2-2 cm,130,"AE 1983, 0192.; M. Annecchino, Puteoli 4/5, 19...",provisional,33 cm,D M / NONIAE P F OPTATAE / ET C IVLIO ARTEMONI...,"[{'name': 'Noniae P.f. Optatae', 'cognomen': '...",2.7 cm,...,D M Noniae P f Optatae et C Iulio Artemoni par...,Dis Manibus Noniae Publi filiae Optatae et Cai...,Dis Manibus Noniae Publi filiae Optatae et Cai...,,,,,,,
1,Feraudi,epitaph,4 cm,200,"AE 1983, 0080. (A); A. Ferrua, RAL 36, 1981, 1...",no image,28 cm,C SEXTIVS PARIS / QVI VIXIT / ANNIS LXX,"[{'nomen': 'Sextius', 'praenomen': 'C.', 'pers...",,...,C Sextius Paris qui vixit annis LXX,Caius Sextius Paris qui vixit annis LXX,Caius Sextius Paris qui vixit annis LXX,"Via Nomentana, S. Alessandro, Kirche",1937,,,,,
2,Feraudi,honorific inscription,4.5-3 cm,170,"AE 1983, 0518. (B); J. González, ZPE 52, 1983,...",provisional,(37) cm,[ ]VMMIO [ ] / [ ]ISENNA[ ] / [ ] XV[ ] / [ ] / [,"[{'nomen': 'Mummius+', 'gender': 'male', 'prae...",(12) cm,...,ummio isenna Xv,Publio Mummio Publi filio Galeria Sisennae Rut...,Publio Mummio Publi filio Galeria Sisennae Rut...,,before 1975,"Sevilla, Privatbesitz",,,,
3,Gräf,votive inscription,2.5 cm,200,"AE 1983, 0533. (B); A.U. Stylow, Gerión 1, 198...",checked with photo,(39) cm,[ ]AVS[ ]LLA / M PORCI NIGRI SER / DOMINAE VEN...,[{'name': '[---?]AV(?)S(?)[---]L(?)L(?)A M. Po...,18 cm,...,AVSLLA M Porci Nigri ser dominae Veneri aram p...,AVS LLA Marci Porci Nigri serva dominae Veneri...,AVS LLA Marci Porci Nigri serva dominae Veneri...,,before 1979,"Carcabuey, Grupo Escolar",names of pagan deities,,,
4,Feraudi,epitaph,,200,"AE 1983, 0078. (B); A. Ferrua, RAL 36, 1981, 1...",no image,,[ ] L SVCCESSVS / [ ] L L IRENAEVS / [ ] C L T...,"[{'person_id': '1', 'name': '[---]l. Successus...",,...,l Successus L l Irenaeus C l Tyches unt renti f,libertus Successus Luci libertus Irenaeus Cai ...,libertus Successus Luci libertus Irenaeus Cai ...,Via Cupa (ehem. Vigna Nardi),,,,,,


In [4]:
# read edcs dataset
publicfolder = "1f5f56d09903fe259c0906add8b3a55e"
EDCS = sddk.read_file("EDCS_text_cleaned_2021-01-21.json", "df", publicfolder)
EDCS.head(5)

reading file located in a public folder


Unnamed: 0,EDCS-ID,publication,province,place,notes_dating,notes_references,notes_comments,inscription,inscription_stripped14,clean_text_interpretive_word,Links,language,dating from,dating to,status,Latitude,Longitude,photo,Material,Comment
0,EDCS-31400030,"CIL 03, 12297",Achaia,?,,,,Leius,Leius,Leius,,,,,,,,,,
1,EDCS-24700151,"CIL 01, 02650 (p 1097) = IG-05-01, 00741 = ILL...",Achaia,Afesou,,,,// D(ecimi) Leivei D(ecimi) Leivei salve,// D(ecimi) Leivei D(ecimi) Leivei salve,Decimi Leivei Decimi Leivei salve,http://db.edcs.eu/epigr/partner.php?s_language...,GR,,,,,,,,
2,EDCS-24900077,"CIL 01, 00746 (p 944) = D 00867 = ILLRP 00374 ...",Achaia,Agia Triada / Merbaka / Midea,,,,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Quinto Caecilio Cai filio Metello imperatori I...,http://db.edcs.eu/epigr/partner.php?s_language...,,-68.0,-68.0,officium/professio; ordo senatorius; tituli ho...,37.6381128,22.8052991,http://db.edcs.eu/epigr/bilder.php?bilder.php?...,,
3,EDCS-03700724,"ZPE-108-159 = Thesprotia 00001 = AE 1993, 0140...",Achaia,Agios Athanasios / Photike,,,,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et Piissimo Caesari domino nostro G...,http://db.edcs.eu/epigr/partner.php?s_language...,,309.0,313.0,Augusti/Augustae; litterae erasae; ordo equest...,39.4512182,20.7667673,http://db.edcs.eu/epigr/bilder.php?bilder.php?...,,
4,EDCS-55701593,"AE 2009, 01286a",Achaia,Agios Donatos / Photike,,,,Cn(aeus) Atei(us),Cn(aeus) Atei(us),Cnaeus Ateius,,,,,sigilla impressa; tituli fabricationis,39.4759759,20.5069076,,opus figlinae,


In [5]:
# read declined terms
with open("../data/terms_declined_dict.json", "r") as fp:
    terms_declined_dict = json.load(fp)

In [6]:
# check that our terms are properly arranged (from the longest...)
list(terms_declined_dict.keys())[:20]

['millia passuum',
 'curator viarum',
 'deverticulum',
 'diverticulum',
 'tabellarium',
 'itinerarium',
 'vehiculatio',
 'caput viae',
 'milliarium',
 'angiportus',
 'tabelarium',
 'miliarium',
 'carpentum',
 'vehiculum',
 'incessus',
 'iumentum',
 'compitum',
 'terminus',
 'mutatio',
 'gressus']

#  Function to extract terms

In [7]:
def extract_terms_v1(inscription_text):
    terms_found = []
    for terms in terms_declined_dict.keys():
        for terms_morph in terms_declined_dict[terms]:
            try:
                terms_morph_N = inscription_text.lower().count(terms_morph)
                if terms_morph_N > 0:
                    terms_found.extend([terms] * terms_morph_N)
                    inscription_text = inscription_text.replace(terms_morph, "")
                
            except: pass
    return terms_found

In [8]:
def extract_terms_v2(inscription_text):
    terms_found = []
    try:
        inscription_text = inscription_text.lower()
    except:
        inscription_text = ""
    for terms in terms_declined_dict.keys():
        for terms_morph in terms_declined_dict[terms]:
            try:
                if terms_morph in inscription_text: # first check it this way, otherwise skip
                    terms_morph_N = len(re.findall("(\W|^)" + terms_morph + "(\W|$)", inscription_text))
                    terms_found.extend([terms] * terms_morph_N)
                    inscription_text = inscription_text.replace(terms_morph, "")
            except: pass
    return terms_found

In [29]:
def extract_terms(inscription_text):
    terms_found = []
    if not isinstance(inscription_text, str): # if not valid string
        inscription_text = ""
    for terms in terms_declined_dict.keys():
        for terms_morph in terms_declined_dict[terms]:
            try:
                if terms_morph in inscription_text: # first check it this way, otherwise skip
                    terms_morph_N = len(re.findall("(\W|^)" + terms_morph + "(\W|$)", inscription_text))
                    terms_found.extend([terms] * terms_morph_N)
                    inscription_text = inscription_text.replace(terms_morph, "")
            except: pass
    return terms_found

In [32]:
extract_terms("fdfsdf. miliaria,")

['miliarium']

# Test on sample

In [33]:
# sample for testing
EDH_sample = EDH[:1000]
len(EDH_sample)

1000

In [34]:
%%time
EDH_sample["terms"] = EDH_sample["clean_text_interpretive_word"].apply(extract_terms)
EDH_terms_list = [el for sublist in EDH_sample["terms"].tolist() for el in sublist]
len(EDH_terms_list) 

CPU times: user 55.1 ms, sys: 4.31 ms, total: 59.4 ms
Wall time: 56.7 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


16

In [35]:
EDH_sample["terms_N"] = EDH_sample["terms"].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
len(EDH_sample[EDH_sample["terms_N"] > 0])

12

In [37]:
# create sample and export it to gsheet
EDH_terms_sample = EDH_sample[EDH_sample["terms_N"] > 0][["id", "clean_text_interpretive_word", "terms", "type_of_inscription_clean"]]
set_with_dataframe(terms.add_worksheet("EDH_terms_sample", 1, 1), EDH_terms_sample)

APIError: {'code': 400, 'message': 'Invalid requests[0].addSheet: A sheet with the name "EDH_terms_sample" already exists. Please enter another name.', 'status': 'INVALID_ARGUMENT'}

# Application to the whole dataset (EDH)

In [38]:
%%time
EDH["terms"] = EDH["clean_text_interpretive_word"].apply(extract_terms)

CPU times: user 3.05 s, sys: 37.4 ms, total: 3.09 s
Wall time: 3.08 s


In [39]:
EDH_terms_list = [el for sublist in EDH["terms"].tolist() for el in sublist]
print(len(EDH_terms_list))
print(nltk.FreqDist(EDH_terms_list).most_common(30))

1593
[('via', 332), ('miliarium', 288), ('milliarium', 282), ('vicus', 211), ('viator', 120), ('iter', 57), ('arcus', 54), ('leuga', 53), ('terminus', 42), ('statio', 30), ('porta', 22), ('curator viarum', 20), ('gradus', 16), ('cursus', 13), ('iumentum', 12), ('pons', 8), ('compitum', 7), ('clivus', 5), ('millia passuum', 5), ('vehiculum', 4), ('scala', 3), ('semita', 3), ('actus', 2), ('mansio', 2), ('tabellarium', 1), ('deverticulum', 1)]


In [40]:
EDH["terms_N"] = EDH["terms"].apply(len)

# Application to the whole dataset (EDCS)

In [41]:
%%time
EDCS["terms"] = EDCS["clean_text_interpretive_word"].apply(extract_terms)

CPU times: user 12 s, sys: 32.8 ms, total: 12.1 s
Wall time: 12.1 s


In [42]:
EDCS_terms_list = [el for sublist in EDCS["terms"].tolist() for el in sublist]
print(len(EDCS_terms_list))
print(nltk.FreqDist(EDCS_terms_list).most_common(30))

4388
[('via', 1285), ('miliarium', 1062), ('vicus', 572), ('viator', 381), ('statio', 215), ('iter', 178), ('arcus', 142), ('terminus', 110), ('porta', 67), ('cursus', 59), ('curator viarum', 56), ('gradus', 52), ('pons', 30), ('compitum', 25), ('actus', 25), ('clivus', 21), ('iumentum', 20), ('scala', 17), ('leuga', 17), ('tabellarium', 13), ('semita', 13), ('vehiculum', 7), ('gressus', 5), ('carpentum', 5), ('passus', 4), ('mansio', 3), ('deverticulum', 2), ('angiportus', 1), ('incessus', 1)]


In [43]:
EDCS["terms_N"] = EDCS["terms"].apply(len)

# Terms overview

In [44]:
print("EDH - number of term occurances: " + str(EDH["terms_N"].sum()))
print("EDH - number of inscriptions with at least one term mentioned: " + str(len(EDH[EDH["terms_N"] > 0])))
print("EDH - number of dated inscriptions with at least one term mentioned: "+ str(len(EDH[(EDH["not_before"].notnull()) & (EDH["terms_N"] > 0)])))
print("EDH - number of term occurances in dated inscriptions: " + str(EDH[EDH["not_before"].notnull()]["terms_N"].sum()))

EDH - number of term occurances: 1593
EDH - number of inscriptions with at least one term mentioned: 1169
EDH - number of dated inscriptions with at least one term mentioned: 815
EDH - number of term occurances in dated inscriptions: 1092


In [45]:
print("EDCS - number of term occurances: " + str(EDCS["terms_N"].sum()))
print("EDCS - number of inscriptions with at least one term mentioned: " + str(len(EDCS[EDCS["terms_N"] > 0])))
print("EDCS - number of dated inscriptions with at least one term mentioned: "+ str(len(EDCS[(EDCS["dating to"].notnull()) & (EDCS["terms_N"] > 0)])))
print("EDCS - number of term occurances in dated inscriptions: " + str(EDCS[EDCS["dating to"].notnull()]["terms_N"].sum()))

EDCS - number of term occurances: 4388
EDCS - number of inscriptions with at least one term mentioned: 3416
EDCS - number of dated inscriptions with at least one term mentioned: 2137
EDCS - number of term occurances in dated inscriptions: 2943


# Saving to Sciencedata

In [46]:
# login to our project folder, owned by my AU account 648597@au.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its ordinary user
endpoint variable has been configured to: https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/


In [47]:
sddk.write_file("SDAM_data/epigraphic_roads/EDH_terms_2021-02-26.json", EDH, conf)
sddk.write_file("SDAM_data/epigraphic_roads/EDCS_terms_2021-02-26.json", EDCS, conf)

A file with the same name ("EDH_terms_2021-02-26.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/epigraphic_roads/EDH_terms_2021-02-26.json"
A file with the same name ("EDCS_terms_2021-02-26.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/sharingout/648597%40au.dk/SDAM_root/SDAM_data/epigraphic_roads/EDCS_terms_2021-02-26.json"
