In [1]:
!pip install nltk gspread sddk gspread_dataframe kaleido
# REQUIREMENTS
import pandas as pd
import nltk
import requests
import re
import json

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk



In [2]:
# Accessing the gsheet (from which we read our term data) requires a Google Service Account key (json file).
# Mine is stored in my personal space on sciencedata.dk, so it is read from there:
conf = sddk.configure()

# (1) fetch the key file and parse its json content
key_file_url = conf[1] + "ServiceAccountsKey.json"
file_data = conf[0].get(key_file_url).json()
# (2) turn the parsed content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes we actually use
required_scopes = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
scoped_credentials = credentials.with_scopes(required_scopes)
# (4) authenticate the gspread client with the scoped credentials
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheet identified by its url
terms_sheet_url = "https://docs.google.com/spreadsheets/d/1tdtjPCoHY61FSZB0CxAdZXN9xDgl76KU-ObMp4uNG2A/edit?usp=sharing"
terms = gc.open_by_url(terms_sheet_url)

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [13]:
# load the worksheet with the list of terms, keeping only rows that have a Latin term
vocabulary_sheet = terms.worksheet("Vocabulary_Latin")
terms_raw = get_as_dataframe(vocabulary_sheet).loc[lambda frame: frame["Term_LAT"].notna()]
# work with the first eight columns only
terms_df = terms_raw.loc[:, terms_raw.columns[:8]]
terms_df

Unnamed: 0,Evidence,Category,Term_ENG,Term_LAT,Gen_sg,Vocab_nom_sg,LewisShort_dictionary,LewisShort_exact_term
0,Direct,Physical description of features,Road,via,ae,,"vĭa, ae, f.","a highway, road, path, street"
1,Direct,Physical description of features,Bridge,pons,ntis,,"pons, ntis, m.","a bridge across a river, ditch, or marsh, betw..."
2,Direct,Physical description of features,Exchange station,mutatio,onis,,"mūtātĭo, ōnis, f.",a changing or change of horses
3,Direct,Physical description of features,Lodging,mansio,onis,,"mansĭo, ōnis, f.","Night-quarters, lodging-place, inn; also, as a..."
4,Direct,Road maintenance,Administrative unit responsible for the road m...,caput viae,itis,caput,"căpŭt, ĭtis, n.","the origin, source, spring (head), root"
5,Direct,Road maintenance,Milestone,miliarium,i,,"mīlĭārĭum (mill-), ii, n.",a mile-stone (which indicated a distance of a ...
6,Direct,Road maintenance,Milestone,milliarium,i,,"mīlĭārĭum (mill-), ii, n.",a mile-stone (which indicated a distance of a ...
7,Direct,Unit of distance,One mile = 1000 paces,millia passuum,ae,millia,,
8,Indirect,Unit of distance,Pace,passus,us,,"passus, ūs, m.","a pace, as a measure of length, consisting of ..."
9,Indirect,Mobility,Cart,carpentum,i,,"carpentum, i, n.","a two-wheeled, covered carriage, coach, or cha..."


In [16]:
# one (term, genitive ending, word to decline) tuple per row of the terms table
latin_terms = terms_df["Term_LAT"].tolist()
genitive_endings = terms_df["Gen_sg"].tolist()
words_to_decline = terms_df["Vocab_nom_sg"].tolist()
terms_tups = list(zip(latin_terms, genitive_endings, words_to_decline))
terms_tups[:10]

[('via', 'ae', nan),
 ('pons', 'ntis', nan),
 ('mutatio', 'onis', nan),
 ('mansio', 'onis', nan),
 ('caput viae', 'itis', 'caput'),
 ('miliarium', 'i', nan),
 ('milliarium', 'i', nan),
 ('millia passuum', 'ae', 'millia'),
 ('passus', 'us', nan),
 ('carpentum', 'i', nan)]

In [17]:
# reorder from longest to shortest
# order the terms from longest to shortest
# (ascending sort on the negated length; ties keep their original relative order)
terms_tups = sorted(terms_tups, key=lambda entry: -len(entry[0]))
terms_tups[:5]

[('millia passuum', 'ae', 'millia'),
 ('curator viarum', 'oris', 'curator'),
 ('deverticulum', 'i', nan),
 ('diverticulum', 'i', nan),
 ('tabellarium', 'i', nan)]

In [18]:
# Manually defined declension patterns.
# Each list holds the endings of one pattern: index 0 is the nominative singular ending,
# index 1 the genitive singular ending (decline() selects a pattern by these two),
# and the remaining items are the other endings to generate.
decs = {
    "first_f": ["a", "ae", "am", "e", "as", "arum", "is"],
    "first_gr_es": ["es", "ae", "en", "am", "e", "as", "arum", "is", "a"],

    "sec_m_us": ["us", "i", "o", "um", "orum", "os", "is"],
    "sec_n": ["um", "i", "o", "a", "orum", "is"],
    "sec_m_er": ["er", "eri", "ero", "erum", "eros", "erorum", "eris"],
    "sec_m_r": ["er", "ri", "ro", "rum", "ros", "rorum", "ris"],

    # The plural endings here used to be "rorum"/"ris" (copied from sec_m_r); because the
    # nominative ending "os" is stripped from the root, they have to be "orum"/"is".
    "sec_gr_os": ["os", "i", "o", "on", "e", "orum", "is"],
    "sec_gr_on": ["on", "i", "o", "a", "orum", "is"],

    "third_m_1": ["es", "itis", "iti", "ite", "ites", "itibus", "itum"],
    "third_m_2": ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
    "third_m_3": ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
    "third_m_4": ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
    "third_m_5": ["or", "oris", "orem", "ori", "ore", "ores", "oribus", "orum"],
    "third_m_6": ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
    "third_m_7": ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
    "third_m_8": ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
    "third_m_9": ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
    "third_m_10": ["ans", "antis", "antem", "ante", "antes", "antium", "antum", "antibus"],
    "third_m_11": ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
    "third_m_12": ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],

    "third_f_1": ["as", "adis", "adi", "ade", "ades", "adum", "adium", "adibus"],
    "third_f_2": ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

    "third_mix_1": ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
    "third_mix_2": ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

    "fourth_us": ["us", "us", "ui", "um", "u", "uum", "ibus"],
}


In [19]:
# (re-)sort the terms from longest to shortest; the sort is stable,
# so applying it to an already sorted list leaves the order unchanged
terms_tups = sorted(terms_tups, key=lambda item: -len(item[0]))
terms_tups[:5]


[('millia passuum', 'ae', 'millia'),
 ('curator viarum', 'oris', 'curator'),
 ('deverticulum', 'i', nan),
 ('diverticulum', 'i', nan),
 ('tabellarium', 'i', nan)]

In [20]:
def decline(nom_sg, ending, declensions=None):
    """Generate the inflected forms of a noun from its nominative and genitive ending.

    A declension pattern is selected when its genitive singular ending (item 1)
    equals ``ending`` and ``nom_sg`` consists of at least one word character
    followed by the pattern's nominative ending (item 0). The first matching
    pattern, in table order, is used.

    Parameters
    ----------
    nom_sg : str
        Nominative singular form of the word to decline.
    ending : str
        Genitive singular ending of the word.
    declensions : dict, optional
        Mapping of pattern name to list of endings. Defaults to the
        notebook-level ``decs`` table.

    Returns
    -------
    list
        Unique forms (root + each ending of the pattern) in the order the
        endings are listed, so the result is reproducible between runs.
        If no pattern fits, a message is printed and ``[nom_sg]`` is returned.
    """
    if declensions is None:
        declensions = decs
    # a missing value (e.g. NaN from an empty spreadsheet cell) cannot be declined
    if isinstance(nom_sg, str):
        for endings in declensions.values():
            if endings[1] != ending:
                continue
            nom_end = endings[0]
            # escape the ending so it is always matched literally
            if re.fullmatch(r"\w+" + re.escape(nom_end), nom_sg):
                root = nom_sg[: len(nom_sg) - len(nom_end)]
                # dict.fromkeys removes duplicates while keeping first-seen order
                return list(dict.fromkeys(root + end for end in endings))
    print("declining unsuccesful: " + str(nom_sg), ending)
    return [nom_sg]
# Build [term, list of declined forms] pairs for every term.
terms_declined = []
for term, gen_ending, head_word in terms_tups:
    if re.match(r"\w+\s\w+", term):
        # multi-word term: decline only the designated word and substitute it back into the phrase
        if isinstance(head_word, str):
            all_morphs = [term.replace(head_word, morph) for morph in decline(head_word, gen_ending)]
        else:
            # no word to decline was given (empty cell): keep the term as it is
            print("no word to decline given for: " + term)
            all_morphs = [term]
    else:
        all_morphs = decline(term, gen_ending)
    terms_declined.append([str(term), all_morphs])

In [21]:
# map each term to its list of declined forms
terms_declined_dict = {term: forms for term, forms in terms_declined}

In [22]:
# store the declined terms as json (indented by 4 spaces)
with open("../data/terms_declined_dict.json", "w") as fp:
    fp.write(json.dumps(terms_declined_dict, indent=4))

In [23]:
# export to google sheets (comment this cell out to skip the export):
# the declined terms are written to a newly added "terms_decl" worksheet
terms_declined_df = pd.DataFrame(terms_declined)
export_sheet = terms.add_worksheet("terms_decl", 1, 1)
set_with_dataframe(export_sheet, terms_declined_df)