In [1]:
from IPython.display import Javascript
# Builds a JavaScript snippet that asks the notebook front end to restart the session
# with the kernel named 'socdiv_venv'.
# NOTE(review): the Javascript object is neither passed to display() nor the last
# expression of the cell, so it is presumably never rendered/executed by the front end --
# confirm, and wrap it in display(...) if the restart is actually intended.
Javascript("Jupyter.notebook.session.restart({kernel_name: 'socdiv_venv'})")
# Printed unconditionally; this does not verify that the kernel really changed.
print("kernel changed")

kernel changed


In [1]:
# REQUIREMENTS
import pandas as pd
import nltk
import requests
import re
import json

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

In [2]:
# Reading the occupation gsheet requires a Google Service Account key (a JSON file).
# Here the key is kept in a personal space on sciencedata.dk and fetched from there.
conf = sddk.configure()

# Scopes the service-account credentials are restricted to
GSHEET_SCOPES = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
# Spreadsheet holding the occupation data
OCCUPATIONS_SHEET_URL = "https://docs.google.com/spreadsheets/d/1nONTEwp42CVnq3iCiONrFbJedIcYtBV-l4Bil5mU7Eo/edit?usp=sharing"

# (1) fetch the key file and parse its JSON content
key_response = conf[0].get(conf[1] + "ServiceAccountsKey.json")
file_data = key_response.json()
# (2) turn the parsed content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes we actually need
scoped_credentials = credentials.with_scopes(GSHEET_SCOPES)
# (4) authenticate the gspread client with the scoped credentials
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheet identified by its url
occupations = gc.open_by_url(OCCUPATIONS_SHEET_URL)

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Load the "Occupation" worksheet and drop every row that has no Term
occupations_raw = get_as_dataframe(occupations.worksheet("Occupation")).dropna(subset=["Term"])
# Keep only the first six columns for the rest of the notebook
leading_columns = occupations_raw.columns[:6]
occupations_df = occupations_raw[leading_columns]
occupations_df

Unnamed: 0,Term,gen_sg,Vocab_nom_sg,Source,Category,Translation_eng
0,abetarius,i,,Petrikovits 1981a,Building,"a joiner, wood worker"
1,abietarius,i,,Petrikovits 1981a,Building,"a joiner, wood worker"
2,acceptor,oris,acceptor,Waltzing - Rome,Finance,"collector, gold quality checker"
3,accomodator,oris,,Petrikovits 1981a,Unclear meaning,"uncertain, craftsman"
4,aceptor,oris,,Petrikovits 1981a,Finance,"collector, gold quality checker"
...,...,...,...,...,...,...
809,vitor,oris,,Petrikovits 1981a,Household Goods,basket weaver
810,vitrarius,i,,Petrikovits 1981b,Glass-Working,glass manufacturer
811,vitrearius,i,,Petrikovits 1981a,Glass-Working,glass manufacturer
812,vitriarius,i,,Petrikovits 1981a,Glass-Working,glass manufacturer


In [4]:
# One (term, genitive ending, head word to decline) tuple per occupation row
occup_tups = list(
    zip(
        occupations_df["Term"].tolist(),
        occupations_df["gen_sg"].tolist(),
        occupations_df["Vocab_nom_sg"].tolist(),
    )
)
occup_tups[:5]

[('abetarius', 'i', nan),
 ('abietarius', 'i', nan),
 ('acceptor', 'oris', 'acceptor'),
 ('accomodator', 'oris', nan),
 ('aceptor', 'oris', nan)]

In [5]:
# Put the longest terms first (descending length of the term string), then preview the top five
occup_tups = sorted(
    occup_tups,
    key=lambda entry: len(entry[0]),
    reverse=True,
)
occup_tups[:5]

[('exactor auri argenti et aeris', 'oris', 'exactor'),
 ('inclusor auri et gemmarum', 'oris', 'inclusor'),
 ('tesserarius lignarius', 'i', 'tesserarius'),
 ('refector pectinarius', 'oris', 'refector'),
 ('instructor parietum', 'oris', 'instructor')]

In [6]:
# Manually defined declension paradigms.
# Each value is a list of endings: index 0 is the nominative singular ending and
# index 1 the genitive singular ending (decline() uses these two positions to pick a
# paradigm and to cut off the root); the remaining entries are the other case endings
# that get appended to the root.
decs = {
"first_f" : ["a", "ae", "am", "e", "as", "arum", "is"],
"first_gr_es" : ["es",  "ae", "en", "am", "e", "as", "arum", "is", "a"],

"sec_m_us" : ["us", "i", "o", "um", "orum", "os", "is"],
"sec_n" : ["um", "i", "o", "a", "orum", "is"],
"sec_m_er" : ["er", "eri", "ero", "erum" , "eros", "erorum", "eris"],
"sec_m_r" : ["er", "ri", "ro", "rum" , "ros", "rorum", "ris"],

# plural endings are "orum"/"is" here; "rorum"/"ris" belong only to the "sec_m_r" pattern,
# where the "r" is part of the ending rather than of the root
"sec_gr_os" : ["os", "i", "o", "on" , "e", "orum", "is"],
"sec_gr_on" : ["on", "i", "o", "a", "orum", "is"],

# includes the accusative singular "item" (e.g. miles -> militem)
"third_m_1" : ["es", "itis", "iti", "item", "ite", "ites", "itibus", "itum"],
"third_m_2" : ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_3" : ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_4" : ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
"third_m_5a" : ["or", "oris", "orem", "ori", "ore", "ores", "oribus", "orum"],
"third_m_5b" : ["ur", "uris", "urem", "uri", "ure", "ures", "uribus", "urum"],
"third_m_6" : ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
"third_m_7a" : ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
"third_m_7b" : ["esis", "esis", "esem", "esi", "ese", "eses", "esibus", "esum", "esium"],
"third_m_8" : ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
"third_m_9" : ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
# includes the dative singular "anti"
"third_m_10" : ["ans", "antis", "anti", "antem", "ante", "antes", "antium", "antum", "antibus"],
"third_m_11" : ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
"third_m_12" : ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],
"third_m_13" : ["us", "oris", "ori", "ore", "ora", "orum", "oribus"],

# includes the accusative singular "adem"
"third_f_1" : ["as", "adis", "adi", "adem", "ade", "ades", "adum", "adium", "adibus"],
"third_f_2" : ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

"third_mix_1" : ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
"third_mix_2" : ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

"fourth_us" : ["us", "us", "ui", "um", "u", "uum", "ibus"]
}

In [7]:
# `occup_tups` has already been ordered longest-to-shortest by the identical sort in the
# earlier cell; repeating that stable sort would not change anything, so this cell only
# previews the result.
occup_tups[:5]

[('exactor auri argenti et aeris', 'oris', 'exactor'),
 ('inclusor auri et gemmarum', 'oris', 'inclusor'),
 ('tesserarius lignarius', 'i', 'tesserarius'),
 ('refector pectinarius', 'oris', 'refector'),
 ('instructor parietum', 'oris', 'instructor')]

In [8]:
def decline(nom_sg, ending, declensions=None):
    """Generate the inflected forms of a noun from its nominative and genitive ending.

    A paradigm is applicable when its genitive singular ending (index 1) equals
    ``ending`` and ``nom_sg`` ends with its nominative singular ending (index 0),
    preceded by at least one word character. The first applicable paradigm, in
    table order, is used: the nominative ending is cut off to obtain the root and
    every ending of the paradigm is appended to that root.

    Parameters
    ----------
    nom_sg : str
        Nominative singular form of the word to decline.
    ending : str
        Genitive singular ending of the word (e.g. ``"i"`` or ``"oris"``).
    declensions : dict, optional
        Mapping ``name -> list of endings`` with the nominative ending at index 0
        and the genitive ending at index 1. Defaults to the module-level ``decs``.

    Returns
    -------
    list
        The unique inflected forms in paradigm order (nominative first). If no
        paradigm applies, or ``nom_sg`` is not a string (e.g. NaN from an empty
        spreadsheet cell), a message is printed and ``[nom_sg]`` is returned.
    """
    table = decs if declensions is None else declensions
    if isinstance(nom_sg, str):
        for endings in table.values():
            if endings[1] != ending:
                continue
            # the whole word must be "<word characters><nominative ending>"
            hit = re.fullmatch(r"(\w+)" + re.escape(endings[0]), nom_sg)
            if hit:
                root = hit.group(1)
                # dict.fromkeys drops duplicates but, unlike set(), keeps a stable order,
                # so the result is reproducible between runs
                return list(dict.fromkeys(root + end for end in endings))
    print("declining unsuccesful: " + str(nom_sg), ending)
    return [nom_sg]
# Build one [term, list of inflected forms] pair per occupation.
# For multi-word terms only the head word (third tuple element) is declined and each of
# its forms is substituted back into the full term; single-word terms are declined directly.
occups_declined = []
for term, gen_ending, head_word in occup_tups:
    if re.match(r"\w+\s\w+", term):
        if isinstance(head_word, str):
            all_morphs = [term.replace(head_word, morph) for morph in decline(head_word, gen_ending)]
        else:
            # no head word given in the sheet (empty cell -> NaN): nothing to decline,
            # keep the term itself instead of failing
            print("declining unsuccesful: " + term, gen_ending)
            all_morphs = [term]
    else:
        all_morphs = decline(term, gen_ending)
    occups_declined.append([str(term), all_morphs])

In [9]:
# Map every term to its list of inflected forms
occups_declined_dict = {term: forms for term, forms in occups_declined}

In [10]:
# Persist the term -> forms mapping as pretty-printed JSON
with open("../data/occups_declined_dict.json", "w") as out_file:
    out_file.write(json.dumps(occups_declined_dict, indent=4))

In [31]:
# Export the declined forms to the "occupations_decl" worksheet of the occupations spreadsheet.
# NOTE: this cell writes to the Google Sheet every time it is run.
occups_declined_df = pd.DataFrame(occups_declined)
try:
    # reuse the worksheet on re-runs; adding a second worksheet with the same title fails
    occupations_decl_ws = occupations.worksheet("occupations_decl")
except gspread.exceptions.WorksheetNotFound:
    occupations_decl_ws = occupations.add_worksheet("occupations_decl", 1, 1)
# resize=True makes the worksheet match the dataframe, so no stale rows survive a re-export
set_with_dataframe(occupations_decl_ws, occups_declined_df, resize=True)