# Requirements

In [10]:
# First, set up the kernel of the project's virtual environment ('socdiv_venv').
# NOTE(review): the Javascript object below is constructed but it is neither the
# last expression of the cell nor passed to display(), so it looks like the restart
# request is never actually sent to the front end — confirm.
# NOTE(review): `Jupyter.notebook` is presumably only defined in the classic
# Notebook front end — verify before relying on this cell elsewhere.
from IPython.display import Javascript
Javascript("Jupyter.notebook.session.restart({kernel_name: 'socdiv_venv'})") # JS snippet asking the session to restart with the 'socdiv_venv' kernel
# Printed unconditionally: this line does not check that the kernel actually changed.
print("kernel successfully changed")

kernel successfully changed


In [21]:
import pandas as pd
import re
import json

In [10]:
# Load the occupations list (path is relative to the notebook's working directory)
# and preview its first five rows.
occupations_df = pd.read_csv(filepath_or_buffer="../data/occupations_list_hisco.csv")
occupations_df.head()

Unnamed: 0,Term,gen_sg,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,Petrikovits 1981a,9.0,99.0,Unclear meaning,,"uncertain, craftsman"
4,aceptor,oris,,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [11]:
# One (term, genitive singular ending, word to decline) tuple per row of the occupations table.
occup_tups = list(
    zip(*(occupations_df[column].tolist() for column in ("Term", "gen_sg", "Vocab_nom_sg")))
)
occup_tups[:5]

[('abetarius', 'i', nan),
 ('abietarius', 'i', nan),
 ('acceptor', 'oris', 'acceptor'),
 ('accomodator', 'oris', nan),
 ('aceptor', 'oris', nan)]

In [12]:
# Order the tuples by the length of their term, longest term first.
# Sorting on the negated length is stable, so terms of equal length keep their relative order.
occup_tups = sorted(occup_tups, key=lambda entry: -len(entry[0]))
occup_tups[:5]

[('exactor auri argenti et aeris', 'oris', 'exactor'),
 ('inclusor auri et gemmarum', 'oris', 'inclusor'),
 ('tesserarius lignarius', 'i', 'tesserarius'),
 ('refector pectinarius', 'oris', 'refector'),
 ('instructor parietum', 'oris', 'instructor')]

In [13]:
# Load the organizations list (path is relative to the notebook's working directory)
# and preview its first five rows.
organizations_df = pd.read_csv(filepath_or_buffer="../data/organizations_list.csv")
organizations_df.head()

Unnamed: 0,Term,gen_sg,Vocab_nom_sg,Source,Category,Translation_eng_LewisShort
0,colegium,i,colegium,Waltzing,Organization,variant spelling of collegium
1,collegium,i,collegium,Waltzing,Organization,"the connection of associates, colleagues, etc...."
2,collegiatus,i,collegiatus,Waltzing,Membership,"he who is with one in a society, college, corp..."
3,collegius,i,collegius,Waltzing,Membership,belonging to collegium
4,collega,ae,collega,Petra's addition,Membership,"member of collegium, a partner in office, a co..."


In [14]:
# One (term, genitive singular ending, word to decline) tuple per row of the organizations table.
organ_tups = list(
    zip(*(organizations_df[column].tolist() for column in ("Term", "gen_sg", "Vocab_nom_sg")))
)
organ_tups[:5]

[('colegium', 'i', 'colegium'),
 ('collegium', 'i', 'collegium'),
 ('collegiatus', 'i', 'collegiatus'),
 ('collegius', 'i', 'collegius'),
 ('collega', 'ae', 'collega')]

In [15]:
# Order the tuples by the length of their term, longest term first.
# Sorting on the negated length is stable, so terms of equal length keep their relative order.
organ_tups = sorted(organ_tups, key=lambda entry: -len(entry[0]))
organ_tups[:5]

[('collegatarius', 'i', 'collegatarius'),
 ('collegiarius', 'i', 'collegiarius'),
 ('collegiatus', 'i', 'collegiatus'),
 ('corporatus', 'i', 'corporatus'),
 ('sodalicium', 'i', 'sodalitium')]

In [16]:
# manually define declensions
# Each key is a declension label and each value the list of endings of that paradigm.
# Index 0 holds the nominative singular ending and index 1 the genitive singular
# ending; `decline` uses exactly these two positions to pick a paradigm. The order of
# the remaining endings carries no meaning.
decs = {
"first_f" : ["a", "ae", "am", "e", "as", "arum", "is"],
"first_gr_es" : ["es",  "ae", "en", "am", "e", "as", "arum", "is", "a"],

"sec_m_us" : ["us", "i", "o", "um", "orum", "os", "is"],
"sec_n" : ["um", "i", "o", "a", "orum", "is"],
"sec_m_er" : ["er", "eri", "ero", "erum" , "eros", "erorum", "eris"],
"sec_m_r" : ["er", "ri", "ro", "rum" , "ros", "rorum", "ris"],

# plural endings are "orum"/"is": the stem of an -os noun has no extra "r"
# (the former "rorum"/"ris" belonged to the "sec_m_r" paradigm above)
"sec_gr_os" : ["os", "i", "o", "on" , "e", "orum", "is"],
"sec_gr_on" : ["on", "i", "o", "a", "orum", "is"],

# includes the accusative singular "item", like the other masculine third-declension paradigms
"third_m_1" : ["es", "itis", "iti", "item", "ite", "ites", "itibus", "itum"],
"third_m_2" : ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_3" : ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_4" : ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
"third_m_5a" : ["or", "oris", "orem", "ori", "ore", "ores", "oribus", "orum"],
"third_m_5b" : ["ur", "uris", "urem", "uri", "ure", "ures", "uribus", "urum"],
"third_m_6" : ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
"third_m_7a" : ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
"third_m_7b" : ["esis", "esis", "esem", "esi", "ese", "eses", "esibus", "esum", "esium"],
"third_m_8" : ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
"third_m_9" : ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
# includes the dative singular "anti" (cf. "nti" in "third_mix_2")
"third_m_10" : ["ans", "antis", "anti", "antem", "ante", "antes", "antium", "antum", "antibus"],
"third_m_11" : ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
"third_m_12" : ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],
"third_m_13" : ["us", "oris", "ori", "ore", "ora", "orum", "oribus"],

# includes the accusative singular "adem" (cf. "atem" in "third_f_2")
"third_f_1" : ["as", "adis", "adi", "adem", "ade", "ades", "adum", "adium", "adibus"],
"third_f_2" : ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

"third_mix_1" : ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
"third_mix_2" : ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

"fourth_us" : ["us", "us", "ui", "um", "u", "uum", "ibus"]
}

In [17]:
# Sort by term length, longest first. This repeats the ordering already applied to
# occup_tups after it was built, so re-running it leaves the list unchanged.
occup_tups = sorted(occup_tups, key=lambda entry: -len(entry[0]))
occup_tups[:5]

[('exactor auri argenti et aeris', 'oris', 'exactor'),
 ('inclusor auri et gemmarum', 'oris', 'inclusor'),
 ('tesserarius lignarius', 'i', 'tesserarius'),
 ('refector pectinarius', 'oris', 'refector'),
 ('instructor parietum', 'oris', 'instructor')]

In [18]:
def decline(nom_sg, ending, declensions=None):
    """Generate the inflected forms of a single word.

    Parameters
    ----------
    nom_sg : str
        Nominative singular form. It must consist of word characters only
        (a phrase containing spaces never matches a paradigm).
    ending : str
        Genitive singular ending; only paradigms whose second entry equals it
        are considered.
    declensions : dict, optional
        Mapping of paradigm label -> list of endings, with the nominative
        singular ending at index 0 and the genitive singular ending at index 1.
        Defaults to the notebook-level ``decs`` table.

    Returns
    -------
    list
        The stem combined with every ending of the first matching paradigm,
        duplicates removed, in the order the endings are listed (so the result
        is the same on every run). If no paradigm matches, or ``nom_sg`` is not
        a string (e.g. NaN from a missing CSV value), a message is printed and
        ``[nom_sg]`` is returned.
    """
    if declensions is None:
        declensions = decs
    if isinstance(nom_sg, str):
        for endings in declensions.values():
            # shortlist paradigms by their genitive singular ending
            if ending != endings[1]:
                continue
            nom_end = endings[0]
            # the word must be: at least one stem character + the nominative ending
            stem_match = re.match(r"(\w+)" + re.escape(nom_end) + r"$", nom_sg)
            if stem_match:
                root = stem_match.group(1)
                # dict.fromkeys drops repeated forms while keeping a stable order
                return list(dict.fromkeys(root + end for end in endings))
    print("declining unsuccesful: " + str(nom_sg), ending)
    return [nom_sg]
def _declined_entries(term_tups):
    """Pair every term with the list of its inflected forms.

    Each element of `term_tups` is a (term, genitive singular ending, headword)
    tuple. A term made of several words is inflected by declining only its
    headword and substituting each form back into the phrase; a single-word
    term is declined directly. Returns a list of [term, forms] pairs in the
    order of the input.
    """
    entries = []
    for term, gen_ending, headword in term_tups:
        if re.match(r"\w+\s\w+", term):
            if isinstance(headword, str):
                phrase_forms = (term.replace(headword, morph) for morph in decline(headword, gen_ending))
                # drop repeated phrases (e.g. when the headword does not occur in the term)
                forms = list(dict.fromkeys(phrase_forms))
            else:
                # no headword recorded for this phrase (e.g. NaN from the CSV):
                # keep the term as its only form instead of failing
                print("no headword to decline for: " + term)
                forms = [term]
        else:
            forms = decline(term, gen_ending)
        entries.append([str(term), forms])
    return entries


occups_declined = _declined_entries(occup_tups)
organizations_declined = _declined_entries(organ_tups)

In [19]:
# Turn the [term, forms] pairs into term -> forms lookups.
occups_declined_dict = {term: forms for term, forms in occups_declined}
organizations_declined_dict = {term: forms for term, forms in organizations_declined}

In [23]:
# Persist both lookups as indented JSON files in the data directory.
with open("../data/occups_declined_dict.json", mode="w") as occups_file:
    json.dump(occups_declined_dict, occups_file, indent=4)

with open("../data/organizations_declined_dict.json", mode="w") as organizations_file:
    json.dump(organizations_declined_dict, organizations_file, indent=4)