In [1]:
import geopandas as gpd
import pandas as pd
import re
import json

In [126]:
# Read the occupations vocabulary table and preview its first rows.
occupations_csv = "../data/occupations_list_hisco.csv"
occupations_df = pd.read_csv(occupations_csv)
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,,Petrikovits 1981a,9.0,99.0,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [127]:
def update_vocab_nom_sg(row):
    """Return the word to decline for one occupation row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys "Vocab_nom_sg" and "Term".

    Returns
    -------
    The value of "Vocab_nom_sg" when it is a string; otherwise the value of
    "Term" is used as the fallback.
    """
    vocab_nom_sg = row["Vocab_nom_sg"]
    # Anything that is not a string counts as "missing": this covers the float
    # NaN pandas uses for empty CSV cells as well as None / pd.NA, and mirrors
    # the isinstance(..., str) presence checks used for Term2 in this notebook.
    if not isinstance(vocab_nom_sg, str):
        return row["Term"]
    return vocab_nom_sg

# Compute the word-to-decline for every row, store it back, and preview the table.
filled_vocab_nom_sg = occupations_df.apply(update_vocab_nom_sg, axis=1)
occupations_df["Vocab_nom_sg"] = filled_vocab_nom_sg
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,abetarius,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,abietarius,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,accomodator,Petrikovits 1981a,9.0,99.0,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,aceptor,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [128]:
# One tuple per occupation row: (Term, gen_sg, Vocab_nom_sg, Term2).
occup_columns = ["Term", "gen_sg", "Vocab_nom_sg", "Term2"]
occup_tups = list(zip(*(occupations_df[column] for column in occup_columns)))
occup_tups[:5]

[('abetarius', 'i', 'abetarius', nan),
 ('abietarius', 'i', 'abietarius', nan),
 ('acceptor', 'oris', 'acceptor', nan),
 ('accomodator', 'oris', 'accomodator', nan),
 ('aceptor', 'oris', 'aceptor', nan)]

In [129]:
# longest terms first; terms of equal length keep their current relative order
occup_tups = sorted(occup_tups, key=lambda occup: -len(occup[0]))
occup_tups[:5]

[('negotiator artis vestiariae et lintiariae', 'oris', 'negotiator', nan),
 ('negotiator artis cretaria et vestiaria', 'oris', 'negotiator', nan),
 ('negotiator frumentariae et legumenaria', 'oris', 'negotiator', nan),
 ('negotiator suariae et pecuariae', 'oris', 'negotiator', nan),
 ('exactor auri argenti et aeris', 'oris', 'exactor', nan)]

In [130]:
# Read the organizations vocabulary table and preview its first rows.
organizations_csv = "../data/organizations_list.csv"
organizations_df = pd.read_csv(organizations_csv)
organizations_df.head(5)

Unnamed: 0,Term,gen_sg,Vocab_nom_sg,Source,Category,Translation_eng_LewisShort
0,colegium,i,colegium,Waltzing,Organization,variant spelling of collegium
1,collegium,i,collegium,Waltzing,Organization,"the connection of associates, colleagues, etc...."
2,collegiatus,i,collegiatus,Waltzing,Membership,"he who is with one in a society, college, corp..."
3,collegius,i,collegius,Waltzing,Membership,belonging to collegium
4,collega,ae,collega,Petra's addition,Membership,"member of collegium, a partner in office, a co..."


In [131]:
# One tuple per organization row: (Term, gen_sg, Vocab_nom_sg).
organ_columns = ["Term", "gen_sg", "Vocab_nom_sg"]
organ_tups = list(zip(*(organizations_df[column].tolist() for column in organ_columns)))
organ_tups[:5]

[('colegium', 'i', 'colegium'),
 ('collegium', 'i', 'collegium'),
 ('collegiatus', 'i', 'collegiatus'),
 ('collegius', 'i', 'collegius'),
 ('collega', 'ae', 'collega')]

In [132]:
# longest terms first; terms of equal length keep their current relative order
organ_tups = sorted(organ_tups, key=lambda organ: -len(organ[0]))
organ_tups[:5]

[('collegatarius', 'i', 'collegatarius'),
 ('collegiarius', 'i', 'collegiarius'),
 ('collegiatus', 'i', 'collegiatus'),
 ('corporatus', 'i', 'corporatus'),
 ('sodalicium', 'i', 'sodalitium')]

In [133]:
# Manually defined declension paradigms.
# Each value lists the endings of one paradigm: position 0 is the nominative
# singular and position 1 the genitive singular (decline() selects a paradigm
# by comparing exactly these two positions).
# NOTE(review): the order of the remaining endings is not uniform across
# paradigms, so code that picks a case by position (e.g. index 6) only gets the
# genitive plural for the paradigms annotated below — confirm before relying on it.
decs = {
"first_f" : ["a", "ae", "am", "e", "as", "arum", "is"],
"first_gr_es" : ["es",  "ae", "en", "am", "e", "as", "arum", "is", "a"],

"sec_m_us" : ["us", "i", "o", "um", "o", "i", "orum", "is", "os"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat, pl-acc
"sec_n" : ["um", "i", "o", "a", "orum", "is"],
"sec_m_er" : ["er", "eri", "ero", "erum" , "eros", "erorum", "eris"],
"sec_m_r" : ["er", "ri", "ro", "rum" , "ro", "ri", "rorum", "ris", "ros"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat, pl-acc

# plural endings are "orum"/"is"; the earlier "rorum"/"ris" were leftovers of the sec_m_r row
"sec_gr_os" : ["os", "i", "o", "on" , "e", "orum", "is"],
"sec_gr_on" : ["on", "i", "o", "a", "orum", "is"],

"third_m_1" : ["es", "itis", "iti", "ite", "ites", "itibus", "itum"],
"third_m_2" : ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_3" : ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_4" : ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
"third_m_5a" : ["or", "oris", "ori", "orem", "ore", "ores", "orum", "oribus", "ores"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat, pl-acc
"third_m_5b" : ["ur", "uris", "urem", "uri", "ure", "ures", "uribus", "urum"],
"third_m_6" : ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
"third_m_7a" : ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
"third_m_7b" : ["esis", "esis", "esem", "esi", "ese", "eses", "esibus", "esum", "esium"],
"third_m_8" : ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
"third_m_9" : ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
"third_m_10" : ["ans", "antis", "antem", "ante", "antes", "antium", "antum", "antibus"],
"third_m_11" : ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
"third_m_12" : ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],
"third_m_13" : ["us", "oris", "ori", "ore", "ora", "orum", "oribus"],

"third_f_1" : ["as", "adis", "adi", "ade", "ades", "adum", "adium", "adibus"],
"third_f_2" : ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

"third_mix_1" : ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
"third_mix_2" : ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

"fourth_us" : ["us", "us", "ui", "um", "u", "uum", "ibus"]
}

In [134]:
# Order occupations by term length, longest first (ties keep their current order).
occup_tups = sorted(occup_tups, key=lambda occup: -len(occup[0]))
occup_tups[:5]

[('negotiator artis vestiariae et lintiariae', 'oris', 'negotiator', nan),
 ('negotiator artis cretaria et vestiaria', 'oris', 'negotiator', nan),
 ('negotiator frumentariae et legumenaria', 'oris', 'negotiator', nan),
 ('negotiator suariae et pecuariae', 'oris', 'negotiator', nan),
 ('exactor auri argenti et aeris', 'oris', 'exactor', nan)]

In [135]:
# Look up the occupation tuple whose term is "boarius"; the first match is used.
boarius_matches = [occup for occup in occup_tups if occup[0] == "boarius"]
term2_occup_tup = boarius_matches[0]
term2_occup_tup

('boarius', 'i', 'boarius', nan)

In [136]:
# All occupation terms, in the same order as occup_tups.
base_terms = [term for term, _gen_sg, _word_to_dec, _term2 in occup_tups]
base_terms[:10]

['negotiator artis vestiariae et lintiariae',
 'negotiator artis cretaria et vestiaria',
 'negotiator frumentariae et legumenaria',
 'negotiator suariae et pecuariae',
 'exactor auri argenti et aeris',
 'negotiator penoris et vinorum',
 'negotiator salsari leguminari',
 'negotiator artis macellariae',
 'negotiator artis purpurariae',
 'negotiator cellarum vinarium']

In [137]:
# Term2 values that are present (strings) but are not themselves listed as base terms.
problematic = [
    second_term
    for _term, _gen_sg, _word_to_dec, second_term in occup_tups
    if isinstance(second_term, str) and second_term not in base_terms
]
problematic

['sagarius et pellicarius',
 'salsamentarius et vinarius',
 'soliarius baxiarius']

In [138]:
def decline(nom_sg, ending, declensions=None):
    """Generate the inflected forms of a word from its genitive ending.

    A paradigm is selected when its second entry equals ``ending`` and the
    word consists of word characters ending in the paradigm's first entry
    (with at least one character before that ending). Paradigms are tried in
    table order and the first one that fits is used.

    Parameters
    ----------
    nom_sg : str
        The word in the form matching position 0 of the paradigm.
    ending : str
        The ending to compare against position 1 of each paradigm.
    declensions : dict[str, list[str]], optional
        Paradigm table to use; defaults to the module-level ``decs``.

    Returns
    -------
    list[str]
        ``root + e`` for every ending ``e`` of the selected paradigm (duplicates
        are kept, order follows the table). If no paradigm fits, a message is
        printed and ``[nom_sg]`` is returned.
    """
    if declensions is None:
        declensions = decs
    for endings in declensions.values():
        if ending != endings[1]:
            continue
        nom_end = endings[0]
        # Raw string + re.escape: the ending is matched literally and "\w" is
        # no longer an invalid escape sequence in a normal string literal.
        if re.fullmatch(r"\w+" + re.escape(nom_end), nom_sg):
            root = nom_sg[: len(nom_sg) - len(nom_end)]
            return [root + end for end in endings]
    # Explicit fall-through instead of catching an unbound local with a bare except.
    print("declining unsuccesful: " + nom_sg, ending)
    return [nom_sg]


In [144]:
def _combine_with_qualifiers(head_forms, qualifier_forms, joiner=" "):
    """Combine the forms of a head noun with the forms of its qualifier term(s).

    head_forms      -- list of forms of the head noun
    qualifier_forms -- list with one list of forms per qualifier term
    joiner          -- text placed between qualifier terms (" " or " et ")

    Returns, in this order: the position-aligned combinations (head and every
    qualifier taken from the same position), then every head form followed by
    the qualifiers at position 1, then every head form followed by the
    qualifiers at position 6. A fixed position is skipped when any qualifier
    paradigm is too short to have it (previously an IndexError).
    """
    morphs = [
        head + " " + joiner.join(quals)
        for head, *quals in zip(head_forms, *qualifier_forms)
    ]
    for position in (1, 6):
        if all(len(forms) > position for forms in qualifier_forms):
            fixed = joiner.join(forms[position] for forms in qualifier_forms)
            morphs += [head + " " + fixed for head in head_forms]
    return morphs


# Term -> tuple lookup built once; the first occurrence wins, exactly like the
# previous "[tup for tup in occup_tups if ...][0]" linear scans.
occup_by_term = {}
for occup_tup in occup_tups:
    occup_by_term.setdefault(occup_tup[0], occup_tup)

occups_declined = []
for term, gen_ending, word_to_dec, term2 in occup_tups:
    base_form = term
    if re.match(r"\w+\s\w+", term):
        # Multi-word term: only the designated word is inflected inside the phrase.
        all_morphs = [term.replace(str(word_to_dec), morph) for morph in decline(word_to_dec, gen_ending)]
    elif isinstance(term2, str) and term2.split():
        head_forms = decline(term, gen_ending)
        if " et " in term2:
            qualifier_terms, joiner = term2.split(" et "), " et "
        else:
            # a single-word Term2 yields a one-element list here
            qualifier_terms, joiner = term2.split(), " "
        qualifier_terms = qualifier_terms[:2]  # at most two qualifier terms are combined
        qualifier_forms = []
        for qualifier in qualifier_terms:
            if qualifier not in occup_by_term:
                raise KeyError("Term2 component %r of %r is not a known occupation term" % (qualifier, term))
            qualifier_tup = occup_by_term[qualifier]
            # each qualifier is declined with its OWN genitive ending
            qualifier_forms.append(decline(qualifier_tup[0], qualifier_tup[1]))
        all_morphs = _combine_with_qualifiers(head_forms, qualifier_forms, joiner)
        base_form = all_morphs[0]
        if " " in term2:
            print(list(dict.fromkeys(all_morphs)))
    else:
        all_morphs = decline(term, gen_ending)
    # Order-preserving de-duplication keeps the exported JSON stable between runs
    # (list(set(...)) order depends on string hash randomization).
    all_morphs = list(dict.fromkeys(all_morphs))
    occups_declined.append([base_form, all_morphs])

['negotiatorem sagarium et pellicarium', 'negotiator sagarius et pellicarius', 'negotiator sagarii et pellicarii', 'negotiatori sagariorum et pellicariorum', 'negotiatorum sagarii et pellicarii', 'negotiatoribus sagariorum et pellicariorum', 'negotiatores sagarios et pellicarios', 'negotiatore sagariorum et pellicariorum', 'negotiatores sagariorum et pellicariorum', 'negotiatorem sagariorum et pellicariorum', 'negotiatori sagarii et pellicarii', 'negotiatorem sagarii et pellicarii', 'negotiatoribus sagariis et pellicariis', 'negotiatori sagario et pellicario', 'negotiatore sagario et pellicario', 'negotiatore sagarii et pellicarii', 'negotiatoribus sagarii et pellicarii', 'negotiator sagariorum et pellicariorum', 'negotiatoris sagarii et pellicarii', 'negotiatoris sagariorum et pellicariorum', 'negotiatores sagarii et pellicarii', 'negotiatorum sagariorum et pellicariorum']
['negotiatorem salsamentarium et vinarium', 'negotiator salsamentarius et vinarius', 'negotiatoribus salsamentari

In [147]:
# Longest base form first (ties keep their current order); preview the first 20 base forms.
occups_declined = sorted(occups_declined, key=lambda entry: -len(entry[0]))
[entry[0] for entry in occups_declined[:20]]

['negotiator artis vestiariae et lintiariae',
 'negotiator artis cretaria et vestiaria',
 'negotiator frumentariae et legumenaria',
 'negotiator salsamentarius et vinarius',
 'negotiator sagarius et pellicarius',
 'negotiator suariae et pecuariae',
 'exactor auri argenti et aeris',
 'negotiator penoris et vinorum',
 'negotiator salsari leguminari',
 'negotiator artis macellariae',
 'negotiator artis purpurariae',
 'negotiator cellarum vinarium',
 'negotiator artis prossariae',
 'negotiator artis vestiariae',
 'negotiator artis ratiariae',
 'inclusor auri et gemmarum',
 'negotiator artis cretaria',
 'negotiator campi pecuarii',
 'negotiator manticularius',
 'negotiator margaritarius']

In [148]:
# Build [term, forms] pairs for every organization term.
organizations_declined = []
for organ_nom, organ_gen, word_to_dec in organ_tups:
    if re.match(r"\w+\s\w+", organ_nom):
        # Multi-word term: only the designated word is inflected inside the phrase.
        all_morphs = [organ_nom.replace(str(word_to_dec), morph) for morph in decline(word_to_dec, organ_gen)]
    else:
        all_morphs = decline(organ_nom, organ_gen)
    # Drop repeated forms (several cases share an ending) while keeping the
    # paradigm order, consistent with the de-duplicated occupation forms.
    all_morphs = list(dict.fromkeys(all_morphs))
    organizations_declined.append([str(organ_nom), all_morphs])

In [149]:
# Turn the [base form, forms] pairs into lookup dictionaries (a later duplicate key overrides an earlier one).
occups_declined_dict = {base: morphs for base, morphs in occups_declined}
organizations_declined_dict = {base: morphs for base, morphs in organizations_declined}

In [150]:
# Write both lookup dictionaries as indented JSON files.
json_outputs = (
    ("../data/occups_declined_dict.json", occups_declined_dict),
    ("../data/organizations_declined_dict.json", organizations_declined_dict),
)
for json_path, declined_dict in json_outputs:
    with open(json_path, "w") as fp:
        json.dump(declined_dict, fp, indent=4)