In [5]:
!pip install nltk sddk gspread gspread_dataframe kaleido

# REQUIREMENTS
import pandas as pd
import nltk
import requests
import re
import json

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

Collecting kaleido
  Downloading kaleido-0.1.0-py2.py3-none-manylinux1_x86_64.whl (74.6 MB)
[K     |████████████████████████████████| 74.6 MB 106 kB/s eta 0:00:01
Installing collected packages: kaleido
Successfully installed kaleido-0.1.0


In [10]:
# Reading the organization data from Google Sheets requires a Google Service Account key (json file).
# Here the key is stored on sciencedata.dk, so it is fetched through the sddk configuration.
conf = sddk.configure()
session, endpoint = conf[0], conf[1]

# (1) download the key file and parse its json content
key_response = session.get(endpoint + "ServiceAccountsKey.json")
file_data = key_response.json()
# (2) build a credentials object from the parsed key
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes used below
gsheet_scopes = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
scoped_credentials = credentials.with_scopes(gsheet_scopes)
# (4) authenticate the gspread client with the scoped credentials
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheet identified by its url
organizations_url = "https://docs.google.com/spreadsheets/d/1nONTEwp42CVnq3iCiONrFbJedIcYtBV-l4Bil5mU7Eo/edit?usp=sharing"
organizations = gc.open_by_url(organizations_url)

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ········
endpoint variable has been configured to: https://sciencedata.dk/files/


In [29]:
# Load the "Organization" worksheet, keep only the rows that have a Term,
# and restrict the frame to its first six columns.
organization_sheet = organizations.worksheet("Organization")
organizations_raw = get_as_dataframe(organization_sheet)
has_term = organizations_raw['Term'].notnull()
organizations_raw = organizations_raw[has_term]
leading_columns = organizations_raw.columns[:6]
organizations_df = organizations_raw[leading_columns]
organizations_df

Unnamed: 0,Term,gen_sg,Vocab_nom_sg,Source,Category,Translation_eng_LewisShort
0,colegium,i,colegium,Waltzing,Organization,variant spelling of collegium
1,collegium,i,collegium,Waltzing,Organization,"the connection of associates, colleagues, etc...."
2,collegiatus,i,collegiatus,Waltzing,Membership,"he who is with one in a society, college, corp..."
3,collegius,i,collegius,Waltzing,Membership,belonging to collegium
4,collega,ae,collega,Petra's addition,Membership,"member of collegium, a partner in office, a co..."
5,concilium,i,concilium,Waltzing,Organization,"a collection of people, an association, gather..."
6,conlegium,i,conlegium,Waltzing,Organization,variant spelling of collegium
7,collegatarius,i,collegatarius,LewisShort,Membership,a person to whom is bequeathed a legacy in com...
8,collegiarius,i,collegiarius,LewisShort,Membership,collegial
9,corpus,oris,corpus,Waltzing,Organization,an organisation


In [12]:
# one (Term, gen_sg, Vocab_nom_sg) tuple per row of organizations_df
organ_tups = list(
    zip(
        organizations_df["Term"].tolist(),
        organizations_df["gen_sg"].tolist(),
        organizations_df["Vocab_nom_sg"].tolist(),
    )
)
organ_tups[:5]

[('colegium', 'i', 'colegium'),
 ('collegium', 'i', 'collegium'),
 ('collegiatus', 'i', 'collegiatus'),
 ('collegius', 'i', 'collegius'),
 ('collega', 'ae', 'collega')]

In [13]:
# put the tuples with the longest term first, the shortest last
organ_tups = sorted(
    organ_tups,
    key=lambda organ_tup: len(organ_tup[0]),
    reverse=True,
)
organ_tups[:5]

[('collegatarius', 'i', 'collegatarius'),
 ('collegiarius', 'i', 'collegiarius'),
 ('collegiatus', 'i', 'collegiatus'),
 ('corporatus', 'i', 'corporatus'),
 ('sodalicium', 'i', 'sodalitium')]

In [28]:
# Manually defined declension patterns.
# Each value is a list of endings in which index 0 is the nominative singular
# ending and index 1 the genitive singular ending; `decline` relies on these two
# positions and on the insertion order of the keys (the first pattern whose
# genitive and nominative endings both fit is used). The remaining entries are
# the other case endings, listed once each.
decs = {
"first_f" : ["a", "ae", "am", "e", "as", "arum", "is"],
"first_gr_es" : ["es",  "ae", "en", "am", "e", "as", "arum", "is", "a"],

"sec_m_us" : ["us", "i", "o", "um", "orum", "os", "is"],
"sec_n" : ["um", "i", "o", "a", "orum", "is"],
"sec_m_er" : ["er", "eri", "ero", "erum" , "eros", "erorum", "eris"],
"sec_m_r" : ["er", "ri", "ro", "rum" , "ros", "rorum", "ris"],

# plural endings are "orum"/"is" (the former "rorum"/"ris" were copied from sec_m_r)
"sec_gr_os" : ["os", "i", "o", "on" , "e", "orum", "is"],
"sec_gr_on" : ["on", "i", "o", "a", "orum", "is"],

# accusative "item" added (e.g. miles -> militem)
"third_m_1" : ["es", "itis", "iti", "item", "ite", "ites", "itibus", "itum"],
"third_m_2" : ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_3" : ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_4" : ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
"third_m_5" : ["or", "oris", "orem", "ori", "ore", "ores", "oribus", "orum"],
"third_m_6" : ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
"third_m_7" : ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
"third_m_8" : ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
"third_m_9" : ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
# dative "anti" added, consistent with "nti" in third_mix_2
"third_m_10" : ["ans", "antis", "anti", "antem", "ante", "antes", "antium", "antum", "antibus"],
"third_m_11" : ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
"third_m_12" : ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],
"third_m_13" : ["us", "oris", "ori", "ore", "ora", "orum", "oribus"],

# accusative "adem" added, consistent with "atem" in third_f_2
"third_f_1" : ["as", "adis", "adi", "adem", "ade", "ades", "adum", "adium", "adibus"],
"third_f_2" : ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

"third_mix_1" : ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
"third_mix_2" : ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

"fourth_us" : ["us", "us", "ui", "um", "u", "uum", "ibus"]
}



In [23]:
# sort again by term length, longest first (a no-op when the list is already in that order)
by_term_length = lambda organ_tup: len(organ_tup[0])
organ_tups = sorted(organ_tups, key=by_term_length, reverse=True)
organ_tups[:5]

[('collegatarius', 'i', 'collegatarius'),
 ('collegiarius', 'i', 'collegiarius'),
 ('collegiatus', 'i', 'collegiatus'),
 ('corporatus', 'i', 'corporatus'),
 ('sodalicium', 'i', 'sodalitium')]

In [24]:
def decline(nom_sg, ending, declensions=None):
    """Generate the inflected forms of a single-word noun.

    Candidate patterns are those whose genitive singular ending (index 1)
    equals ``ending``; among them, the first one (in table order) whose
    nominative singular ending (index 0) terminates ``nom_sg`` is used.

    Parameters
    ----------
    nom_sg : str
        Nominative singular form of the word to decline.
    ending : str
        Genitive singular ending used to select the declension pattern.
    declensions : dict, optional
        Mapping ``pattern name -> list of endings``. Defaults to the
        module-level ``decs`` table.

    Returns
    -------
    list of str
        The unique forms ``root + ending``, in the order of the pattern's
        endings (deterministic across runs). If no pattern fits, a message is
        printed and ``[nom_sg]`` is returned.
    """
    if declensions is None:
        declensions = decs
    for endings in declensions.values():
        if ending != endings[1]:
            continue
        nom_end = endings[0]
        # the word must be word characters only, with a non-empty root before the ending
        if re.fullmatch(r"\w+" + re.escape(nom_end), nom_sg):
            root = nom_sg[:len(nom_sg) - len(nom_end)]
            # dict.fromkeys removes duplicates while preserving order
            return list(dict.fromkeys(root + end for end in endings))
    print("declining unsuccesful: " + nom_sg, ending)
    return [nom_sg]
# Build one [term, list of inflected forms] pair per organization tuple.
organizations_declined = []
for organ_tup in organ_tups:
    term, gen_ending, word_to_dec = organ_tup
    # raw string: "\w" / "\s" are invalid escape sequences in a normal string literal
    if re.match(r"\w+\s\w+", term):
        # multi-word term: decline only the designated word and substitute each
        # of its forms back into the full term
        all_morphs = [term.replace(str(word_to_dec), morph) for morph in decline(word_to_dec, gen_ending)]
    else:
        all_morphs = decline(term, gen_ending)
    organizations_declined.append([str(term), all_morphs])

In [25]:
# map each term to its list of inflected forms
organizations_declined_dict = {term: morphs for term, morphs in organizations_declined}

In [27]:
# save the term -> forms mapping as indented json
with open("../data/organizations_declined_dict.json", "w") as json_file:
    json_file.write(json.dumps(organizations_declined_dict, indent=4))

In [26]:
# Export the declined forms to the "organizations_decl" worksheet of the spreadsheet.
# The worksheet is reused when it already exists, so re-running this cell does not
# fail on a duplicate sheet title; resize=True makes the sheet match the frame's shape.
organizations_declined_df = pd.DataFrame(organizations_declined)
try:
    organizations_decl_ws = organizations.worksheet("organizations_decl")
except gspread.exceptions.WorksheetNotFound:
    organizations_decl_ws = organizations.add_worksheet("organizations_decl", 1, 1)
set_with_dataframe(organizations_decl_ws, organizations_declined_df, resize=True)