In [24]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import pickle
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import json
from concurrent.futures import ThreadPoolExecutor
import os
import nltk
import unicodedata

# Prepare AGILe for usage (skip for the second time)

In [2]:
%%bash
git clone https://github.com/agile-gronlp/agile

Cloning into 'agile'...


In [3]:
%%bash
echo '/scripts/agile/' >> ../.gitignore

In [4]:
%%bash
pip install -r agile/requirements.txt

Collecting cltk>=1.0
  Using cached cltk-1.1.5-py3-none-any.whl (844 kB)
Collecting stanza>=1.2
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting Levenshtein
  Using cached Levenshtein-0.20.5-cp39-cp39-macosx_11_0_arm64.whl (95 kB)
Collecting boltons<22.0.0,>=21.0.0
  Using cached boltons-21.0.0-py2.py3-none-any.whl (193 kB)
Collecting scikit-learn<2.0.0,>=1.0.2
  Using cached scikit_learn-1.1.2-cp39-cp39-macosx_12_0_arm64.whl (7.7 MB)
Collecting gitpython<4.0,>=3.0
  Using cached GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting nltk<4.0,>=3.7
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting greek-accentuation<2.0.0,>=1.2.0
  Using cached greek_accentuation-1.2.0-py2.py3-none-any.whl (6.8 kB)
Collecting stringcase<2.0,>=1.2
  Using cached stringcase-1.2.0-py3-none-any.whl
Collecting python-Levenshtein<0.13.0,>=0.12.0
  Using cached python_Levenshtein-0.12.2-cp39-cp39-macosx_11_0_arm64.whl
Collecting fasttext<0.10.0,>=0.9.1
  Using cached fasttext-0.9.2-c

In [5]:
%%bash
rm -rf agile/.git agile/.gitignore

In [6]:
import stanza
stanza.download(lang='grc')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-10-04 11:58:32 INFO: Downloading default packages for language: grc (Ancient_Greek) ...


Downloading https://huggingface.co/stanfordnlp/stanza-grc/resolve/v1.4.1/models/default.zip:   0%|          | …

2022-10-04 11:58:37 INFO: Finished downloading models and saved to /Users/kasev/stanza_resources.


In [7]:
%%bash
cp agile/grc_agile_lemmatizer.pt grc_agile_lemmatizer.pt
cp agile/lexicon.p lexicon.p

# Testing lemmatization

In [18]:
from scripts.agile import agile

In [19]:
doc = agile.lemmatize("αἲξ θύεται τάδε μὴ ἐσφέρεν ἐς τὸ τέμενος τοῦ Ἀπόλλωνος τοῦ Οὐλίου εἱμάτιον")
for sent in doc.sentences:
    for word in sent.words:
        print(f'word: {word.text + " ":15}lemma: {word.lemma}')

word: αἲξ            lemma: αἴξ
word: θύεται         lemma: τίθημι
word: τάδε           lemma: ὅδε
word: μὴ             lemma: μή
word: ἐσφέρεν        lemma: τηρέω
word: ἐς             lemma: εἰς
word: τὸ             lemma: τε
word: τέμενος        lemma: τέμενος
word: τοῦ            lemma: ποῦ
word: Ἀπόλλωνος      lemma: Ἀπολλωνία
word: τοῦ            lemma: ποῦ
word: Οὐλίου         lemma: ἥλιος
word: εἱμάτιον       lemma: εἱμάτιον


In [20]:
# a more staightforward approach to get a list of lemmata:
[word["lemma"] for sent in doc.to_dict() for word in sent]

['αἴξ',
 'τίθημι',
 'ὅδε',
 'μή',
 'τηρέω',
 'εἰς',
 'τε',
 'τέμενος',
 'ποῦ',
 'Ἀπολλωνία',
 'ποῦ',
 'ἥλιος',
 'εἱμάτιον']

# Loading Perseus dict for filtering

In [3]:
morpheus_by_lemma = json.loads(requests.get("https://sciencedata.dk/public/8fe7d59de1eafe5f8eaebc0044534606/morpheus_by_lemma.json").content)

In [None]:
# first try to read the morpheus dict from our local data folder:
try:
    with open("../data/large_data/morpheus_dict.json") as json_file:
        morpheus_dict = json.load(json_file)
# if not available:
except:
    # read it from a public folder on sciencedata
    publicfolder = "8fe7d59de1eafe5f8eaebc0044534606"
    morpheus_dict = json.loads(requests.get("https://sciencedata.dk/public/" + publicfolder + "/morpheus_dict.json").content)
    # save it locally for future usage
    with open("../data/large_data/morpheus_dict.json", "w") as outfile:
        json.dump(morpheus_dict, outfile)

In [38]:
morpheus_lemmata_remove = []
for key in morpheus_dict.keys():
    values_list = morpheus_dict[key]
    for val_dict in values_list:
        if val_dict["l"] == key:
            if val_dict["p"][0] not in ["n", "a", "v"]:
                morpheus_lemmata_remove.append(key)

In [49]:
GIST = gpd.read_file("../data/large_data/GIST_v0-1.geojson", driver="GeoJSON")

In [91]:
def lemmatize_with_agile(raw_text):
    try:
        doc = agile.lemmatize(raw_text)
        lemmata = [word["lemma"] for sent in doc.to_dict() for word in sent]
    except:
        try:
            doc = agile.lemmatize(raw_text, use_lexicon=False)
            lemmata = [word["lemma"] for sent in doc.to_dict() for word in sent]
        except:
            lemmata = []
    return lemmata

In [72]:
lemmatize_with_agile("αἲξ θύεται τάδε μὴ ἐσφέρεν ἐς τὸ τέμενος τοῦ Ἀπόλλωνος τοῦ Οὐλίου εἱμάτιον")

['αἴξ',
 'τίθημι',
 'ὅδε',
 'μή',
 'τηρέω',
 'εἰς',
 'τε',
 'τέμενος',
 'ποῦ',
 'Ἀπολλωνία',
 'ποῦ',
 'ἥλιος',
 'εἱμάτιον']

In [11]:
%%time
# testing with samples of different length
GIST_sample = GIST.sample(100)
GIST_sample["lemmata_agile"] = GIST_sample["clean_text_interpretive_word"].apply(lemmatize_with_agile)

CPU times: user 32.6 s, sys: 1.69 s, total: 34.3 s
Wall time: 49.5 s


In [73]:
def lemmata_by_phiid(phiid):
    lemmata = lemmatize_with_agile(GIST[GIST["PHI_ID"]==phiid]["clean_text_interpretive_word"].tolist()[0])
    id_with_lemmata = " ".join([str(phiid)] + lemmata)
    return id_with_lemmata

In [89]:
lemmata_by_phiid(327714)

'327714    '

In [13]:
%%time
failed = []
f = open("../data/large_data/lemmata_full_test.txt", "w", encoding="utf-8")
line = 0
for n in range(20):
    id = str(GIST.iloc[n]["PHI_ID"])
    lemmata = lemmatize_with_agile(GIST.iloc[n]["clean_text_interpretive_word"])
    line_data = " ".join([str(id)] + lemmata) + "\n"
    f.writelines(line_data)
    line += 1

CPU times: user 17.7 s, sys: 445 ms, total: 18.1 s
Wall time: 20.1 s


In [102]:
GIST[GIST["PHI_ID"]==141705]

Unnamed: 0,PHI_ID,reference,metadata,lines,text_raw,text_iphi,clean_text_conservative,clean_text_interpretive_word,lemmata,raw_date,not_before,not_after,date_type,region_main_id,region_sub_id,TM_ID,EDH_ID,EDCS_ID,EDR_ID,LUPA_ID,ISic_ID,IG_ID,atticinscriptions_ID,Attalus_ID,TMgeo_ID,x_long,y_lat,TMgeo_name,geometry
36006,141705,"IG XIV\n2393,71b",Sikelia [Rhodos] — Eryx (Erice)?,1.0,Ἀμύντα.,,Ἀμύντα,Ἀμύντα,{'data': ['ἀμύντα']},Eryx (Erice)?,,,{'data': ['unknown']},,,{'data': ['493104']},{'data': []},{'data': ['39501016']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': ['22300']},12.583607,38.036517,Eryx,POINT (12.58361 38.03652)


In [101]:
lemmatize_with_agile(GIST[GIST["PHI_ID"]==141705]["clean_text_interpretive_word"].tolist()[0])

['']

# Main application of AGILe is happening outside of the notebook - by running lemmatization.py
but here we develop and test the functions...

# Loading the lemmatized data files and mapping them on the dataset

In [28]:
def normalize_encoding(string):
    return unicodedata.normalize("NFC", string)

In [29]:
def preprocess_lemmata_txt(lemmata_full_str):
    failed_ids = []
    lemmata_data_split =  [normalize_encoding(inscr_data).split() for inscr_data in lemmata_full_str.split("\n")]
    #lemmata_data_tups = [(int(inscr_data[0]), inscr_data[1:]) for inscr_data in lemmata_data_split]
    lemmata_data_tups = []
    for inscr_data in lemmata_data_split:
        if len(inscr_data) > 1:
            lemmata_data_tups.append((int(inscr_data[0]), inscr_data[1:]))
        else:
            try:
                failed_ids.append(int(inscr_data[0]))
            except:
                failed_ids.append(inscr_data)
    lemmata_data_dict = dict(lemmata_data_tups)
    return lemmata_data_dict, failed_ids

In [33]:
filenames = os.listdir("../data/large_data/lemmata_files")
filenames = [f for f in filenames if ".txt" in f]
filenames

['lemmata_full_inverse2.txt',
 'lemmata_full_missing4.txt',
 'lemmata_full_160000-220000.txt',
 'lemmata_full_inverse1.txt',
 'lemmata_full_missing1.txt',
 'lemmata_full_missing3.txt',
 'lemmata_full_missing2.txt',
 'lemmata_full_100000-160000.txt',
 'lemmata_full_50000-100000.txt',
 'lemmata_full_0-50000.txt']

In [34]:
failed_ids = []
lemmata_full_merged = {}
for fname in filenames:
    lemmata_full_str = open("../data/large_data/lemmata_files/" + fname, "r", encoding="utf-8").read()
    file_dict, file_failed_ids = preprocess_lemmata_txt(lemmata_full_str)
    failed_ids.extend(file_failed_ids)
    lemmata_full_merged.update(file_dict)
len(lemmata_full_merged)

207734

In [35]:
failed_ids =list(set([el for el in failed_ids if isinstance(el, int)]))
len(failed_ids)

10130

In [36]:
GIST_failed = GIST.set_index("PHI_ID").loc[failed_ids]

In [37]:
GIST_failed_tups = list(zip(GIST_failed.reset_index()["PHI_ID"], GIST_failed["clean_text_interpretive_word"]))
GIST_failed_list = ([str(tup[0]) + " " + tup[1] for tup in GIST_failed_tups])
GIST_failed_list[:10]

['1 ἔδοχσεν το͂ι δέμοι τὸς ἐ Σαλαμῖνι κλερόχος οἰκε͂ν ἐᾶ Σαλαμῖνι λεν Ἀθένεσι τελε͂ν καὶ στρατεύεσθαι τὰ δ’ ἐ Σαλαμῖνι μὲ μισθο͂ν ἐὰ μὲ οἰκ ο μισθόμενο ἐὰν δὲ μισθο͂ι ἀποτίνεν τὸ μισθόμενον καὶ τὸ μισθο͂ντα ℎεκάτερον ἐς δεμόσιον ἐσπράτεν δὲ τὸν ἄρχοντα ἐὰν δὲ μέ εὐθύνεσθαι τὰ δὲ ℎόπλα παρέχεσθαι αὐτὸς τριάκοντα δραχμο͂ν ℎοπλισμένον δὲ τὸν ἄρχοντα τὰ ℎόπλα κρίνεν ἐπὶ τε͂ς βολε͂ς c',
 '327692 ',
 '327707 ',
 '327710 Χαιρίοˉν',
 '327714 ΜΙΝΩΡ ΛΙΩΝΟΣ ΜΑΘΕΥΕ ΜΕ',
 '327715 Μελίτωνος',
 '327721 ΛΑ',
 '327726 ',
 '327732 ΜΧΙΕΙΟΝ',
 '294973 Ἀνδρομάχου']

In [38]:
len(GIST_failed_list)

10130

In [108]:
# save for future usage
f = open("../data/large_data/GIST_failed_list.txt", "w", encoding="utf-8")
f.write("\n".join(GIST_failed_list))

247700

In [39]:
lemmata_full_str = open("../data/large_data/GIST_failed_list.txt", "r", encoding="utf-8").read()
file_dict, file_failed_ids = preprocess_lemmata_txt(lemmata_full_str)
lemmata_full_merged.update(file_dict)

In [40]:
len(lemmata_full_str.split("\n"))

10130

In [41]:
len(file_dict)

8507

In [42]:
len(lemmata_full_merged)

216240

In [43]:
lemmata_merged = []
for k,v in lemmata_full_merged.items():
    lemmata_merged.extend(v)

In [44]:
lemmata_merged.count("θεός")

22647

In [45]:
lemmata_merged.count("δικαιοσύνη")

732

In [46]:
lemmata_merged.count("Ζεύς")

7695

In [47]:
nltk.FreqDist(lemmata_merged).most_common()[:100]

[('ὁ', 309253),
 ('καὶ', 224130),
 ('τε', 87962),
 ('τίν', 82148),
 ('τὶς', 78041),
 ('ποῦ', 62771),
 ('ἐν', 50116),
 ('δὲ', 48576),
 ('γῶν', 44365),
 ('αὐτός', 43036),
 ('τι', 40802),
 ('ἐπί', 40184),
 ('εἰς', 29028),
 ('δῆμος', 28470),
 ('ὅς', 27421),
 ('εἰμί', 26486),
 ('οὐ', 25814),
 ('θεός', 22647),
 ('τοῖρ', 22549),
 ('πᾶς', 22348),
 ('ἐκ', 20571),
 ('κατά', 19256),
 ('οὗτος', 17017),
 ('ἔτος', 16608),
 ('ἄλλος', 16175),
 ('βουλή', 15949),
 ('μή', 13487),
 ('ἱερός', 13363),
 ('ἄρχων', 13106),
 ('ὡς', 12509),
 ('πρός', 12433),
 ('κα', 11924),
 ('τῷ', 11663),
 ('ἀπό', 11616),
 ('ἀγαθός', 11380),
 ('ἑαυτοῦ', 11039),
 ('παρά', 10985),
 ('βράχoς', 10979),
 ('ἔχω', 10584),
 ('υἱός', 10462),
 ('μείς', 10298),
 ('γυνή', 10173),
 ('τις', 10144),
 ('χαίρω', 9936),
 ('τῇ', 9852),
 ('χάρις', 9659),
 ('ὅδε', 9154),
 ('ἀνήρ', 9085),
 ('ὑπέρ', 8820),
 ('περί', 8609),
 ('ἀνατίθημι', 8345),
 ('ἱερεύς', 7732),
 ('ἄν', 7728),
 ('εἰ', 7713),
 ('Ἀπολλωνία', 7704),
 ('Ζεύς', 7695),
 ('μέν', 7446),
 ('

In [48]:
len(GIST)

217863

In [50]:
def add_agile_lemmata(phiid):
    try:
        lemmata = lemmata_full_merged[phiid]
    except:
        lemmata = []
    return {"data" : lemmata}
GIST["lemmata"] = GIST["PHI_ID"].apply(add_agile_lemmata)

In [51]:
GIST.head(5)

Unnamed: 0,PHI_ID,reference,metadata,lines,text_raw,text_iphi,clean_text_conservative,clean_text_interpretive_word,lemmata,raw_date,not_before,not_after,date_type,region_main_id,region_sub_id,TM_ID,EDH_ID,EDCS_ID,EDR_ID,LUPA_ID,ISic_ID,IG_ID,atticinscriptions_ID,Attalus_ID,TMgeo_ID,x_long,y_lat,TMgeo_name,geometry
0,1,IG I³\n1,Att. — Ath.: Akr. — stoich. 35 — c. 510-500 a....,12.0,ἔδοχσεν το͂ι δέμοι· τ̣[ὸς ἐ Σ]αλαμ̣[ῖνι κλερόχ...,εδοχσεν τοι δεμοι τ[ος ε σ]αλαμ[ινι κλεροχ]ος ...,ἔδοχσεν το͂ι δέμοι ταλαμος οἰκε͂ν ἐᾶ Σαλαμῖνι ...,ἔδοχσεν το͂ι δέμοι τὸς ἐ Σαλαμῖνι κλερόχος οἰκ...,"{'data': ['ἔδοχσεν', 'το͂ι', 'δέμοι', 'τὸς', '...",c. 510-500 a.,-511.0,-499.0,"{'data': ['range', 'phase', 'ca']}",1701.0,1700.0,{'data': ['786251']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': ['IG I³ 1']},{'data': ['1672']},{'data': []},{'data': ['364']},23.723985,37.972747,Athenai,POINT (23.72399 37.97275)
1,2,IG I³\n2,Att. — non-stoich. — c. 500 a.,14.0,[․․8-9․․․]ν̣ βολ — — — — — — — — — —\n[․6-7․․]...,[--------]ν βολ ---------- [------] α εκον ---...,ν βολ α ℎεκον σιον γνοσθε͂ι δὲ ν ἀτεχνος μὲ π ...,ν βολ α ℎεκον σιον γνοσθε͂ι δὲ ν ἀτεχνος μὲ π ...,"{'data': ['ν', 'βοῦς', 'α', 'θύος', 'γνώστης',...",c. 500 a.,-505.0,-495.0,"{'data': ['exact', 'phase', 'ca']}",1701.0,1700.0,{'data': ['786252']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': ['IG I³ 2']},{'data': ['1707']},{'data': []},{'data': ['373']},,,,
2,3,IG I³\n3,Att. — stoich. 21 — 490-480 a.,13.0,[․]αρ[․․․․]ι ℎερακλειο[․․5․․]\n[․]αρ̣ο#⁷[․] τι...,[-]αρ[----]ι ερακλειο[-----] [-]αρο [-] τιθενα...,αρι ℎερακλειο αρο τιθέναι τὸς ἀέτας τριάκοντα ...,αρι ℎερακλειο αρο τιθέναι τὸς ἀθλοθέτας τριάκο...,"{'data': ['Ἤρ', 'εἰσπλέω', 'Ἀθῆναι', 'τὶς', 'ἀ...",490-480 a.,-490.0,-480.0,{'data': ['range']},1701.0,1700.0,{'data': ['786253']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': ['IG I³ 3']},{'data': ['ii-1b']},{'data': []},{'data': ['373']},,,,
3,4,IG I³\n4,Att. — stoich. 38 — 485/4 a.,56.0,[․․․․․․․․․․․․․․․․․․38․․․․․․․․․․․․․․․․․․]\n[․․․...,[--------------------------------------] [----...,δέ τις αν ἒ φρορὰν μ ντέκοντα δχμὰς τ ας ℎες π...,ἐὰν δέ τις αν ἒ φρορὰν μὲ πεντέκοντα δραχμὰς τ...,"{'data': ['ἐάν', 'τις', 'ἐν', 'ἒ', 'φρουρά', '...",485/4 a.,-485.0,-484.0,{'data': ['range']},1701.0,1700.0,{'data': ['786254']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},"{'data': ['IG I³ 4(A)', 'IG I³ 4(B)']}",{'data': ['1692']},{'data': []},{'data': ['364']},23.723985,37.972747,Athenai,POINT (23.72399 37.97275)
4,5,IG I³\n5,Att. — c. 500 a.,6.0,[ἔδοχσε]ν [⋮ τε͂ι βολε͂ι] ⋮ καὶ [τ]ο͂ι δέμοι ⋮...,[εδοχσε]ν [ τει βολει] και [τ]οι δεμοι οτε παρ...,ν καὶ ο͂ι δέμοι ℎότε Παραιβάτες λεια θν τὸς ℎι...,ἔδοχσεν τε͂ι βολε͂ι καὶ το͂ι δέμοι ℎότε Παραιβ...,"{'data': ['δοκέω', 'βουλή', 'καὶ', 'ὁ', 'δῆμος...",c. 500 a.,-505.0,-495.0,"{'data': ['exact', 'phase', 'ca']}",1701.0,1700.0,{'data': ['786255']},{'data': []},{'data': []},{'data': []},{'data': []},{'data': []},{'data': ['IG I³ 5']},{'data': ['1284']},{'data': []},{'data': ['10345']},23.541498,38.044135,Eleusis,POINT (23.54150 38.04414)


In [52]:
GIST.to_file("../data/large_data/GIST_v0-2.geojson", driver="GeoJSON")

In [53]:
GIST.to_parquet("../data/large_data/GIST_v0-2.parquet")