In [65]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import pickle
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import json
from concurrent.futures import ThreadPoolExecutor
import os
import nltk
import unicodedata

# Prepare AGILe for usage (skip for the second time)

In [2]:
%%bash
git clone https://github.com/agile-gronlp/agile

Cloning into 'agile'...


In [3]:
%%bash
echo '/scripts/agile/' >> ../.gitignore

In [4]:
%%bash
pip install -r agile/requirements.txt

Collecting cltk>=1.0
  Using cached cltk-1.1.5-py3-none-any.whl (844 kB)
Collecting stanza>=1.2
  Using cached stanza-1.4.2-py3-none-any.whl (691 kB)
Collecting Levenshtein
  Using cached Levenshtein-0.20.5-cp39-cp39-macosx_11_0_arm64.whl (95 kB)
Collecting boltons<22.0.0,>=21.0.0
  Using cached boltons-21.0.0-py2.py3-none-any.whl (193 kB)
Collecting scikit-learn<2.0.0,>=1.0.2
  Using cached scikit_learn-1.1.2-cp39-cp39-macosx_12_0_arm64.whl (7.7 MB)
Collecting gitpython<4.0,>=3.0
  Using cached GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting nltk<4.0,>=3.7
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting greek-accentuation<2.0.0,>=1.2.0
  Using cached greek_accentuation-1.2.0-py2.py3-none-any.whl (6.8 kB)
Collecting stringcase<2.0,>=1.2
  Using cached stringcase-1.2.0-py3-none-any.whl
Collecting python-Levenshtein<0.13.0,>=0.12.0
  Using cached python_Levenshtein-0.12.2-cp39-cp39-macosx_11_0_arm64.whl
Collecting fasttext<0.10.0,>=0.9.1
  Using cached fasttext-0.9.2-c

In [5]:
%%bash
rm -rf agile/.git agile/.gitignore

In [6]:
import stanza
stanza.download(lang='grc')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-10-04 11:58:32 INFO: Downloading default packages for language: grc (Ancient_Greek) ...


Downloading https://huggingface.co/stanfordnlp/stanza-grc/resolve/v1.4.1/models/default.zip:   0%|          | …

2022-10-04 11:58:37 INFO: Finished downloading models and saved to /Users/kasev/stanza_resources.


In [7]:
%%bash
cp agile/grc_agile_lemmatizer.pt grc_agile_lemmatizer.pt
cp agile/lexicon.p lexicon.p

# Testing lemmatization

In [2]:
from scripts.agile import agile



In [3]:
doc = agile.lemmatize("αἲξ θύεται τάδε μὴ ἐσφέρεν ἐς τὸ τέμενος τοῦ Ἀπόλλωνος τοῦ Οὐλίου εἱμάτιον")
for sent in doc.sentences:
    for word in sent.words:
        print(f'word: {word.text + " ":15}lemma: {word.lemma}')

word: αἲξ            lemma: αἴξ
word: θύεται         lemma: τίθημι
word: τάδε           lemma: ὅδε
word: μὴ             lemma: μή
word: ἐσφέρεν        lemma: τηρέω
word: ἐς             lemma: εἰς
word: τὸ             lemma: τε
word: τέμενος        lemma: τέμενος
word: τοῦ            lemma: ποῦ
word: Ἀπόλλωνος      lemma: Ἀπολλωνία
word: τοῦ            lemma: ποῦ
word: Οὐλίου         lemma: ἥλιος
word: εἱμάτιον       lemma: εἱμάτιον


In [4]:
# a more staightforward approach to get a list of lemmata:
[word["lemma"] for sent in doc.to_dict() for word in sent]

['αἴξ',
 'τίθημι',
 'ὅδε',
 'μή',
 'τηρέω',
 'εἰς',
 'τε',
 'τέμενος',
 'ποῦ',
 'Ἀπολλωνία',
 'ποῦ',
 'ἥλιος',
 'εἱμάτιον']

# Loading Perseus dict for filtering

In [3]:
morpheus_by_lemma = json.loads(requests.get("https://sciencedata.dk/public/8fe7d59de1eafe5f8eaebc0044534606/morpheus_by_lemma.json").content)

In [None]:
# first try to read the morpheus dict from our local data folder:
try:
    with open("../data/large_data/morpheus_dict.json") as json_file:
        morpheus_dict = json.load(json_file)
# if not available:
except:
    # read it from a public folder on sciencedata
    publicfolder = "8fe7d59de1eafe5f8eaebc0044534606"
    morpheus_dict = json.loads(requests.get("https://sciencedata.dk/public/" + publicfolder + "/morpheus_dict.json").content)
    # save it locally for future usage
    with open("../data/large_data/morpheus_dict.json", "w") as outfile:
        json.dump(morpheus_dict, outfile)

In [38]:
morpheus_lemmata_remove = []
for key in morpheus_dict.keys():
    values_list = morpheus_dict[key]
    for val_dict in values_list:
        if val_dict["l"] == key:
            if val_dict["p"][0] not in ["n", "a", "v"]:
                morpheus_lemmata_remove.append(key)

# Testing on a subset of GIST

In [5]:
GIST = gpd.read_file("../data/large_data/GIST_v0-1.geojson", driver="GeoJSON")

In [6]:
def lemmatize_with_agile(raw_text):
    try:
        doc = agile.lemmatize(raw_text)
        lemmata = [word["lemma"] for sent in doc.to_dict() for word in sent]
    except:
        lemmata = []
    return lemmata

In [7]:
lemmatize_with_agile("αἲξ θύεται τάδε μὴ ἐσφέρεν ἐς τὸ τέμενος τοῦ Ἀπόλλωνος τοῦ Οὐλίου εἱμάτιον")

['αἴξ',
 'τίθημι',
 'ὅδε',
 'μή',
 'τηρέω',
 'εἰς',
 'τε',
 'τέμενος',
 'ποῦ',
 'Ἀπολλωνία',
 'ποῦ',
 'ἥλιος',
 'εἱμάτιον']

In [11]:
%%time
# testing with samples of different length
GIST_sample = GIST.sample(100)
GIST_sample["lemmata_agile"] = GIST_sample["clean_text_interpretive_word"].apply(lemmatize_with_agile)

CPU times: user 32.6 s, sys: 1.69 s, total: 34.3 s
Wall time: 49.5 s


In [8]:
def lemmata_by_phiid(phiid):
    lemmata = lemmatize_with_agile(GIST[GIST["PHI_ID"]==phiid]["clean_text_interpretive_word"].tolist()[0])
    id_with_lemmata = " ".join([str(phiid)] + lemmata)
    return id_with_lemmata

In [13]:
%%time
failed = []
f = open("../data/large_data/lemmata_full_test.txt", "w", encoding="utf-8")
line = 0
for n in range(20):
    id = str(GIST.iloc[n]["PHI_ID"])
    lemmata = lemmatize_with_agile(GIST.iloc[n]["clean_text_interpretive_word"])
    line_data = " ".join([str(id)] + lemmata) + "\n"
    f.writelines(line_data)
    line += 1

CPU times: user 17.7 s, sys: 445 ms, total: 18.1 s
Wall time: 20.1 s


# Main application of AGILe is happening outside of the notebook - by running lemmatization.py

In [69]:
def normalize_encoding(string):
    return unicodedata.normalize("NFC", string)

In [70]:
def preprocess_lemmata_txt(lemmata_full_str):
    lemmata_data_split =  [normalize_encoding(inscr_data).split() for inscr_data in lemmata_full_str.split("\n")]
    lemmata_data_tups = [(int(inscr_data[0]), inscr_data[1:]) for inscr_data in lemmata_data_split]
    lemmata_data_dict = dict(lemmata_data_tups)
    return lemmata_data_dict

In [71]:
filenames = os.listdir("../data/large_data/lemmata_files")
filenames

['lemmata_full_160000-220000.txt',
 'lemmata_full_missing1.txt',
 'lemmata_full_missing2.txt',
 'lemmata_full_100000-160000.txt',
 'lemmata_full_50000-100000.txt',
 'lemmata_full_0-50000.txt']

In [72]:
lemmata_full_merged = {}
for fname in filenames:
    lemmata_full_str = open("../data/large_data/lemmata_files/" + fname, "r", encoding="utf-8").read()
    lemmata_full_merged.update(preprocess_lemmata_txt(lemmata_full_str))
len(lemmata_full_merged)

98447

In [73]:
lemmata_merged = []
for k,v in lemmata_full_merged.items():
    lemmata_merged.extend(v)

In [76]:
lemmata_merged.count("θεός")

10518

In [77]:
lemmata_merged.count("δίκαιος")

491

In [78]:
nltk.FreqDist(lemmata_merged).most_common()[:100]

[('ὁ', 169208),
 ('καὶ', 109671),
 ('τε', 44382),
 ('τίν', 39925),
 ('τὶς', 37483),
 ('ποῦ', 31188),
 ('ἐν', 26131),
 ('δὲ', 22507),
 ('ἐπί', 22364),
 ('τι', 22176),
 ('γῶν', 21334),
 ('αὐτός', 19722),
 ('ὅς', 15179),
 ('εἰμί', 14495),
 ('εἰς', 14350),
 ('δῆμος', 13643),
 ('οὐ', 13527),
 ('πᾶς', 12199),
 ('τοῖρ', 11298),
 ('ἐκ', 11171),
 ('ἄλλος', 10970),
 ('θεός', 10518),
 ('κατά', 9775),
 ('ἄρχων', 9156),
 ('οὗτος', 8929),
 ('βουλή', 7955),
 ('ἱερός', 7016),
 ('ὡς', 6747),
 ('ἔχω', 6658),
 ('βράχoς', 6321),
 ('ἀπό', 6294),
 ('πρός', 6146),
 ('παρά', 6129),
 ('κα', 5976),
 ('ἔτος', 5967),
 ('μή', 5798),
 ('ἀγαθός', 5761),
 ('ὅδε', 5241),
 ('μείς', 5129),
 ('τις', 5029),
 ('χαίρω', 5007),
 ('ἀνατίθημι', 4851),
 ('τῷ', 4563),
 ('περί', 4494),
 ('γυνή', 4456),
 ('λωτοφάγoi', 4412),
 ('ἑαυτοῦ', 4366),
 ('δραχμή', 4242),
 ('τιμή', 4217),
 ('ἀνήρ', 4194),
 ('Ἀπολλωνία', 4186),
 ('στρατιώτης', 4154),
 ('υἱός', 4021),
 ('εἰ', 4016),
 ('στέφανος', 4000),
 ('Δελφοί', 3939),
 ('ἄν', 3935),
 ('δύ