# Requirements

In [4]:
import os
import json
import pandas as pd
import numpy as np
import json
import logging ### to monitor the code
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
from urllib.request import urlopen 
import io
import getpass
import re
from collections import defaultdict
from itertools import islice # to iterate through dicts

import nltk
from nltk.collocations import *

#!pip3 install pyconll #
import pyconll # universal dependencies parser

### plotting
### to use latex (important for greek fonts)
#! apt-get install texlive-latex-recommended 
#! apt install texlive-latex-extra
#! apt install dvipng
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

#from anda import gr
# OR uncomment the following:
# Download the source of the `anda.gr` module and execute it in this notebook's
# namespace (presumably this is what provides `lemmatize_string`, which is used
# further down but not defined in this notebook -- TODO confirm).
# SECURITY NOTE(review): this executes remote code fetched from the mutable
# `master` branch; prefer installing `anda` or pinning the URL to a commit hash.
script_url = "https://raw.githubusercontent.com/sdam-au/anda_py/master/anda/gr.py"
script_response = requests.get(script_url, timeout=60)
# Fail loudly on an HTTP error instead of exec-ing an error page (e.g. "404: Not Found").
script_response.raise_for_status()
exec(script_response.content)

In [6]:
# Configure access to the shared folder "SDAM_root" on sciencedata.dk.
# The password is prompted interactively (see the cell output). `conf[0]` is used below
# for HTTP GET requests, and `conf` itself is passed to sddk.read_file / sddk.write_file.
# NOTE(review): the account name is hardcoded here -- consider reading it from an
# environment variable or prompting for it before sharing the notebook.
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [8]:
# To access Google Sheets, a Google Service Account key (a JSON file) is needed.
# Here the key is stored in the author's personal space on sciencedata.dk and read from there:

# (1) read the file and parse its content
file_data = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json").json()
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes needed here (spreadsheet feeds and Drive)
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the scoped credentials to authenticate the gspread client
gc = gspread.Client(auth=scoped_credentials)

# open the AGT overview spreadsheet by its URL
AGT_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1iVta_FuEDgUM_Lf_yByrdbbXNoVH_dnVZs6QRyYv1NM/edit?usp=sharing")

# AGT_metadata = gc.open_by_url("https://docs.google.com/spreadsheets/d/1hEUnL3E07F-EnE3wYnk1V91aXfPDrcnhFHKjD-04CM0/edit?usp=sharing")

In [9]:
# import the data: read the dated AGT dataset from sciencedata.dk
# (the "df" argument requests a dataframe; the result is used as a pandas DataFrame below)
AGT = sddk.read_file("SDAM_data/AGT/AGT_dated_20201027.json", "df", conf)

In [10]:
# glue the texts of all works into one space-separated string
# and display its total length in characters
AGT_string = " ".join(AGT["string"])
len(AGT_string)

219532769

In [11]:
# sample passage for trying out the cleaning steps below: Greek text mixed with
# numerical superscripts (³, ⁴), Latin-script words and plain digits
test_string = "ἀρχόμενος ³σέο, Φοῖβε, text in latin also CAPITAL ⁴παλαιγενέων 1234 κλέα 34056 φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ. τοίην γὰρ Πελίης φάτιν ἔκλυεν, ὥς μιν ὀπίσσω μοῖρα μένει στυγερή, τοῦδʼ ἀνέρος, ὅντινʼ ἴδοιτο δημόθεν οἰοπέδιλον, ὑπʼ ἐννεσίῃσι δαμῆναι."
test_string

'ἀρχόμενος ³σέο, Φοῖβε, text in latin also CAPITAL ⁴παλαιγενέων 1234 κλέα 34056 φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ. τοίην γὰρ Πελίης φάτιν ἔκλυεν, ὥς μιν ὀπίσσω μοῖρα μένει στυγερή, τοῦδʼ ἀνέρος, ὅντινʼ ἴδοιτο δημόθεν οἰοπέδιλον, ὑπʼ ἐννεσίῃσι δαμῆναι.'

In [12]:
# remove numerical superscripts
# All ten superscript digits; the original list was missing "⁸".
sups = ["¹", "²", "³", "⁴", "⁵", "⁶", "⁷", "⁸", "⁹", "⁰"]
def remove_sups(string):
    """Return `string` with every superscript digit (⁰-⁹) removed.

    Parameters
    ----------
    string : str
        Text that may contain superscript digits (e.g. note markers).

    Returns
    -------
    str
        The text without superscript digits; all other characters are unchanged.
    """
    for sup in sups:
        string = string.replace(sup, "")
    return string

def grave_to_acute(string):
    """Replace every grave accent in `string` with an acute accent.

    The text is decomposed (NFD) so that accents become separate combining
    marks, each combining grave (U+0300) is swapped for a combining acute
    (U+0301), and the result is recomposed (NFC).
    """
    combining_grave, combining_acute = "\u0300", "\u0301"
    decomposed = unicodedata.normalize("NFD", string)
    swapped = decomposed.replace(combining_grave, combining_acute)
    return unicodedata.normalize("NFC", swapped)

# strip ASCII letters and digits (Latin-script words and numbers);
# whitespace and punctuation are left in place
test_string_1 = re.sub("[a-zA-Z0-9]", "", test_string)
test_string_1

'ἀρχόμενος ³σέο, Φοῖβε,      ⁴παλαιγενέων  κλέα  φωτῶν μνήσομαι, οἳ Πόντοιο κατὰ στόμα καὶ διὰ πέτρας Κυανέας βασιλῆος ἐφημοσύνῃ Πελίαο χρύσειον μετὰ κῶας ἐύζυγον ἤλασαν Ἀργώ. τοίην γὰρ Πελίης φάτιν ἔκλυεν, ὥς μιν ὀπίσσω μοῖρα μένει στυγερή, τοῦδʼ ἀνέρος, ὅντινʼ ἴδοιτο δημόθεν οἰοπέδιλον, ὑπʼ ἐννεσίῃσι δαμῆναι.'

In [13]:
# apply the remaining cleaning steps (drop superscripts, turn grave accents into acute ones)
# and show the resulting whitespace-separated tokens
test_string_2 = remove_sups(test_string_1)
test_string_3 = grave_to_acute(test_string_2)
print(test_string_3.split())

['ἀρχόμενος', 'σέο,', 'Φοῖβε,', 'παλαιγενέων', 'κλέα', 'φωτῶν', 'μνήσομαι,', 'οἵ', 'Πόντοιο', 'κατά', 'στόμα', 'καί', 'διά', 'πέτρας', 'Κυανέας', 'βασιλῆος', 'ἐφημοσύνῃ', 'Πελίαο', 'χρύσειον', 'μετά', 'κῶας', 'ἐύζυγον', 'ἤλασαν', 'Ἀργώ.', 'τοίην', 'γάρ', 'Πελίης', 'φάτιν', 'ἔκλυεν,', 'ὥς', 'μιν', 'ὀπίσσω', 'μοῖρα', 'μένει', 'στυγερή,', 'τοῦδʼ', 'ἀνέρος,', 'ὅντινʼ', 'ἴδοιτο', 'δημόθεν', 'οἰοπέδιλον,', 'ὑπʼ', 'ἐννεσίῃσι', 'δαμῆναι.']


In [14]:
# add column with sentences

def get_sentences(string):
    """Split a text into sentences.

    The text is NFC-normalized and then split on the middle dot, full stop,
    colon and semicolon. Surrounding whitespace is stripped from every piece.
    Empty pieces (e.g. the one after a trailing full stop) are kept.

    Parameters
    ----------
    string : str
        The text to split.

    Returns
    -------
    list of str
        The stripped pieces, in their original order.
    """
    normalized = unicodedata.normalize("NFC", string)
    # NFC maps the Greek ano teleia (U+0387) to the middle dot (U+00B7) and the
    # Greek question mark (U+037E) to ";", so the class below covers both.
    # The middle dot is written as an explicit escape to avoid look-alike
    # characters, and a character class needs no backslash escapes (the old
    # pattern relied on invalid string escapes such as "\:" and "\;").
    return [piece.strip() for piece in re.split("[\u00b7.:;]", normalized)]

# split the text of every work into sentences; each cell of the new column holds a list of strings
AGT["sentences"] = AGT["string"].apply(get_sentences)

In [15]:
AGT.head(5)

Unnamed: 0,filename,author,title,string,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,"[ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,\nΘουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶ...,150126,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,[Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,110773,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,[Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβά...
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,\n̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα...,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,"[̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,[τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ...


# Lemmatization

In [17]:
%%time
# lemmatize the full text of every work, restricted to the postags "n", "a" and "v"
# (presumably nouns, adjectives and verbs -- see lemmatize_string for the exact meaning)
AGT["lemmata"] = AGT["string"].apply(
    lambda text: lemmatize_string(text, all_lemmata=False, filter_by_postag=["n", "a", "v"], involve_unknown=True)
)

CPU times: user 2min 26s, sys: 2.2 s, total: 2min 28s
Wall time: 2min 29s


In [18]:
# flatten the per-work lemmata lists into one long list covering the whole corpus
all_lemmata = [lemma for work_lemmata in AGT["lemmata"].tolist() for lemma in work_lemmata]

In [21]:
# number of lemmata per work, followed by the corpus-wide total
AGT["lemmata_wordcount"] = AGT["lemmata"].apply(len)
AGT["lemmata_wordcount"].sum() # previously we had 13925726, then 13713183

13812934

In [None]:
%%time
def lemmatize_sentences(sentences):
    """Lemmatize every sentence of a work separately.

    Each sentence is passed to `lemmatize_string` with the same settings as
    the whole-text lemmatization above. Returns a list with one lemmatization
    result per input sentence, in the original order.
    """
    return [
        lemmatize_string(text, all_lemmata=False, filter_by_postag=["n", "a", "v"], involve_unknown=True)
        for text in sentences
    ]

# lemmatize every work sentence by sentence; each cell of the new column holds
# one lemmatization result per sentence of that work
AGT["lemmatized_sentences"] = AGT["sentences"].apply(lemmatize_sentences)

In [None]:
# number of (lemmatized) sentences per work
AGT["n_sentences"] = AGT["lemmatized_sentences"].apply(len)

In [None]:
# look at some string
# e.g. Paul's letter to the Galatians:
# the first 1000 characters of the fourth work with author_id "tlg0031paul"
AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["string"][:1000]

In [None]:
# and its lemmatized version:
# the first 100 items of the "lemmata" column for the same work
print(AGT[AGT["author_id"]=="tlg0031paul"].iloc[3]["lemmata"][:100])

In [None]:
# straightforward corrections might be added here

# mapping: wrongly assigned lemma -> lemma that should replace it
corrections = {
    "ἔχις" : "ἔχω"
}

# apply the corrections one after another, to the per-work lemmata
# as well as to the per-sentence lemmata
for key, corrected in corrections.items():
    fix_words = lambda words: [corrected if word == key else word for word in words]
    fix_sentences = lambda sentences: [fix_words(sentence) for sentence in sentences]
    AGT["lemmata"] = AGT["lemmata"].apply(fix_words)
    AGT["lemmatized_sentences"] = AGT["lemmatized_sentences"].apply(fix_sentences)

In [None]:
# save the dataframe, including the columns added in this notebook, to sciencedata.dk
# under a new dated filename (the input file read above is left untouched)
sddk.write_file("SDAM_data/AGT/AGT_20201110.json", AGT, conf)