# Plan

1. Remove failed texts
2. Figure out how to save frequencies to database
3. Start frequency computation
4. Script for importing Maximus (start new file for every heading-3, new chapter for heading-4)
6. Understand matching
7. Understand unitizing (word count per chapter, etc)

# Import and connect

In [1]:
import json
from bson import ObjectId
import pandas as pd

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers.sparse_encoding import SparseMatrixSearch
from tesserae.utils.calculations import get_text_frequencies

# Set up the connection and clean up the database
connection = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesstest')
connection_cb = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesserae_cb')

# Load texts

Create metadata file

In [2]:
import os
import json

# Replace 'your_directory' with the path to your main directory containing .tess files
lang = 'greek'
main_directory = '/home/administrador/maximus-confessor/texts/grc'

# Initialize an empty list to store metadata
metadata = []

# Iterate through all subdirectories and files
for filename in os.scandir(main_directory):
    if filename.is_file():
        if filename.path.endswith(".tess"):
            # Split the filename into parts based on a delimiter (.)
            parts = filename.name.split('.')

            # Extract metadata information based on the filename structure
            if len(parts) == 3:
                author = parts[0]
                title = ' '.join(parts[1].split('_'))
                file_path = os.path.join(main_directory, filename)

                # Add the metadata to the list
                metadata.append({'author': author, 'title': title, 'path': file_path, 'language':lang, 'year':0})

# Define the output metadata file
metadata_file = 'data/text_metadata_'+lang+'.json'

# Write the metadata to a JSON file
with open(metadata_file, 'w') as file:
    json.dump(metadata, file, indent=4)

print(f"Metadata file '{metadata_file}' has been created.")


Metadata file 'data/text_metadata_greek.json' has been created.


Insert metadata into database

In [11]:
with open('data/text_metadata_greek.json', 'r') as f:
    text_meta = json.load(f)

print('{}{}{}{}'.format('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year'))
print('{}{}{}{}'.format('-----'.ljust(15), '------'.ljust(15), '--------'.ljust(15), '----'))
for t in text_meta:
    print('{}{}{}{}'.format(t['title'].ljust(15), t['author'].ljust(15), t['language'].ljust(15), str(t['year']).ljust(15)))

texts = []
for t in text_meta:
    texts.append(Text.json_decode(t))
result = connection.insert(texts)
print('Inserted {} texts.'.format(len(result.inserted_ids)))

Title          Author         Language       Year
-----          ------         --------       ----
seven against thebesaeschylus      greek          0              
oedipus tyrannussophocles      greek          0              
bacchae        euripides      greek          0              
epitome        apollodorus    greek          0              
for phormio    demosthenes    greek          0              
consolatio ad apolloniumplutarch       greek          0              
hymns          callimachus    greek          0              
epistulae      basil_of_caesareagreek          0              
euthyphro      plato          greek          0              
de mirabilibus phlegon        greek          0              
clouds         aristophanes   greek          0              
olynthiac 3    demosthenes    greek          0              
on the embassy aeschines      greek          0              
de iside et osirideplutarch       greek          0              
against eubulidesdemosthe

Tokenize the texts

In [12]:
for text in texts[:3]:
    tessfile = TessFile(text.path, metadata=text)
    tokenizer = GreekTokenizer(connection) if tessfile.metadata.language == 'greek' else LatinTokenizer(connection)
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)    
    result = connection.insert(features)
    result = connection.insert(tokens)

    #unitizer = Unitizer()
    #lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    #result = connection.insert(lines + phrases)    
    print(text.title)

seven against thebes
oedipus tyrannus
bacchae


Remove failed texts:

In [17]:
connection_cb = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesserae_cb')

failed = ['orationes',
'antiquitates judaicae',
'iphigenia aulis',
'histories',
'antiquitates romanae',
'deipnosophists',
'geography']

connection_cb.connection['texts'].delete_many({'title': {'$in':failed}})

<pymongo.results.DeleteResult at 0x7f2816a75310>

Update paths

In [35]:
texts = connection_cb.find('texts')
for t in texts:
    p_old = t.path.split('/')
    p_new = '/'.join(p_old[:3]+['maximus-confessor']+p_old[4:])
    t.path = p_new
result = connection_cb.update(texts)

# Test lemmatization

Load the list of texts

In [142]:
texts = connection.find('texts')

Load the tokens from one of the texts

In [272]:
tokens = connection.find('tokens',text=texts[0].id)

In [273]:
len(tokens)

11896

Look up the lemma for the token:

In [121]:
print(tokens[0].display)
lem = connection.find('features',_id=tokens[0].features['lemmata'][0])
print(lem[0].token)

ἥκω
ἥκω


Look at some lemmas

In [15]:
connection.find('features',feature='lemmata')[:5]

[Feature(language=greek, feature=lemmata, token=ὄρνυμι, index=411, frequencies={'6509797141de82c578e8fd64': 4}),
 Feature(language=greek, feature=lemmata, token=σας, index=2065, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=τέχνη, index=162, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=κλαυστός, index=978, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=τώς, index=1310, frequencies={'6509797141de82c578e8fd64': 2})]

Find the corresponding text

In [97]:
connection.find('texts',_id=ObjectId('6509797141de82c578e8fd64'))

[Text(language=greek, title=seven against thebes, author=aeschylus, year=0, ingestion_status=['Initialized', ''], ingestion_details={}, path=/home/administrador/tesserae/texts/grc/aeschylus.seven_against_thebes.tess, is_prose=False)]

# Frequencies

Now implemented in my branch of tesserae-v5, in utils.calculations

In [143]:
def get_text_frequencies(connection,text,feature='lemmata'):
    result = connection.aggregate('tokens',[{'$match': {'text': ObjectId(text)}}, 
                                            {"$project": {"_id": 0, "features": 1}},
                                            {"$unwind": "$features."+feature},
                                            {'$group' : {"_id" : "$features."+feature , text : {"$sum": 1}}}],
                                  encode=False)
    freqs = pd.DataFrame(list(result)).set_index("_id")

    return freqs

In [133]:
text = '6509797141de82c578e8fd64'
freqs = get_text_frequencies(connection,text)

## Extract frequencies from "lemmata" entities

Not very useful, because frequencies fail to update on loading new texts

In [9]:
result = connection.aggregate('features',[ {'$match': { 'feature': 'lemmata' }}, 
                                          { "$project": {"data": { "$objectToArray": "$frequencies" }}},
                                          {'$unwind': '$data'}],
                     encode=False)

In [10]:
freqs = pd.DataFrame()
for element in result:
    freqs.loc[element['_id'],element['data']['k']] = element['data']['v']
freqs.fillna(value=0).min(axis=1).sort_values(ascending=False).iloc[:50]

# Try LAP

Import frequencies

In [3]:
texts = connection_cb.find('texts')
freqs = pd.DataFrame()
for text in texts:
    freqs=freqs.join(get_text_frequencies(connection_cb,str(text.id)),how='outer')
freqs = freqs.fillna(value=0).astype('int64')

KeyboardInterrupt: 

In [None]:
freqs = freqs.iloc[:-1,:]

In [19]:
freqs.to_excel('frequencies.xlsx')

Add labels

In [25]:
lemma_labels=[]
for idx in freqs.index:
    lemma_labels.append(connection_cb.find('features',_id=idx)[0].token)
freqs.index = lemma_labels

text_labels=[]
for t in freqs.keys():
    text_labels.append(connection_cb.find('texts',_id=ObjectId(t))[0].title)
freqs.columns = text_labels

AttributeError: 'list' object has no attribute 'title'

In [32]:
import scripts.lap_v2_py3 as lap_v2
import numpy as np

names = ['meno']

freqs_norm = freqs.copy()
N = np.shape(freqs)[0]
for item in freqs:
    freqs_norm[item] = lap_v2.rank_norm(np.asarray(freqs[item]), dist='normal',norm=N)

data = freqs_norm[names]
basis = freqs_norm.T.drop(names).T

[a, A, eta] = lap_v2.lap(basis.values, data.values, full_output=True)

projections = pd.DataFrame(data = a, index = basis.keys(), columns = names)
eta = pd.DataFrame(data = eta.T, index = data.index, columns = basis.keys())

In [36]:
eta.sort_values(by='alcibiades 1',ascending=False).iloc[:30]

Unnamed: 0,seven against thebes,oedipus tyrannus,bacchae,epitome,for phormio,consolatio ad apollonium,hymns,epistulae,euthyphro,de mirabilibus,...,theological orations,aratus,on the accession of alexander,barnabae epistulae,antony,theages,soleocista,metaphysics,funeral speech,against dionysodorus
ἀλκιβιάδης,-3.944954e-06,-5.43463e-06,2.461263e-06,-2.301668e-05,5.959946e-07,-5.8e-05,-5.199533e-06,-5.2e-05,-4.2e-05,-1.1e-05,...,-2.2e-05,-8.8e-05,-2.688404e-05,3e-06,5.405729e-05,-6.4e-05,-1.3e-05,-2.5e-05,-3.997762e-05,-2.856334e-06
κλεινίας,-1.02982e-07,-1.301442e-05,-1.419298e-07,-5.905346e-06,-2.101703e-05,-1.7e-05,-3.006209e-06,-9e-06,-3.1e-05,-8e-06,...,-9e-06,0.00019,-1.052343e-05,-4e-06,-2.434382e-05,-4.1e-05,-1.5e-05,-4e-06,-4.348343e-06,-7.494773e-07
παιδοτριβέω,-2.507748e-06,-2.276123e-06,-2.996161e-06,1.402864e-06,-4.356965e-07,-1.1e-05,-4.588213e-06,-1.1e-05,-2e-05,-4e-06,...,-6e-06,1e-06,-2.814627e-06,-1e-06,5.384534e-07,-2.5e-05,-7e-06,-1e-05,-3.014103e-06,-2.479363e-06
ἀπογίγνομαι,-5.516727e-06,-1.211754e-05,-7.680916e-06,-2.486613e-07,-3.863488e-06,0.000151,-7.718244e-06,-3.5e-05,-3e-05,-1.2e-05,...,8e-05,-1e-05,-5.496613e-07,-2e-05,-5.230489e-06,-2.6e-05,-5e-06,7.1e-05,-1.631326e-05,-9.371774e-06
παιδοτρίβης,-5.0185e-06,-1.210458e-05,-5.235803e-06,-1.149159e-05,-3.474965e-06,-1.3e-05,-1.057702e-05,3.2e-05,-2e-05,-1.2e-05,...,-1.2e-05,-1.4e-05,0.0002994689,-6e-06,2.036729e-06,-4.8e-05,-1e-05,-1e-05,-2.845319e-05,-2.246812e-05
σκυτικός,-1.546393e-07,-4.111311e-06,-2.110966e-06,8.154223e-07,-1.87354e-06,-1.6e-05,-3.760203e-06,-1.2e-05,-2e-05,-6e-06,...,-1.4e-05,2e-06,-2.683925e-06,-5e-06,8.420552e-07,-2.4e-05,-7e-06,9.3e-05,-3.666043e-06,-4.957468e-06
ἐμβλέπω,-1.438407e-06,1.956141e-06,1.042038e-06,-1.092476e-05,-3.729912e-05,-1.9e-05,-1.081128e-05,2.5e-05,-1.6e-05,-1.6e-05,...,-3.1e-05,-1.5e-05,-2.110594e-05,0.000134,-2.682232e-05,-2.3e-05,-2.5e-05,-2e-05,-6.436449e-06,-6.750422e-05
συμβούλευσις,-2.383237e-06,-2.063189e-06,-2.774578e-06,1.357152e-06,-3.896509e-07,-1e-05,-4.369283e-06,-1.1e-05,-1.9e-05,-4e-06,...,-6e-06,1e-06,-2.618065e-06,-1e-06,4.47055e-07,-2.3e-05,-6e-06,-9e-06,-2.71562e-06,-2.292673e-06
ὑπόδημον,1.361346e-06,-7.848884e-06,-2.130639e-06,-4.316334e-06,1.457856e-06,-1.9e-05,-4.484785e-06,3e-05,-1.9e-05,-8e-06,...,-2.6e-05,-7e-06,-2.585572e-06,-1.2e-05,-1.739021e-05,-2e-05,-7e-06,0.00011,-4.635907e-06,-6.573903e-06
εὐρυσάκης,-2.110268e-05,-3.165737e-05,-1.320904e-05,-6.409923e-06,2.426174e-06,-1.9e-05,-9.01187e-06,-1.2e-05,-2.3e-05,-4e-06,...,-5e-06,-1.4e-05,-1.627894e-06,1e-06,-1.167446e-05,-2.8e-05,-8e-06,-9e-06,-6.437356e-06,-1.188987e-06


In [34]:
projections.sort_values(by = names[0])

Unnamed: 0,meno
on the accession of alexander,-0.039028
titus flamininus,-0.026018
seven against thebes,-0.018808
de esu carnium,-0.018371
caesar,-0.017959
...,...
euthyphro,0.089123
symposium,0.098149
hippias maior,0.110177
theages,0.114719


# Read Maximus files

In [57]:
from docx import Document

folder = "/home/administrador/maximus-confessor/texts/maximus/"
document = Document(folder+"Agios Maximos.docx")
titles = []
f = open('test.tess','w')
for paragraph in document.paragraphs:
    if paragraph.style.name == "Heading 3":
        f.close()
        f = open(folder+'maximus_confessor.'+paragraph.text+'.tess','w')
        titles.append(paragraph.text)
        part = 0
        line = 0
    elif paragraph.style.name == "Heading 4":
        part += 1
        line = 0
    elif paragraph.style.name == "Normal":
        line +=1
        if part != 0:
            f.write('<'+titles[-1][:2]+' '+str(part)+'.'+str(line)+'>'+paragraph.text+'\n')
        else:
            f.write('<'+titles[-1][:2]+' '+str(line)+'>'+paragraph.text+'\n')
f.close()

Import into database

In [58]:
import os
import json

# Replace 'your_directory' with the path to your main directory containing .tess files
lang = 'greek'
main_directory = '/home/administrador/maximus-confessor/texts/maximus'

# Initialize an empty list to store metadata
metadata = []

# Iterate through all subdirectories and files
for filename in os.scandir(main_directory):
    if filename.is_file():
        if filename.path.endswith(".tess"):
            # Split the filename into parts based on a delimiter (.)
            parts = filename.name.split('.')

            # Extract metadata information based on the filename structure
            if len(parts) == 3:
                author = parts[0]
                title = ' '.join(parts[1].split('_'))
                file_path = os.path.join(main_directory, filename)

                # Add the metadata to the list
                metadata.append({'author': 'Maximus Confessor', 'title': title, 'path': file_path, 'language':lang, 'year':640})

# Define the output metadata file
metadata_file = 'data/maximus_texts.json'

# Write the metadata to a JSON file
with open(metadata_file, 'w') as file:
    json.dump(metadata, file, indent=4)

print(f"Metadata file '{metadata_file}' has been created.")


Metadata file 'data/maximus_texts.json' has been created.


In [59]:
with open('data/maximus_texts.json', 'r') as f:
    text_meta = json.load(f)

print('{}{}{}{}'.format('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year'))
print('{}{}{}{}'.format('-----'.ljust(15), '------'.ljust(15), '--------'.ljust(15), '----'))
for t in text_meta:
    print('{}{}{}{}'.format(t['title'].ljust(15), t['author'].ljust(15), t['language'].ljust(15), str(t['year']).ljust(15)))

texts = []
for t in text_meta:
    texts.append(Text.json_decode(t))
result = connection.insert(texts)
print('Inserted {} texts.'.format(len(result.inserted_ids)))

Title          Author         Language       Year
-----          ------         --------       ----
ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ Maximus Confessorgreek          640            
ΠΕΡΙ ΤΩΝ ΠΡΑΧΘΕΝΤΩΝ ΕΝ ΤΗ ΠΡΩΤΗ ΑΥΤΟΥ ΕΞΟΡΙΑ, ΗΤΟΙ ΕΝ ΒΙΖΥΗ Maximus Confessorgreek          640            
ΕΠΙΣΤΟΛΑΙ ΚΟΙΝΑΙ, ΔΟΓΜΑΤΙΚΑΙ ΚΑΙ ΠΟΛΕΜΙΚΑΙMaximus Confessorgreek          640            
ΕΞΗΓΗΣΙΣ ΤΗΣ ΚΙΝΗΣΕΩΣ Maximus Confessorgreek          640            
ΣΥΖΗΤΗΣΙΣ ΤΟΥ ΑΓΙΟΥ  ΜΑΞΙΜΟΥ ΜΕΤΑ ΠΥΡΡΟΥ Maximus Confessorgreek          640            
ΣΧΟΛΙΑ ΕΙΣ ΤΑΣ ΕΠΙΣΤΟΛΑΣ ΤΟΥ ΑΓΙΟΥ ΔΙΟΝΥΣΙΟΥ ΑΡΕΟΠΑΓΙΤΟΥ Maximus Confessorgreek          640            
ΜΙΚΡΑ ΘΕΟΛΟΓΙΚΑ ΚΑΙ ΠΟΛΕΜΙΚΑMaximus Confessorgreek          640            
ΚΕΦΑΛΑΙΑ ΠΕΡΙ ΑΓΑΠΗΣMaximus Confessorgreek          640            
ΣΧΟΛΙΑ ΕΙΣ ΤΑ ΤΟΥ ΑΓΙΟΥ ΔΙΟΝΥΣΙΟΥ Maximus Confessorgreek          640            
ΕΡΜΗΝΕΙΑ ΣΥΝΤΟΜΟΣ ΕΙΣ ΤΗΝ ΠΡΟΣΕΥΧΗΝ ΤΟΥ ΠΑΤΕΡ ΗΜΩΝ,Maximus Confessorgreek          640            
ΚΕΦΑΛΑΙΑ Σ' ΠΕΡΙ ΘΕΟΛΟΓΙΑΣMaximus Confessorgreek  

In [60]:
for text in texts[:3]:
    tessfile = TessFile(text.path, metadata=text)
    tokenizer = GreekTokenizer(connection) if tessfile.metadata.language == 'greek' else LatinTokenizer(connection)
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)    
    result = connection.insert(features)
    result = connection.insert(tokens)

    #unitizer = Unitizer()
    #lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    #result = connection.insert(lines + phrases)    
    print(text.title)

ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ 
ΠΕΡΙ ΤΩΝ ΠΡΑΧΘΕΝΤΩΝ ΕΝ ΤΗ ΠΡΩΤΗ ΑΥΤΟΥ ΕΞΟΡΙΑ, ΗΤΟΙ ΕΝ ΒΙΖΥΗ 
ΕΠΙΣΤΟΛΑΙ ΚΟΙΝΑΙ, ΔΟΓΜΑΤΙΚΑΙ ΚΑΙ ΠΟΛΕΜΙΚΑΙ


In [61]:
tessfile = TessFile(texts[0].path, metadata=texts[0])

In [64]:
tessfile[0][:6]

'<ΠΡ 1>'

In [65]:
textd = {}
for line in tessfile:
    q = line[4:6]
    if q[1]=='>':
        q = q[0]
    try:
        textd[q] = textd[q]+[line]
    except:
        textd[q] = [line]

In [66]:
textd

{'1': ['<ΠΡ 1>ΤΑΔΕ ΕΝΕΣΤΙΝ ΕΝ Τῼ ΔΕ Τῼ ΠΡΩΤῼ ΒΙΒΛΙῼ ΤΩΝ ΑΠΟΡΩΝ ΤΗΣ ΘΕΙΑΣ ΓΡΑΦΗΣ\n'],
 '2': ['<ΠΡ 2>\n'],
 '3': ['<ΠΡ 3>ΜΑΞΙΜΟΥ ΜΟΝΑΧΟΥ\n'],
 '1.': ['<ΠΡ 1.1>\n',
  '<ΠΡ 1.2>Φυσικὸν μὲν τοῖς λογικοῖς πέφυκε κάλλος ὁ λόγος· λόγου δὲ κάλλος ἡ κατὰ τὸν λόγον ἀκριβὴς τῶν λογικῶν ἐστι σύνεσις· συνέσεως δὲ κάλλος ἐστὶν ἡ κατὰ τὴν ἀρετὴν σὺν λόγῳ τῶν λογικῶν γόνιμος ἕξις· ταύτης δὲ κάλλος τῆς ἕξεως ἡ περὶ τὴν ἀληθῆ γνῶσιν ἀπλανὴς θεωρία καθέστηκεν, ἧς τέλος ἐστὶν ἡ σοφία, συνέσεως ὑπάρχουσα σαφεστάτη συμπλήρωσις, οἷα δὴ λόγος τυγχάνουσα κατὰ φύσιν τετελεσμένος, ὅστις ἐστὶ νοῦς καθαρός, τῇ περὶ τὴν αἰτίαν ἑνώσει σχέσιν λαβὼν ὑπὲρ νόησιν· καθ᾽ ἥν, ἀποπαύσας τὴν πολυποίκιλον πρὸς τὰ μετὰ τὴν αἰτίαν φυσικὴν αὐτοῦ κίνησίν τε καὶ σχέσιν, μόνης ἀγνώστως ἀντέχεται, κατὰ τὴν ἄφθεγκτον λῆξιν γεγενημένος, τῆς ὑπὲρ νόησιν παμμακαρίστου σιγῆς, ἣν δηλῶσαι παντελῶς οὐ δύναται λόγος ἢ νόησις, ἀλλὰ μόνη κατὰ τὴν μέθεξιν ἡ πεῖρα τῶν ἀξιωθέντων τῆς ὑπὲρ νόησιν ἀπολαύσεως, ἧς σημεῖόν ἐστιν εὔγνωστόν τε καὶ πᾶσι κ

# Reset database

In [10]:
connection.connection['feature_sets'].delete_many({})
connection.connection['features'].delete_many({})
connection.connection['frequencies'].delete_many({})
connection.connection['matches'].delete_many({})
connection.connection['match_sets'].delete_many({})
connection.connection['texts'].delete_many({})
connection.connection['tokens'].delete_many({})
connection.connection['units'].delete_many({})

<pymongo.results.DeleteResult at 0x7f85a2dffdc0>

# Create metadata file

See process_texts.py