# Plan

1. Fix insert so that it correctly updates existing entries (workaround for now by updating after inserting)
2. Update get_text_frequencies to extract existing frequencies data (two lines of code, in LAP section of other notebook)
3. See how hard it is to use visualization tools from https://github.com/vierth/chinesetextreuse (and maybe his algorithm too, if it's faster than Smith-Waterman)
4. Fix formatting of Maximus texts (the tags are all messed up, maybe because they include Greek characters)

# Import and connect

In [1]:
import json
from bson import ObjectId
import pandas as pd

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers.sparse_encoding import SparseMatrixSearch
from tesserae.utils.calculations import get_text_frequencies

# Set up the connection and clean up the database
connection = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesstest')
connection_cb = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesserae_cb')

# Load texts

Create metadata file

In [2]:
import os
import json

# Replace 'your_directory' with the path to your main directory containing .tess files
lang = 'greek'
main_directory = '/home/administrador/maximus-confessor/texts/grc'

# Initialize an empty list to store metadata
metadata = []

# Iterate through all subdirectories and files
for filename in os.scandir(main_directory):
    if filename.is_file():
        if filename.path.endswith(".tess"):
            # Split the filename into parts based on a delimiter (.)
            parts = filename.name.split('.')

            # Extract metadata information based on the filename structure
            if len(parts) == 3:
                author = parts[0]
                title = ' '.join(parts[1].split('_'))
                file_path = os.path.join(main_directory, filename)

                # Add the metadata to the list
                metadata.append({'author': author, 'title': title, 'path': file_path, 'language':lang, 'year':0})

# Define the output metadata file
metadata_file = 'data/text_metadata_'+lang+'.json'

# Write the metadata to a JSON file
with open(metadata_file, 'w') as file:
    json.dump(metadata, file, indent=4)

print(f"Metadata file '{metadata_file}' has been created.")


Metadata file 'data/text_metadata_greek.json' has been created.


Insert metadata into database

In [13]:
with open('data/text_metadata_greek.json', 'r') as f:
    text_meta = json.load(f)

print('{}{}{}{}'.format('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year'))
print('{}{}{}{}'.format('-----'.ljust(15), '------'.ljust(15), '--------'.ljust(15), '----'))
for t in text_meta:
    print('{}{}{}{}'.format(t['title'].ljust(15), t['author'].ljust(15), t['language'].ljust(15), str(t['year']).ljust(15)))

texts = []
for t in text_meta:
    texts.append(Text.json_decode(t))
result = connection.insert(texts)
print('Inserted {} texts.'.format(len(result.inserted_ids)))

Title          Author         Language       Year
-----          ------         --------       ----
seven against thebesaeschylus      greek          0              
oedipus tyrannussophocles      greek          0              
bacchae        euripides      greek          0              
epitome        apollodorus    greek          0              
for phormio    demosthenes    greek          0              
consolatio ad apolloniumplutarch       greek          0              
hymns          callimachus    greek          0              
epistulae      basil_of_caesareagreek          0              
euthyphro      plato          greek          0              
de mirabilibus phlegon        greek          0              
clouds         aristophanes   greek          0              
olynthiac 3    demosthenes    greek          0              
on the embassy aeschines      greek          0              
de iside et osirideplutarch       greek          0              
against eubulidesdemosthe

Tokenize the texts

In [12]:
for text in texts[:3]:
    tessfile = TessFile(text.path, metadata=text)
    tokenizer = GreekTokenizer(connection) if tessfile.metadata.language == 'greek' else LatinTokenizer(connection)
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)    
    result = connection.insert(features)
    result = connection.insert(tokens)

    #unitizer = Unitizer()
    #lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    #result = connection.insert(lines + phrases)    
    print(text.title)

seven against thebes
oedipus tyrannus
bacchae


Remove failed texts:

In [17]:
connection_cb = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesserae_cb')

failed = ['orationes',
'antiquitates judaicae',
'iphigenia aulis',
'histories',
'antiquitates romanae',
'deipnosophists',
'geography']

connection_cb.connection['texts'].delete_many({'title': {'$in':failed}})

<pymongo.results.DeleteResult at 0x7f2816a75310>

Update paths

In [35]:
texts = connection_cb.find('texts')
for t in texts:
    p_old = t.path.split('/')
    p_new = '/'.join(p_old[:3]+['maximus-confessor']+p_old[4:])
    t.path = p_new
result = connection_cb.update(texts)

# Test lemmatization

Load the list of texts

In [142]:
texts = connection.find('texts')

Load the tokens from one of the texts

In [272]:
tokens = connection.find('tokens',text=texts[0].id)

In [273]:
len(tokens)

11896

Look up the lemma for the token:

In [121]:
print(tokens[0].display)
lem = connection.find('features',_id=tokens[0].features['lemmata'][0])
print(lem[0].token)

ἥκω
ἥκω


Look at some lemmas

In [15]:
connection.find('features',feature='lemmata')[:5]

[Feature(language=greek, feature=lemmata, token=ὄρνυμι, index=411, frequencies={'6509797141de82c578e8fd64': 4}),
 Feature(language=greek, feature=lemmata, token=σας, index=2065, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=τέχνη, index=162, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=κλαυστός, index=978, frequencies={'6509797141de82c578e8fd64': 1}),
 Feature(language=greek, feature=lemmata, token=τώς, index=1310, frequencies={'6509797141de82c578e8fd64': 2})]

Find the corresponding text

In [97]:
connection.find('texts',_id=ObjectId('6509797141de82c578e8fd64'))

[Text(language=greek, title=seven against thebes, author=aeschylus, year=0, ingestion_status=['Initialized', ''], ingestion_details={}, path=/home/administrador/tesserae/texts/grc/aeschylus.seven_against_thebes.tess, is_prose=False)]

# Frequencies

Now implemented in my branch of tesserae-v5, in utils.calculations

In [143]:
def get_text_frequencies(connection,text,feature='lemmata'):
    result = connection.aggregate('tokens',[{'$match': {'text': ObjectId(text)}}, 
                                            {"$project": {"_id": 0, "features": 1}},
                                            {"$unwind": "$features."+feature},
                                            {'$group' : {"_id" : "$features."+feature , text : {"$sum": 1}}}],
                                  encode=False)
    freqs = pd.DataFrame(list(result)).set_index("_id")

    return freqs

In [133]:
text = '6509797141de82c578e8fd64'
freqs = get_text_frequencies(connection,text)

## Extract frequencies from "lemmata" entities

In [9]:
result = connection.aggregate('features',[ {'$match': { 'feature': 'lemmata' }}, 
                                          { "$project": {"data": { "$objectToArray": "$frequencies" }}},
                                          {'$unwind': '$data'}],
                     encode=False)

In [10]:
freqs = pd.DataFrame()
for element in result:
    freqs.loc[element['_id'],element['data']['k']] = element['data']['v']
freqs.fillna(value=0).min(axis=1).sort_values(ascending=False).iloc[:50]

@ 

# Try LAP

Import frequencies

In [22]:
authors = ['plato', 'aristotle', 'gregory_of_nazianzus', 'basil_of_caesarea']

In [23]:
texts = connection_cb.find('texts',author=authors[0])
freqs = pd.DataFrame(get_text_frequencies(connection_cb,str(texts[0].id)))
texts = connection_cb.find('texts',author=authors[1])
freqs=freqs.join(get_text_frequencies(connection_cb,str(texts[0].id)),how='inner')

for author in authors[2:]:
    texts = connection_cb.find('texts',author=authors[0])
    for text in texts:
        freqs=freqs.join(get_text_frequencies(connection_cb,str(text.id)),how='inner')
freqs = freqs.fillna(value=0).astype('int64')

NameError: name 'text' is not defined

In [8]:
freqs

Unnamed: 0,euthyphro,metaphysics
πλήν,1,32
δεσμός,2,4
πάν,6,66
διδασκαλικός,1,2
νυνί,2,1
...,...,...
οὐδείς,14,346
βελτίων,6,17
ἀκίνητος,1,49
ἄδηλος,1,14


In [19]:
freqs.to_excel('frequencies.xlsx')

Add labels

In [7]:
lemma_labels=[]
for idx in freqs.index:
    lemma_labels.append(connection_cb.find('features',_id=idx)[0].token)
freqs.index = lemma_labels

text_labels=[]
for t in freqs.keys():
    text_labels.append(connection_cb.find('texts',_id=ObjectId(t))[0].title)
freqs.columns = text_labels

In [32]:
import scripts.lap_v2_py3 as lap_v2
import numpy as np

names = ['meno']

freqs_norm = freqs.copy()
N = np.shape(freqs)[0]
for item in freqs:
    freqs_norm[item] = lap_v2.rank_norm(np.asarray(freqs[item]), dist='normal',norm=N)

data = freqs_norm[names]
basis = freqs_norm.T.drop(names).T

[a, A, eta] = lap_v2.lap(basis.values, data.values, full_output=True)

projections = pd.DataFrame(data = a, index = basis.keys(), columns = names)
eta = pd.DataFrame(data = eta.T, index = data.index, columns = basis.keys())

In [None]:
eta.sort_values(by='alcibiades 1',ascending=False).iloc[:30]

In [34]:
projections.sort_values(by = names[0])

Unnamed: 0,meno
on the accession of alexander,-0.039028
titus flamininus,-0.026018
seven against thebes,-0.018808
de esu carnium,-0.018371
caesar,-0.017959
...,...
euthyphro,0.089123
symposium,0.098149
hippias maior,0.110177
theages,0.114719


# Read Maximus files

In [1]:
from docx import Document

folder = "/home/administrador/maximus-confessor/texts/maximus/"
document = Document(folder+"Agios Maximos.docx")
titles = []
f = open('test.tess','w')
for paragraph in document.paragraphs:
    if paragraph.style.name == "Heading 3":
        f.close()
        f = open(folder+'maximus_confessor.'+paragraph.text+'.tess','w')
        titles.append(paragraph.text)
        part = 0
        line = 0
    elif paragraph.style.name == "Heading 4":
        part += 1
        line = 0
    elif paragraph.style.name == "Normal":
        line +=1
        if part != 0:
            f.write('<'+titles[-1][:2]+' '+str(part)+'.'+str(line)+'> '+paragraph.text+'\n')
        else:
            f.write('<'+titles[-1][:2]+' '+str(line)+'> '+paragraph.text+'\n')
f.close()

Special treatment for Ad Thalassium:

In [40]:
from docx import Document

folder = "/home/administrador/maximus-confessor/texts/maximus/"
document = Document(folder+"Agios Maximos.docx")
titles = []
f = open('test.tess','w')
is_thal = 0
for paragraph in document.paragraphs:
    if paragraph.style.name == "Heading 3":
        f.close()
        f = open(folder+'maximus_confessor.'+paragraph.text+'.tess','w')
        titles.append(paragraph.text)
        title = titles[-1][:2]
        if paragraph.text == 'ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ':
            is_thal = 1
            title = 'ad. thal.'
        elif paragraph.text == 'ΠΕΡΙ ΔΙΑΦΟΡΩΝ ΑΠΟΡΙΩΝ ':
            is_thal = 0
        part = 0
        line = 0
    elif paragraph.style.name == "Heading 4":
        part += 1
        line = 0
    elif paragraph.style.name == "Normal":
        if len(paragraph.text)>1:
            line +=1
        if is_thal and part == 1:
            part_name = 'prol'
        elif is_thal and part == 2:
            part_name = 'epist'
        elif is_thal and part > 2:
            part_name = part-2
        else:
            part_name = part
        if part != 0 and len(paragraph.text)>1:
            f.write('<'+title+' '+str(part_name)+'.'+str(line)+'> '+paragraph.text+'\n')
        elif part == 0:
            f.write('<'+title+' '+str(line)+'> '+paragraph.text+'\n')
    elif paragraph.style.name != "Heading 5":
        print(title)
        print(paragraph.style.name)
f.close()

ad. thal.
Body Text Indent 3
ΠΕ
Body Text Indent 3
ΠΕ
Body Text Indent 3
ΣΧ
Heading 6
ΕΠ
Body Text Indent 3
ΕΠ
Body Text Indent 3
ΕΠ
Body Text Indent 3
ΚΕ
Body Text Indent 3
ΚΕ
Body Text Indent 3
ΠΡ
Heading 2


In [5]:
from docx import Document

folder = "/home/administrador/maximus-confessor/texts/maximus/"
document = Document(folder+"Mystagogy.docx")
titles = []
f = open('test.tess','w')
for paragraph in document.paragraphs:
    if paragraph.style.name == "Heading 3":
        f.close()
        f = open(folder+'maximus_confessor.'+paragraph.text+'.tess','w')
        titles.append(paragraph.text)
        title = titles[-1][:2]
        part = 0
        line = 0
    elif paragraph.style.name == "Heading 4":
        part += 1
        line = 0
    elif paragraph.style.name == "Normal":
        if len(paragraph.text)>1:
            line +=1
        if part == 1:
            part_name = 'prol'
        elif part > 1:
            part_name = part-1
        if part != 0 and len(paragraph.text)>1:
            f.write('<'+title+' '+str(part_name)+'.'+str(line)+'> '+paragraph.text+'\n')
        elif part == 0:
            f.write('<'+title+' '+str(line)+'> '+paragraph.text+'\n')
    else:
        print(paragraph.style.name)
f.close()

Import into database

In [8]:
import os
import json

# Replace 'your_directory' with the path to your main directory containing .tess files
lang = 'greek'
main_directory = '/home/administrador/maximus-confessor/texts/maximus'

# Initialize an empty list to store metadata
metadata = []

# Iterate through all subdirectories and files
for filename in os.scandir(main_directory):
    if filename.is_file():
        if filename.path.endswith(".tess"):
            # Split the filename into parts based on a delimiter (.)
            parts = filename.name.split('.')

            # Extract metadata information based on the filename structure
            if len(parts) == 3:
                author = parts[0]
                title = ' '.join(parts[1].split('_'))
                file_path = os.path.join(main_directory, filename)

                # Add the metadata to the list
                metadata.append({'author': 'Maximus Confessor', 'title': title, 'path': file_path, 'language':lang, 'year':640})

# Define the output metadata file
metadata_file = 'data/maximus_texts.json'

# Write the metadata to a JSON file
with open(metadata_file, 'w') as file:
    json.dump(metadata, file, indent=4)

print(f"Metadata file '{metadata_file}' has been created.")


Metadata file 'data/maximus_texts.json' has been created.


# Reset database

In [69]:
connection.connection['feature_sets'].delete_many({})
connection.connection['features'].delete_many({})
connection.connection['frequencies'].delete_many({})
connection.connection['matches'].delete_many({})
connection.connection['match_sets'].delete_many({})
connection.connection['texts'].delete_many({})
connection.connection['tokens'].delete_many({})
connection.connection['units'].delete_many({})

<pymongo.results.DeleteResult at 0x7f859b89de50>

# Create metadata file

See process_texts.py