The goal of this notebook is to illustrate how the tesserae package, combined with various bioinformatics tools, can be used to identify intertextual references in Maximus the Confessor's _Quaestiones ad Thalassium_.

In [103]:
import json
from bson import ObjectId
import pandas as pd

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers.sparse_encoding import SparseMatrixSearch
from tesserae.utils.calculations import get_text_frequencies

# Open the connection to the MongoDB instance on localhost, using the 'tesstest' database.
# The database is not touched here; the collections are emptied in the next cell.
# NOTE(review): the two None arguments are presumably user and password -- confirm against TessMongoConnection.
connection = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesstest')

# Reset and load texts

In [29]:
# Empty every collection this notebook writes to, so the load below starts from a clean
# database. WARNING: this irreversibly deletes all documents in these collections.
for collection_name in ('features', 'frequencies', 'matches', 'texts', 'tokens', 'units'):
    connection.connection[collection_name].delete_many({})

# The metadata carries non-ASCII titles (e.g. the Greek title printed below), so decode the
# file explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('data/maximus_comp_test.json', 'r', encoding='utf-8') as f:
    text_meta = json.load(f)

# Register the texts themselves first.
texts = [Text.json_decode(t) for t in text_meta]
result = connection.insert(texts)
print('Inserted {} texts.'.format(len(result.inserted_ids)))

# Tokenize and unitize each text, storing features, units and tokens as we go.
for text in texts:
    tessfile = TessFile(text.path, metadata=text)

    # Every text in this corpus is tokenized with the Greek tokenizer.
    tokenizer = GreekTokenizer(connection)
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)
    # Features are inserted and then updated, in that order.
    # NOTE(review): presumably insert stores new features and update refreshes the
    # frequencies of features that already exist -- confirm against tesserae.db.
    connection.insert(features)
    connection.update(features)

    # A fresh Unitizer is created for every text, as in the original cell.
    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    connection.insert(lines + phrases)

    connection.insert(tokens)

    print(text.title)

Inserted 4 texts.
ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ
New Testament
Historia Ecclesiastica
Metaphysics


# Linear Algebra Projections

Load frequencies. 

In [104]:
# Aggregation over the 'features' collection:
#   1. keep only the lemma features;
#   2. turn each feature's `frequencies` mapping into an array of key/value pairs;
#   3. unwind that array so every pair becomes its own record;
#   4. expose the pair as `text` (the key) and `freq` (the value) next to the feature _id.
lemma_freq_pipeline = [
    {'$match': {'feature': 'lemmata'}},
    {'$project': {'data': {'$objectToArray': '$frequencies'}}},
    {'$unwind': '$data'},
    {'$project': {'_id': 1, 'text': '$data.k', 'freq': '$data.v'}},
]
result = connection.aggregate('features', lemma_freq_pipeline, encode=False)

# Pivot the records into a table with one row per feature _id and one column per text;
# feature/text combinations that do not occur are filled with 0.
freq_records = pd.DataFrame.from_dict(list(result))
freqs = freq_records.pivot_table(
    values='freq',
    index=['_id'],
    columns=['text'],
    fill_value=0,
)

Load the labels. (Looking up the lemma labels one by one is slow; this should be implemented directly in the MongoDB query instead.)

In [105]:
# Disabled: per-feature lookup of the lemma labels (one query per row, too slow).
#lemma_labels=[]
#for idx in freqs.index:
#    lemma_labels.append(connection.find('features',_id=idx)[0].token)
#freqs.index = lemma_labels

# Replace the text-id column labels of `freqs` with the titles of the corresponding texts.
# Only labels that are still valid ObjectId strings are looked up. Re-running this cell,
# when the columns already hold titles, therefore leaves them unchanged instead of failing
# with bson.errors.InvalidId.
text_labels = []
for t in freqs.keys():
    if ObjectId.is_valid(t):
        text_labels.append(connection.find('texts', _id=ObjectId(t))[0].title)
    else:
        text_labels.append(t)
freqs.columns = text_labels

Make projections

In [107]:
import scripts.lap_v2_py3 as lap_v2
import numpy as np

# Text(s) to be projected; every other column of the frequency table forms the basis.
names = ['ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ']

# Rank-normalize each text's frequency column, passing the number of rows as `norm`.
freqs_norm = freqs.copy()
N = freqs.shape[0]
for item in freqs:
    freqs_norm[item] = lap_v2.rank_norm(np.asarray(freqs[item]), dist='normal', norm=N)

data = freqs_norm[names]
# Drop the target columns directly rather than transposing twice: same columns in the same
# order, without two extra copies of the table and without the dtype upcasting a transpose
# can cause.
basis = freqs_norm.drop(columns=names)

# `A` is returned by lap_v2.lap but not used in this cell.
[a, A, eta] = lap_v2.lap(basis.values, data.values, full_output=True)

# Label the raw arrays: `projections` has one row per basis text and one column per target
# text; `eta` (transposed) has one row per feature and one column per basis text.
projections = pd.DataFrame(data=a, index=basis.keys(), columns=names)
eta = pd.DataFrame(data=eta.T, index=data.index, columns=basis.keys())

In [108]:
# Display the projection table built above: one row per basis text, one column per target text.
projections

Unnamed: 0,ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ
New Testament,0.253213
Historia Ecclesiastica,0.075002
Metaphysics,0.197027


# Sequence Alignment

Get list of lemmas from unit

In [120]:
def get_lemma_seq(title,**kwargs):
    """Return the lemma sequence of the matching units of one text as a pandas Series.

    The text is looked up by title. Its id, together with any extra keyword filters,
    forms the ``$match`` stage of an aggregation over the ``units`` collection. Every
    token of the matching units contributes one entry: the index label is the token's
    ``display`` value and the value is the first element of its ``lemmata`` feature.

    Parameters
    ----------
    title : str
        Title of the text to query.
    **kwargs
        Extra equality filters on unit fields; the notebook uses ``tags``, ``index``
        and ``unit_type``.

    Returns
    -------
    pandas.Series
        Lemmas indexed by token display form. The result is always a Series: a unit
        with a single token yields a Series of length one (a bare ``squeeze()`` would
        collapse it to a scalar), and a query that matches no tokens yields an empty
        Series instead of raising ``KeyError: 'token'``.

    Raises
    ------
    IndexError
        If no text with this title exists (NOTE(review): assumes ``connection.find``
        returns a list-like -- confirm).
    """
    text = connection.find('texts',title = title)[0].id
    # Same key order as before: caller-supplied filters first, then the text id.
    criteria = dict(kwargs, text=text)
    result = connection.aggregate('units',[{'$match': criteria},
                                       {"$project": {"_id": 0, "tokens.display": 1, "tokens.features.lemmata": 1}},
                                       {"$unwind": "$tokens"},
                                       {"$project": {"token": '$tokens.display', "lemma" : { "$arrayElemAt" : ["$tokens.features.lemmata", 0]}}}],
                                  encode=False)
    # Materialize the result so emptiness can be checked before building the frame.
    records = list(result)
    if not records:
        return pd.Series(dtype=object, name='lemma', index=pd.Index([], name='token'))
    # Squeeze only the column axis so a single-row frame stays a Series.
    return pd.DataFrame.from_dict(records).set_index('token').squeeze(axis='columns')

A simple Smith-Waterman implementation, courtesy of ChatGPT. It is extremely inefficient and could be sped up significantly.

In [129]:
def smith_waterman(seq1_ser, seq2_ser, match=2, mismatch=-1, gap_penalty=-1, traceback = True):
    """Locally align two labelled sequences with the Smith-Waterman algorithm.

    The values of the two Series are compared for equality to score the alignment; the
    index labels are what gets reported in the aligned output.

    Parameters
    ----------
    seq1_ser, seq2_ser : pandas.Series
        Sequences to align. ``.values`` supplies the items that are compared and
        ``.keys()`` supplies the labels that are printed.
    match : number, default 2
        Score added when two items are equal.
    mismatch : number, default -1
        Score added when two items differ.
    gap_penalty : number, default -1
        Score added for skipping an item in either sequence.
    traceback : bool, default True
        If true, reconstruct and return the best local alignment as well as its score.

    Returns
    -------
    tuple of (str, str, number) or number
        With ``traceback=True``: the labels of the aligned stretch of each sequence,
        joined by single spaces with ``"-"`` marking a gap, followed by the best score.
        With ``traceback=False``: the best score only. Empty input gives a score of 0
        and empty alignment strings.
    """
    seq1 = list(seq1_ser.values)
    seq2 = list(seq2_ser.values)
    idx1 = list(seq1_ser.keys())
    idx2 = list(seq2_ser.keys())

    # Score matrix with an extra all-zero first row and column for the empty prefixes.
    rows, cols = len(seq1) + 1, len(seq2) + 1
    score_matrix = [[0] * cols for _ in range(rows)]

    # Best score seen so far and the cell where it first occurred.
    max_score = 0
    max_i, max_j = 0, 0

    for i in range(1, rows):
        prev_row, cur_row = score_matrix[i - 1], score_matrix[i]
        item1 = seq1[i - 1]
        for j in range(1, cols):
            step = match if item1 == seq2[j - 1] else mismatch
            # Scores are clamped at 0, which is what makes the alignment local.
            best = max(0,
                       prev_row[j - 1] + step,          # align the two items
                       prev_row[j] + gap_penalty,       # gap in sequence 2
                       cur_row[j - 1] + gap_penalty)    # gap in sequence 1
            cur_row[j] = best

            # Strict comparison keeps the first cell that reaches the maximum.
            if best > max_score:
                max_score = best
                max_i, max_j = i, j

    if not traceback:
        return max_score

    # Walk back from the best cell until the score drops to 0. Steps are collected in
    # reverse order and flipped once at the end, avoiding repeated front insertions.
    aligned_seq1, aligned_seq2 = [], []
    i, j = max_i, max_j
    while score_matrix[i][j] > 0:
        current = score_matrix[i][j]
        step = match if seq1[i - 1] == seq2[j - 1] else mismatch
        if current == score_matrix[i - 1][j - 1] + step:
            aligned_seq1.append(idx1[i - 1])
            aligned_seq2.append(idx2[j - 1])
            i -= 1
            j -= 1
        elif current == score_matrix[i - 1][j] + gap_penalty:
            aligned_seq1.append(idx1[i - 1])
            aligned_seq2.append("-")
            i -= 1
        else:
            aligned_seq1.append("-")
            aligned_seq2.append(idx2[j - 1])
            j -= 1
    aligned_seq1.reverse()
    aligned_seq2.reverse()

    # Labels are converted with str() so non-string index labels (e.g. a default integer
    # index) do not make the join fail.
    aligned_seq1 = ' '.join(map(str, aligned_seq1))
    aligned_seq2 = ' '.join(map(str, aligned_seq2))
    return aligned_seq1, aligned_seq2, max_score

# Example usage: align phrase 2.2 of the Quaestiones with John 5:17 from the New Testament.
seq1 = get_lemma_seq('ΠΡΟΣ ΘΑΛΑΣΣΙΟΝ', tags='2.2', unit_type='phrase')
seq2 = get_lemma_seq('New Testament', tags='John.5.17', unit_type='phrase')
aligned_seq1, aligned_seq2, score = smith_waterman(
    seq1, seq2, match=2, mismatch=-1, gap_penalty=-1
)
# Report the two aligned label strings and the score, one per line.
for caption, value in (
    ("Aligned Sequence 1:", aligned_seq1),
    ("Aligned Sequence 2:", aligned_seq2),
    ("Alignment Score:", score),
):
    print(caption, value)

Aligned Sequence 1: Πατήρ μου ἕως ἄρτι ἐργάζεται κἀγὼ ἐργάζομαι
Aligned Sequence 2: πατήρ μου ἕως ἄρτι ἐργάζεται κἀγὼ ἐργάζομαι
Alignment Score: 14


Next steps:
1. Extend the get_lemma_seq function to return the lemma sequences for the entire text at once, as a list of dicts of the form {token=['lorem','ipsum','tempus'], lemma=[1,4,2]}. The sequence alignment algorithm will also need to be modified to accept the new format.
2. Improve the efficiency of the sequence alignment.
3. Write a script to search an entire text for matches (maybe start by finding all pairs that have some minimal overlap of vocabulary).
4. Return more context around each match.

# Arc Diagram and Chord Diagram

Both of these can be easily constructed using networkx, based on a graph object (https://ericmjl.github.io/Network-Analysis-Made-Simple/01-introduction/03-viz/). 

I also discovered this library called "Bokeh," which can make the graphs interactive: https://docs.bokeh.org/en/latest/index.html

Next steps: 
1. Use a MongoDB query to get the list of all tags (or maybe title-tag pairs) to use as the nodes
2. Use the results of the previous section to return a list of tuples of two title-tag pairs, indicating all alignments over a certain score threshold
3. Make arc and chord plots
4. Interactive: use Bokeh to display the alignment when you hover over an edge