The goal of this notebook is to illustrate the possibilities of the tesserae package, in combination with various bioinformatics tools, for identifying intertextual references in Maximus the Confessor's _Quaestiones ad Thalassium_. 

I want to start with LAP (linear algebra projections) and BLAST. BLAST may be difficult, however, because the texts cannot be rendered directly in FASTA format: you would have to ignore the tokenization and run everything together, and even then the text would still need to be converted to Latin characters.

In [1]:
import json
from bson import ObjectId
import pandas as pd

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers.sparse_encoding import SparseMatrixSearch
from tesserae.utils.calculations import get_text_frequencies

# Open the connection to the local database server, using the 'tesstest'
# database. Nothing is deleted here; the collections are emptied in the
# following cell.
# NOTE(review): the two None arguments are presumably user and password -- confirm
# against the TessMongoConnection signature.
connection = TessMongoConnection('127.0.0.1', 27017, None, None, db='tesstest')

# Reset and load texts

In [2]:
# Empty every collection so the notebook starts from a clean database.
# The order matches the original sequence of delete_many calls.
for collection in ('features', 'frequencies', 'matches', 'texts', 'tokens', 'units'):
    connection.connection[collection].delete_many({})

# Load the text metadata. The encoding is given explicitly so that non-ASCII
# metadata is read the same way on every platform instead of depending on the
# locale default.
with open('data/maximus_comp_test.json', 'r', encoding='utf-8') as f:
    text_meta = json.load(f)

# Decode each metadata record into a Text entity and store them all at once.
texts = [Text.json_decode(t) for t in text_meta]
result = connection.insert(texts)
print('Inserted {} texts.'.format(len(result.inserted_ids)))

# Tokenize and unitize each text, inserting its features, tokens and units.
for text in texts:
    tessfile = TessFile(text.path, metadata=text)
    # NOTE(review): a new tokenizer and unitizer are built for every text, as in
    # the original cell; presumably they keep per-text state -- confirm before
    # hoisting either of them out of the loop.
    tokenizer = GreekTokenizer(connection)
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)
    result = connection.insert(features)
    result = connection.insert(tokens)

    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    result = connection.insert(lines + phrases)
    # Print the title as a progress indicator.
    print(text.title)

Inserted 4 texts.


# Linear Algebra Projections

In [15]:
# Build one frequency table for all texts in the database.
# NOTE(review): the recorded run of this cell failed with
# KeyError "None of ['_id'] are in the columns". The cause is not visible in
# this notebook; possibly the 'frequencies' collection, emptied in the reset
# cell, is never repopulated -- confirm.
texts = connection.find('texts')
freqs = pd.DataFrame(get_text_frequencies(connection, str(texts[0].id)))

# Inner-join the remaining texts, so only rows present for every text survive.
for text in texts[1:]:
    freqs = freqs.join(get_text_frequencies(connection, str(text.id)), how='inner')
freqs = freqs.fillna(value=0).astype('int64')

# Replace the feature ids in the index with the feature's token string.
# The lookups use `connection`, the only connection defined in this notebook
# and the one the features were inserted through; the previously referenced
# `connection_cb` is never defined and raised NameError on a fresh kernel.
lemma_labels = [connection.find('features', _id=idx)[0].token for idx in freqs.index]
freqs.index = lemma_labels

# Replace the text ids used as column names with the text titles.
text_labels = [connection.find('texts', _id=ObjectId(t))[0].title for t in freqs.keys()]
freqs.columns = text_labels

KeyError: "None of ['_id'] are in the columns"

In [None]:
# Display the frequency table built above (index: feature tokens, columns: text titles).
freqs

In [None]:
import scripts.lap_v2_py3 as lap_v2
import numpy as np

# Column label(s) of the text(s) to project onto the remaining texts.
# NOTE(review): 'meno' has to be one of the column labels of `freqs` (the text
# titles); confirm that the loaded corpus actually contains it.
names = ['meno']

# Normalise every column with lap_v2.rank_norm, passing the number of rows as `norm`.
N = freqs.shape[0]
freqs_norm = freqs.copy()
for title in freqs.columns:
    freqs_norm[title] = lap_v2.rank_norm(np.asarray(freqs[title]), dist='normal', norm=N)

# Split the normalised table: `data` holds the selected columns, `basis` all the others.
data = freqs_norm[names]
basis = freqs_norm.drop(columns=names)

# Run the projection; `A` is kept even though this cell does not use it.
a, A, eta = lap_v2.lap(basis.values, data.values, full_output=True)

# Wrap the raw arrays in labelled frames.
projections = pd.DataFrame(a, index=basis.columns, columns=names)
eta = pd.DataFrame(eta.T, index=data.index, columns=basis.columns)

# Multiple Sequence Alignment