This notebook assumes that the texts of interest have already been converted to .tess files, with their metadata recorded in a .json file. Here, we will tokenize, lemmatize, and unitize the texts, and then create vector embeddings and draft translations.

In [1]:
import json

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Text
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer

# Shared database handle used by every cell below: MongoDB at 127.0.0.1:27017, database 'maximus'.
# NOTE(review): the two None arguments are presumably user and password — confirm against TessMongoConnection.
connection = TessMongoConnection('127.0.0.1', 27017, None, None, db='maximus')

# Import into database

In [15]:
# Load the metadata records for the Maximus texts and for their sources.
# The encoding is given explicitly so the files are read the same way on
# every platform.
with open('data/maximus_texts.json', 'r', encoding='utf-8') as f:
    meta_maximus = json.load(f)
with open('data/maximus_sources.json', 'r', encoding='utf-8') as f:
    meta_sources = json.load(f)

# Texts: keep only records with no existing entry for the same title and
# author, so that re-running this cell does not insert duplicates.
texts = []
for t in meta_maximus + meta_sources:
    if len(connection.find('texts', title=t['title'], author=t['author'])) == 0:
        texts.append(Text.json_decode(t))

# Only touch the database when there is something new to insert.
if texts:
    result = connection.insert(texts)
    print('Inserted {} texts.'.format(len(result.inserted_ids)))

# Tokenizer class for each supported language. A fresh tokenizer is still
# built for every text, as before.
tokenizer_classes = {'greek': GreekTokenizer, 'latin': LatinTokenizer}

# Number of tokens sent to the database per insert call.
token_chunk = 100000

for text in texts:
    tokenizer_class = tokenizer_classes.get(text.language)
    if tokenizer_class is None:
        # Skip this text. Previously execution continued past this message
        # and either failed on an undefined `tokenizer` or silently reused
        # the tokenizer left over from the previous text.
        print('language not recognized')
        continue
    tokenizer = tokenizer_class(connection)
    tessfile = TessFile(text.path, metadata=text)

    #Features
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)
    # NOTE(review): features are passed to both insert and update, as in the
    # original; presumably new features are inserted and existing ones
    # refreshed — confirm against TessMongoConnection.
    result = connection.insert(features)
    result = connection.update(features)

    #Units
    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    result = connection.insert(lines + phrases)

    #Tokens
    # Insert in slices. Stepping through range() by the chunk size never
    # yields an empty trailing slice, unlike counting len // chunk + 1 rounds.
    for start in range(0, len(tokens), token_chunk):
        result = connection.insert(tokens[start:start + token_chunk])
    print(text.title)

# Create Embeddings

In [12]:
from tesserae.db.entities import Vector
import pandas as pd
from openai import OpenAI
import time

# Read the OpenAI API key from a local file so that it is not stored in the
# notebook. Surrounding whitespace is stripped because a trailing newline in
# the file would otherwise become part of the key.
with open('api_key.txt','r') as f:
    api_key = f.read().strip()

def embed_text(text_id,unit_type='line',model="text-embedding-3-small",chunk=1000,min_length=16):
    """Embed the units of one text and store the resulting vectors.

    Fetches the units of ``text_id`` with the given ``unit_type`` from the
    'units' collection, drops short snippets, requests embeddings from the
    OpenAI API in batches, and inserts one ``Vector`` per embedded unit.

    Parameters
    ----------
    text_id
        Identifier of the text whose units are embedded (matched against the
        ``text`` field of the units).
    unit_type : str
        Unit type to embed, e.g. 'line' or 'phrase'.
    model : str
        Name of the OpenAI embedding model.
    chunk : int
        Maximum number of snippets sent per embeddings request.
    min_length : int
        Only snippets strictly longer than this many characters are embedded.

    Returns
    -------
    The result of the last ``connection.insert`` call, or ``None`` when no
    unit qualified and nothing was inserted.
    """
    text_df = pd.DataFrame(connection.aggregate('units',[{'$match': {'text': text_id, 'unit_type': unit_type}},
                                               {"$project": {"_id": 0, "index": 1, "tags": 1, "unit_type": 1, "snippet": 1}}],
                                              encode=False))
    # With no matching units the frame has no 'snippet' column at all, so
    # return before indexing it.
    if text_df.empty:
        return None

    lengths = text_df['snippet'].str.len()
    text_df = text_df.loc[lengths>min_length]
    client = OpenAI(api_key=api_key)

    result = None
    # Stepping by the chunk size never produces an empty batch, so no request
    # with an empty input list is sent when the number of units is zero or an
    # exact multiple of the chunk size.
    for start in range(0, len(text_df), chunk):
        text_chunk = text_df.iloc[start:start+chunk]
        embeddings = client.embeddings.create(input = list(text_chunk['snippet'].values), model=model).data
        # Per-unit metadata (every column except the snippet). The expression
        # is kept exactly as in the original so the values handed to Vector
        # are unchanged; it is only computed once per batch instead of once
        # per row.
        unit_meta = text_chunk.T.drop('snippet').T
        result = connection.insert([
            Vector(text=text_id,model=model,vector=embeddings[n].embedding,**unit_meta.iloc[n])
            for n in range(len(embeddings))
        ])

    return result

In [13]:
# Create line and phrase embeddings for every text in the database. A
# (text, unit type) pair that already has vectors is skipped, so the cell can
# be re-run without requesting the same embeddings again.
for text in connection.find('texts'):
    for unit_type in ('line', 'phrase'):
        if len(connection.find('vectors',text=text.id,unit_type=unit_type)) == 0:
            embed_text(text.id,unit_type=unit_type)
            print(text.title+' '+unit_type)

Old Testament line
Old Testament phrase
Isagoge line
Isagoge phrase
Categories line
Categories phrase
