In [12]:
import polars as pl

df = pl.read_parquet("./messengers_by_play_and_speaker_grouped_text.parquet")

In [27]:
sentences_df = df.with_columns(joined_text=pl.col("text").list.join(" ")).with_columns(
    sentences=pl.col("joined_text")
    .str.replace("·", ".")
    .str.strip_chars()
    .str.split(".")
)

In [28]:
from sentence_transformers import SentenceTransformer

model_name = "bowphs/SPhilBerta"
model = SentenceTransformer(model_name)

In [None]:
def embed(sentences):
    return model.encode(sentences)


embeddings_df = sentences_df.with_columns(
    embeddings=pl.col("sentences").map_batches(embed, return_dtype=pl.Array(pl.Float32, (768,))),
    row_id=pl.concat_str(
        pl.col("dramatist"), 
        pl.col("title"), 
        pl.col("speaker"), 
        pl.col("n").list.first(),
        separator=" "
    )
)

In [81]:
similarities = model.similarity(
    embeddings_df["embeddings"].to_numpy(), embeddings_df["embeddings"].to_numpy()
)

for row_id, similarity in enumerate(similarities):
    print(embeddings_df.row(row_id, named=True)['row_id'])

    print("\t\t has similarities with:\n")

    for row_id2, score in enumerate(similarity):
        print(f"\t\t\t\t{embeddings_df.row(row_id2, named=True)['row_id']} - {score}")

Sophocles Ajax Ἄγγελος 719
		 has similarities with:

				Sophocles Ajax Ἄγγελος 719 - 0.9999995827674866
				Sophocles Antigone Ἐξάγγελος 1278 - 0.5562293529510498
				Sophocles Trachiniae Ἄγγελος 180 - 0.6000794768333435
				Euripides Rhesus Ἄγγελος 284 - 0.42008039355278015
				Euripides Medea Ἄγγελος 1136 - 0.591525137424469
				Euripides Bacchae Ἄγγελος 677 - 0.6466456651687622
				Euripides Hippolytus Ἄγγελος 1173 - 0.6978393197059631
				Euripides Ion Θεράπων 1123 - 0.678095281124115
				Aeschylus Agamemnon Κῆρυξ 503 - 0.6042507290840149
				Euripides Heracleidae Θεράπων 799 - 0.7570751905441284
				Euripides Rhesus Ἡνίοχος 756 - 0.4532187283039093
				Sophocles Antigone Ἄγγελος 1155 - 0.5517498254776001
				Sophocles Trachiniae Ἡρακλῆς 1046 - 0.631926953792572
				Euripides Orestes Ἄγγελος 866 - 0.6436662673950195
				Sophocles Trachiniae Λίχας 248 - 0.5587062239646912
				Sophocles Electra Παιδαγωγός 680 - 0.6853230595588684
				Euripides Andromache Ἄγγελος 1085 - 0.6131355166