In [1]:
import polars as pl

df = pl.read_parquet("./messengers_by_play_and_speaker_grouped_text.parquet")

In [2]:
sentences_df = df.with_columns(joined_text=pl.col("text").list.join(" ")).with_columns(
    sentences=pl.col("joined_text")
    .str.replace("Â·", ".")
    .str.strip_chars()
    .str.split(".")
)

In [3]:
from sentence_transformers import SentenceTransformer

model_name = "bowphs/SPhilBerta"
model = SentenceTransformer(model_name)

In [4]:
def embed(sentences):
    return model.encode(sentences)

embeddings_df = sentences_df.with_columns(
    embeddings=pl.col("sentences").map_batches(embed, return_dtype=pl.Array(pl.Float32, (768,))),
    row_id=pl.concat_str(
        pl.col("dramatist"), 
        pl.col("title"), 
        pl.col("speaker"), 
        pl.col("n").list.first(),
        separator=" "
    )
).sort(pl.col("year"), pl.col("n").list.first())

In [5]:
messenger_embeddings = []

for row in sentences_df.iter_rows(named=True):
    sentences = [s for s in row["sentences"] if s.strip() != ""]

    try:
        embeddings = model.encode(sentences)
        messenger_embeddings.append(
            {"id": f"{row['dramatist']} {row['title']} {row['speaker']}", "embeddings": embeddings.tolist(), "year": row['year']}
        )
    except IndexError:
        print(f"Index error on {row['dramatist']} {row['title']}")
        continue

In [6]:
similarities = model.similarity(
    embeddings_df["embeddings"].to_numpy(), embeddings_df["embeddings"].to_numpy()
)

def to_comparison_dataframe():
    rows = {}

    for row_idx, similarity in enumerate(similarities):
        row = embeddings_df.row(row_idx, named=True)

        rows[row['row_id']] = similarity

    comparisons = pl.DataFrame(rows)

    return comparisons

def to_plotable_comparison():
    raw: dict[str, list[str | float]] = {'speech_1': [], 'speech_2': [], 'similarity': []}

    for row_idx, similarity in enumerate(similarities):
        row = embeddings_df.row(row_idx, named=True)

        for row_idx2, score in enumerate(similarity):
            corresp_row = embeddings_df.row(row_idx2, named=True)

            raw['speech_1'].append(row['row_id'])
            raw['speech_2'].append(corresp_row['row_id'])
            raw['similarity'].append(float(score))

    return pl.DataFrame(raw)


In [9]:
import altair as alt

plotable = to_plotable_comparison()

alt.Chart(plotable).mark_rect().encode(
    x=alt.X("speech_2:N", sort=None),
    y=alt.Y("speech_1:N", sort=None),
    color=alt.Color("similarity:Q").scale(scheme="yelloworangebrown", domain=[0.3, 1]),
    tooltip=["speech_1:N", "speech_2:N", "similarity:Q"],
).properties(
    title="Messenger speech similarity"
)

In [None]:
alt.Chart(plotable).mark_rect().encode(
    x=alt.X("speech_2:N"),
    y=alt.Y("speech_1:N"),
    color="similarity:Q",
    tooltip=["speech_1:N", "speech_2:N", "similarity:Q"],
).properties(
    title="Messenger speech similarity, alphabetical order"
)

In [None]:
from scipy.stats import median_abs_deviation, quantile_test

quantile_test(plotable['similarity'], q=0.5, p=0.5) # pyright: ignore

QuantileTestResult(statistic=np.int64(124), statistic_type=1, pvalue=np.float64(3.0002331794934075e-233))

In [67]:
median_abs_deviation(plotable['similarity'])

np.float64(0.06188809871673584)

## Approach 2 (mirrors approach in tragedy-embeddings.ipynb)

In [2]:
import json

import numpy as np
import polars as pl
import torch

from sentence_transformers import SentenceTransformer

model_name = "bowphs/SPhilBerta"
model = SentenceTransformer(model_name)

model.max_seq_length = 256

df = pl.read_parquet("./messengers_by_play_and_speaker_grouped_text.parquet")

In [3]:
sentences_df = df.with_columns(joined_text=pl.col("text").list.join(" ")).with_columns(
    sentences=pl.col("joined_text")
    .str.replace("Â·", ".")
    .str.replace(";", ".")
    .str.strip_chars()
    .str.split(".")
)

In [None]:
from pathlib import Path

def calculate_embeddings():
    embeddings_file = Path("./messenger-embeddings.json")

    # if embeddings_file.exists():
    #     with open(embeddings_file) as f:
    #         return json.load(f)

    messenger_embeddings = []

    for row in sentences_df.iter_rows(named=True):
        sentences = [s for s in row["sentences"] if s.strip() != ""]

        try:
            embeddings = model.encode(sentences)
            messenger_embeddings.append(
                {
                    "id": f"{row['dramatist']} {row['title']} {row['speaker']}",
                    "embeddings": embeddings.tolist(),
                    "year": row["year"],
                    "n": row["n"][0]
                }
            )
        except IndexError:
            print(f"Index error on {row['dramatist']} {row['title']}")
            continue

    with open(embeddings_file, "w") as f:
        json.dump(messenger_embeddings, f, ensure_ascii=False)

    return messenger_embeddings


messenger_embeddings = calculate_embeddings()

all_embeddings = []

for m in messenger_embeddings:
    all_embeddings.append(
        {
            "id": m["id"],
            "embeddings": np.asarray(m["embeddings"], dtype=np.float32),
            "year": m["year"],
            "n": m["n"]
        }
    )

In [None]:
comparisons = []

sorted_embeddings = sorted(all_embeddings, key=lambda x: (x["year"], x["n"]))

for emb in sorted_embeddings:
    for i, _ in enumerate(sorted_embeddings):
        comparisons.append([emb, sorted_embeddings[i]])

In [None]:
similarities: dict[str, list[str | float]] = {
    "speaker_1": [],
    "speaker_2": [],
    "similarity": [],
}

for comparison in comparisons:
    similarity_by_sentence = model.similarity(
        comparison[0]["embeddings"], comparison[1]["embeddings"]
    )

    max_similarities = [row.max() for row in similarity_by_sentence]
    mean_similarity = torch.Tensor(max_similarities).mean()

    similarities["speaker_1"].append(comparison[0]["id"])
    similarities["speaker_2"].append(comparison[1]["id"])
    similarities["similarity"].append(float(mean_similarity))

[1.0000001192092896, 0.7244861721992493, 0.7377272248268127, 0.6755437254905701, 0.675291121006012, 0.6838781237602234, 0.7033378481864929, 0.6642679572105408, 0.6958175301551819, 0.7030391693115234, 0.7126079797744751, 0.7085608839988708, 0.7094844579696655, 0.7070907950401306, 0.7042253017425537, 0.6339894533157349, 0.6654672622680664, 0.7268171906471252, 0.7046075463294983, 0.7399986386299133, 0.7113198041915894, 0.7261976599693298, 0.7240222692489624, 0.7047412395477295, 0.7194429039955139, 0.7122161388397217, 0.7225752472877502, 0.7415407299995422, 0.7354838848114014, 0.7489195466041565, 0.7568303942680359, 0.7060269713401794, 0.7003573775291443, 0.7312019467353821, 0.7378730177879333, 0.7278789281845093, 0.71335369348526, 0.7211703658103943, 0.9999998807907104, 0.741000235080719, 0.7057098150253296, 0.7010806798934937, 0.7290682196617126, 0.7109978795051575, 0.6865146160125732, 0.7172204256057739, 0.7004615664482117, 0.7383097410202026, 0.7162948250770569, 0.7133790254592896, 0.7

In [14]:
similarities_df = pl.DataFrame(similarities)

import altair as alt

alt.Chart(similarities_df).mark_rect().encode(
    x=alt.X("speaker_2:N", sort=None),
    y=alt.Y("speaker_1:N", sort=None),
    color=alt.Color("similarity:Q").scale(scheme="yelloworangebrown"),
    tooltip=["speaker_1:N", "speaker_2:N", "similarity:Q"],
).properties(title="Messenger speeches similarity, sorted by approximate year")