In [62]:
%pip install scikit-learn matplotlib
%pip install grecy 
%run -m grecy install grc_perseus_trf

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid, fd = os.forkpty()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/pletcher/code/writing/articles/2024-11-28_tragedy-dfs/.venv/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/pletcher/code/writing/articles/2024-11-28_tragedy-dfs/.venv/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

Installing grc_perseus_trf.....

Please wait, this could take some minutes.....

Collecting grc-perseus-trf==any
Downloading https://huggingface.co/Jacobo/grc_perseus_trf/resolve/main/grc_perseus_trf-any

In [5]:
import json

import polars as pl

df = pl.read_parquet("./greek-tragedy-by-line_with-gender.parquet")

In [15]:
with open("./messenger_speeches.json") as f:
    speeches = json.load(f)

messenger_pre_df = []

for dramatist, plays in speeches.items():
    for play in plays:
        total_lines = play['total_lines']
        total_messenger_lines = 0

        title = play['title']

        for line_pair in play['speeches']:
            total_messenger_lines += line_pair[1] - line_pair[0]

            for line in range(line_pair[0], line_pair[1] + 1):
                print(dramatist, title, line)

                ref = df.filter(
                    pl.col("dramatist") == dramatist, 
                    pl.col("title") == title,
                    pl.col("n") == str(line)
                )

                if not ref['text'].is_empty():
                    line_obj = dict(
                        dramatist = dramatist,
                        title=play['title'],
                        n=str(line),
                        speaker=ref.select("speaker").item(),
                        text=ref.select("text").item()
                    )

                messenger_pre_df.append(line_obj)

        # print(f"{play['title']}: messenger_lines: {total_messenger_lines}; total_lines: {total_lines}; pct_messenger_lines: {(total_messenger_lines / total_lines) * 100}")

messenger_df = pl.DataFrame(messenger_pre_df)

Aeschylus Agamemnon 503
Aeschylus Agamemnon 504
Aeschylus Agamemnon 505
Aeschylus Agamemnon 506
Aeschylus Agamemnon 507
Aeschylus Agamemnon 508
Aeschylus Agamemnon 509
Aeschylus Agamemnon 510
Aeschylus Agamemnon 511
Aeschylus Agamemnon 512
Aeschylus Agamemnon 513
Aeschylus Agamemnon 514
Aeschylus Agamemnon 515
Aeschylus Agamemnon 516
Aeschylus Agamemnon 517
Aeschylus Agamemnon 518
Aeschylus Agamemnon 519
Aeschylus Agamemnon 520
Aeschylus Agamemnon 521
Aeschylus Agamemnon 522
Aeschylus Agamemnon 523
Aeschylus Agamemnon 524
Aeschylus Agamemnon 525
Aeschylus Agamemnon 526
Aeschylus Agamemnon 527
Aeschylus Agamemnon 528
Aeschylus Agamemnon 529
Aeschylus Agamemnon 530
Aeschylus Agamemnon 531
Aeschylus Agamemnon 532
Aeschylus Agamemnon 533
Aeschylus Agamemnon 534
Aeschylus Agamemnon 535
Aeschylus Agamemnon 536
Aeschylus Agamemnon 537
Aeschylus Agamemnon 551
Aeschylus Agamemnon 552
Aeschylus Agamemnon 553
Aeschylus Agamemnon 554
Aeschylus Agamemnon 555
Aeschylus Agamemnon 556
Aeschylus Agamem

In [16]:
by_play_and_speaker = messenger_df.group_by(
    pl.col("dramatist"), 
    pl.col("title"), 
    pl.col("speaker")
).agg(pl.col("n"), pl.col("text"))

In [37]:
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
import spacy

nlp = spacy.load('grc_perseus_trf')

In [66]:
data = by_play_and_speaker.select(pl.col('text').list.join(" "))
raw_samples = data['text'].to_list()

data_samples = []

STOPS = ["δέ", "τε", "ἀλλ", "ἀλλά", "οὔτε"]

with nlp.select_pipes(enable="lemmatizer"):
    for row in raw_samples:
        doc = nlp(row)
        lemmata = [t.lemma_ for t in doc]

        data_samples.append(' '.join([t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in STOPS]))

In [None]:

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)

# Fit the NMF model
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (Frobenius norm)",
)

# Fit the MiniBatchNMF model
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
    "batch_size=%d..." % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    mbnmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
)

print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")