In [None]:
# Install the third-party packages into the kernel's environment (%pip targets the running kernel).
%pip install scikit-learn matplotlib
%pip install ipywidgets
%pip install grecy 
# Run grecy's installer to fetch the grc_perseus_trf model that spacy.load() is given later on.
%run -m grecy install grc_perseus_trf

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid, fd = os.forkpty()


Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.4/214.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected p

In [6]:
import polars as pl

# Source tables (presumably one row per line of verse, judging by the file names).
homer_df = pl.read_parquet("./homer.parquet")
tragedy_df = pl.read_parquet("./greek-tragedy-by-line_with-gender.parquet")

# One row per (title, book_n): gather the group's "text" values into a list,
# then flatten that list into a single space-separated string.
by_epic_and_book = (
    homer_df
    .group_by("title", "book_n")
    .agg(pl.col("text"))
    .with_columns(pl.col("text").list.join(" "))
)

# One row per (dramatist, title, speaker): "n" stays a list per group,
# while the group's "text" values are merged into one string.
by_play_and_speaker = (
    tragedy_df
    .group_by("dramatist", "title", "speaker")
    .agg(pl.col("n"), pl.col("text"))
    .with_columns(pl.col("text").list.join(" "))
)

In [7]:
import spacy

# Load the grc_perseus_trf spaCy pipeline; the lemmatisation cells call `nlp(text)` and read each token's lemma_.
nlp = spacy.load('grc_perseus_trf')

In [None]:
# Extra lemmata to discard in addition to tokens the pipeline flags via is_stop.
STOPS = ["δέ", "τε", "ἀλλ", "ἀλλά", "οὔτε"]

# title -> book_n -> space-joined lemmata for that book.
homeric_lemmata = {"Iliad": {}, "Odyssey": {}}

for record in by_epic_and_book.iter_rows(named=True):
    parsed = nlp(record['text'].strip())

    kept = []
    for token in parsed:
        # Skip pipeline stop words and the extra particles listed in STOPS.
        if token.is_stop or token.lemma_ in STOPS:
            continue
        kept.append(token.lemma_)

    homeric_lemmata[record['title']][record['book_n']] = " ".join(kept)


  with torch.cuda.amp.autocast(self._mixed_precision):


In [22]:
from pathlib import Path

# Write each book's lemmatised text to ./how_epic_is_it/Homer-<epic>/<book>.txt.
# Reads `homeric_lemmata`, the {title: {book_n: lemmata}} dict built by the
# lemmatisation cell.
for epic in ("Iliad", "Odyssey"):
    epic_dir = Path("./how_epic_is_it", f"Homer-{epic}")
    # Create the output folder (and its parent) so the cell works on a fresh checkout.
    epic_dir.mkdir(parents=True, exist_ok=True)

    for book, text in homeric_lemmata[epic].items():
        # Explicit UTF-8: the text is Greek, and the platform default encoding
        # is not guaranteed to be able to represent it.
        (epic_dir / f"{book}.txt").write_text(text, encoding="utf-8")

In [24]:
# dramatist -> title -> speaker -> space-joined lemmata of that speaker's text.
tragic_lemmata = {}

for record in by_play_and_speaker.iter_rows(named=True):
    parsed = nlp(record['text'].strip())

    # Keep lemmata that are neither pipeline stop words nor listed in STOPS.
    kept = [tok.lemma_ for tok in parsed if not (tok.is_stop or tok.lemma_ in STOPS)]

    # Create the nested dicts on first sight of a dramatist / title.
    plays = tragic_lemmata.setdefault(record['dramatist'], {})
    speakers_of_play = plays.setdefault(record['title'], {})
    speakers_of_play[record['speaker']] = " ".join(kept)


  with torch.cuda.amp.autocast(self._mixed_precision):


In [26]:
from pathlib import Path

# Write one file per speaker: ./how_epic_is_it/<dramatist>/<title>/<speaker>.txt
for dramatist, titles in tragic_lemmata.items():
    dramatist_dir = Path("./how_epic_is_it", dramatist)

    for title, speakers in titles.items():
        title_dir = dramatist_dir / title
        # parents=True also creates ./how_epic_is_it and the dramatist folder;
        # exist_ok=True makes re-running the cell safe.
        title_dir.mkdir(parents=True, exist_ok=True)

        for speaker, text in speakers.items():
            speaker_file = title_dir / f"{speaker}.txt"
            # Explicit UTF-8: the text is Greek, and the platform default
            # encoding is not guaranteed to be able to represent it.
            speaker_file.write_text(text, encoding="utf-8")