In [25]:
# Copyright 2025 Raphael Yana
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load variables via python-dotenv before inspecting the environment.
load_dotenv()

# Required environment variables, mapped to the service name shown in the
# error message when one is missing.
ENV_VARS = {"GOOGLE_KEY": "Google Gemini"}

# Fail fast on the first required variable that is unset or empty.
for var, name in ENV_VARS.items():
    if os.getenv(var):
        continue
    raise ValueError(f"Missing {name} API key: `{var}` must be set in the environment.")

# Direct indexing is safe here: the loop above only completes when GOOGLE_KEY
# is present and non-empty.
genai.configure(api_key=os.environ["GOOGLE_KEY"])

In [27]:
import pathlib
from pipeline import BookProcessingPipeline


# Per-book settings consumed by the processing loop below.
# NOTE(review): "cleaned_txt" is not read anywhere in this cell, and the run
# log for Republic reports the cleaned text saved to data/republic/cleaned.txt.
# Confirm whether this key is still needed.
books = [
    dict(
        raw_txt="data/republic.txt",
        cleaned_txt="data/cleaned_republic.txt",
        book_name="Republic",
        target_speaker="Socrates",
        is_narrator=True,
    ),
    dict(
        raw_txt="data/symposium.txt",
        cleaned_txt="data/cleaned_symposium.txt",
        book_name="Symposium",
        target_speaker="Socrates",
        is_narrator=False,
    ),
    dict(
        raw_txt="data/hamlet.txt",
        cleaned_txt="data/cleaned_hamlet.txt",
        book_name="Hamlet",
        target_speaker="Hamlet",
        is_narrator=False,
    ),
]

# Run the pipeline once per book, in list order. `config` and `pipeline` keep
# referring to the last processed book after the loop finishes.
for config in books:
    print(f"\n === Processing {config['book_name']}...")

    pipeline = BookProcessingPipeline(
        # The raw text location is handed over as a Path object.
        raw_txt_path=pathlib.Path(config["raw_txt"]),
        # The remaining settings are forwarded unchanged under the same names.
        **{field: config[field] for field in ("book_name", "target_speaker", "is_narrator")},
    )
    pipeline.run()


 === Processing Republic...
→ Filtering boundaries for Republic …

–– DETECTED TABLE OF CONTENTS ––

INTRODUCTION AND ANALYSIS.
 THE REPUBLIC.
 PERSONS OF THE DIALOGUE.
 BOOK I.
 BOOK II.
 BOOK III.
 BOOK IV.
 BOOK V.
 BOOK VI.
 BOOK VII.
 BOOK VIII.
 BOOK IX.
 BOOK X.

✓ LLM picked narrative start at byte 553683
Structure profile written → utils/struct_profile_republic.json
  level_0: 1 headings (showing 3) → ['INTRODUCTION AND ANALYSIS.']
  level_1: 12 headings (showing 3) → ['THE REPUBLIC.', 'PERSONS OF THE DIALOGUE.', 'BOOK I.']
Cleaned text saved to data/republic/cleaned.txt
→ Splitting into structured paragraphs …
Detected TOC file found, using it to build structure profile.
Saved 4359 structured paragraphs → data/republic/paragraphs.jsonl
→ Classifying paragraphs …
=== Avg words per paragraph: 27.0, raw chunk size: 74, bounded to: 50
→ Classifying chunk 1/88: paragraphs 0–49
→ Classifying chunk 2/88: paragraphs 50–99
→ Classifying chunk 3/88: paragraphs 100–149
→ Classifying ch