In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from sagemaker.drift_check_baselines import DriftCheckBaselines
from sagemaker.workflow.model_step import ModelStep
from sagemaker.s3 import S3Uploader, S3Downloader
from sagemaker.model_metrics import ModelMetrics
from sagemaker.inputs import CreateModelInput
from sagemaker import ModelPackage, Session
from botocore.exceptions import ClientError
from sagemaker.model import Model
from pathlib import Path

import sagemaker
import logging
import torch
import boto3
import json
import re
import os

2025-04-13 06:09:43.106313: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 06:09:43.120230: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 06:09:43.138844: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 06:09:43.144645: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-13 06:09:43.157792: I tensorflow/core/platform/cpu_feature_guar

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Set up the SageMaker session, the execution role and the AWS service clients
# used by the rest of the notebook.

# SageMaker-level handles
role = sagemaker.get_execution_role()  # IAM role
sagemaker_session = Session()

# Plain boto3 handles
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')
region = boto3.Session().region_name

In [3]:
# Directory holding the inputs/outputs for one book. Its last component
# (`book_path.name`) is reused below as the prefix of every per-book file name
# and as the S3 key prefix for the upload.
book_path = Path('book-text-info/frankenstein')

In [4]:
# Load the speaker-tagged book text.
#   * 'alice' / 'frank' books are read as ONE string (they are regrouped into
#     paragraphs by the next cell),
#   * any other book is read as a list of lines.
book_text = book_path.joinpath(f'{book_path.name}-book-audio-text.txt')

# The text is matched against typographic quotes (“ ” ’) further down, so decode
# it explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(book_text, encoding='utf-8') as f:
    if 'alice' in book_path.name or 'frank' in book_path.name:
        lines = f.read()
    else:
        lines = f.readlines()

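If the book was read as a single string, merge its hard-wrapped lines into one string per paragraph (paragraphs are separated by blank lines).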

In [5]:
# Regroup a book that was read as one big string into paragraphs.
# A paragraph is a run of non-blank lines; each one becomes a single string of
# the stripped lines joined by a space, with one trailing space (same layout
# as before). `lines` is left untouched when it is already a list.
if isinstance(lines, str):
    paragraphs = []
    current = []  # stripped lines of the paragraph being collected
    for raw_line in lines.split('\n'):
        text = raw_line.strip()
        if text:
            current.append(text)
        elif current:
            # A blank line closes the paragraph; repeated blank lines no
            # longer produce empty paragraphs.
            paragraphs.append(' '.join(current) + ' ')
            current = []
    if current:
        # Flush the final paragraph: previously it was dropped whenever the
        # text did not end with a blank line.
        paragraphs.append(' '.join(current) + ' ')
    lines = paragraphs

In [6]:
# Fetch the list of usable TTS voices from S3 into the working directory.
# The call's return value (the local file paths) is displayed as the cell output.
voice_bucket = 's3://sound-scribe-acessories/working-voices.json'
downloader = S3Downloader()
downloader.download(s3_uri=voice_bucket, local_path='.')

['./working-voices.json']

In [7]:
# Load the per-speaker quotes: a JSON object keyed by speaker name.
quote_path = book_path.joinpath(f'{book_path.name}-person-quotes.json')
# JSON is UTF-8 by specification; decode it as such rather than with the
# platform default so non-ASCII characters (e.g. “ ” ’) survive intact.
with open(quote_path, encoding='utf-8') as f:
    person_quotes = json.load(f)

In [8]:
# Assign one TTS voice to every speaker found in `person_quotes`.
with open('working-voices.json', encoding='utf-8') as f:
    voice_config = json.load(f)

narrator = "en-US-Polyglot-1"

# The narrator voice is kept out of the pool handed to speakers. Filtering
# (instead of list.remove) does not fail when the narrator is absent from the
# downloaded list, and the raw JSON object keeps its own name.
voices = [voice for voice in voice_config['voices'] if voice != narrator]
if person_quotes and not voices:
    raise ValueError('working-voices.json contains no voice other than the narrator')

# Voices are handed out round-robin in the order the speakers appear, so they
# are reused once there are more speakers than voices.
voice_name = {name: voices[i % len(voices)] for i, name in enumerate(person_quotes)}

voice_name

{'Unknown': 'en-US-Neural2-A',
 'I': 'en-US-Neural2-C',
 'Elizabeth': 'en-US-Neural2-D',
 'our': 'en-US-Neural2-E',
 'Clerval': 'en-US-Neural2-F',
 'the professor': 'en-US-Neural2-G',
 'M. Krempe': 'en-US-Neural2-H',
 'This professor': 'en-US-Neural2-I',
 'M. Waldman': 'en-US-Neural2-J',
 'Coleridge': 'en-US-Standard-A',
 'Ernest': 'en-US-Standard-B',
 'Justine': 'en-US-Standard-E',
 'Alas': 'en-US-Standard-J',
 'him': 'en-US-Studio-O',
 'Wordsworth': 'en-US-Studio-Q',
 'an old woman who was sleeping in a chair beside me': 'en-GB-Standard-A',
 'my enemy': 'en-GB-Standard-B',
 'Frankenstein': 'en-GB-Standard-C',
 'Project Gutenberg': 'en-GB-Standard-D',
 'PGLAF': 'en-GB-Standard-F'}

In [9]:
# Rewrite the book text as SSML: every `</Name>` speaker tag and the quote that
# follows it are replaced by a <voice> element using the voice assigned to that
# speaker. The whole document is wrapped in <speak> ... </speak>.
tag_pattern = re.compile(r"</[a-zA-Z ’]+>")
quote_pattern = re.compile(r"“[’a-zA-Z \[\],?!.\—:\(\)_”]*")


def _voice_tag_line(line, quotes_by_name, voice_by_name, used_counts):
    """Return `line` with each speaker tag and its quote turned into a <voice> element.

    Positions come from the regex match objects, so repeated tags/quotes and
    several pairs on one line are located correctly and no text is skipped.

    Args:
        line: One entry of the book text containing at least one `</Name>` tag.
        quotes_by_name: Mapping of speaker name to that speaker's stored quotes.
        voice_by_name: Mapping of speaker name to a TTS voice name.
        used_counts: Mapping of speaker name to the number of quotes already
            consumed for that speaker; updated in place.

    Returns:
        The rewritten line. No newline is appended.
    """
    tags = list(tag_pattern.finditer(line))
    pieces = []
    cursor = 0  # index of the first character of `line` not yet emitted
    for pos, tag in enumerate(tags):
        # Narration in front of the tag is copied verbatim; the tag itself is
        # markup and never reaches the output.
        pieces.append(line[cursor:tag.start()])
        cursor = tag.end()

        # A tag owns the first quote found between itself and the next tag.
        search_end = tags[pos + 1].start() if pos + 1 < len(tags) else len(line)
        quote = quote_pattern.search(line, cursor, search_end)
        if quote is None:
            continue
        # Text between the tag and its quote is copied verbatim as well.
        pieces.append(line[cursor:quote.start()])
        cursor = quote.end()

        spoken = quote.group()
        # Extract the name from the </name>
        name = tag.group()[2:-1]
        # Index of the next unused stored quote for this speaker
        idx = used_counts.get(name, 0)
        used_counts[name] = idx + 1

        tts_voice = voice_by_name.get(name)
        stored_quotes = quotes_by_name.get(name, [])
        if tts_voice is None:
            # Unknown speaker: keep the text instead of failing with KeyError.
            print(f'No voice assigned to {name!r}; quote left untagged')
            pieces.append(spoken)
        elif idx < len(stored_quotes):
            # Surround the stored quote in the proper voice tags, then
            # substitute the text matched in the line for it (same composition
            # as before; the callable avoids replacement-template escaping).
            voice_quote = f'<voice name="{tts_voice}">{stored_quotes[idx]}</voice>'
            pieces.append(quote_pattern.sub(lambda _match: spoken, voice_quote))
        else:
            # More tags than stored quotes: previously the quote text vanished
            # from the output; now it is kept, spoken by the speaker's voice.
            print(f'No stored quote #{idx} for {name!r}; using the text as-is')
            pieces.append(f'<voice name="{tts_voice}">{spoken}</voice>')
    # Add the rest of the line
    pieces.append(line[cursor:])
    return ''.join(pieces)


replace_counts = dict.fromkeys(person_quotes, 0)
new_lines = ['<speak>\n']
for line in lines:
    if tag_pattern.search(line) is None:
        # NOTE(review): untagged entries are appended verbatim (no newline is
        # added) while tagged ones get a trailing '\n' — kept as-is; confirm
        # whether consumers of the tagged file rely on this layout.
        new_lines.append(line)
        continue
    new_lines.append(_voice_tag_line(line, person_quotes, voice_name, replace_counts) + '\n')
new_lines.append('</speak>')

In [10]:
# Write the SSML-tagged text next to the source text.
tagged_path = book_path.joinpath(f'{book_path.name}-book-audio-text-tagged.txt')
# The text contains typographic quotes (“ ” ’); write it explicitly as UTF-8 so
# the result does not depend on (or fail under) the platform default encoding.
with open(tagged_path, 'w', encoding='utf-8') as f:
    f.writelines(new_lines)

In [11]:
# Publish the tagged text under the book's S3 prefix.
# The call's return value (the S3 URI of the uploaded object) is displayed as
# the cell output.
quote_bucket = f's3://book-text-info/{book_path.name}'
uploader = S3Uploader()
uploader.upload(local_path=tagged_path, desired_s3_uri=quote_bucket)

's3://book-text-info/frankenstein/frankenstein-book-audio-text-tagged.txt'