In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from sagemaker.drift_check_baselines import DriftCheckBaselines
from sagemaker.workflow.model_step import ModelStep
from sagemaker.s3 import S3Uploader, S3Downloader
from sagemaker.model_metrics import ModelMetrics
from sagemaker.inputs import CreateModelInput
from sagemaker import ModelPackage, Session
from botocore.exceptions import ClientError
from sagemaker.model import Model
from pathlib import Path

import sagemaker
import logging
import torch
import boto3
import json
import re
import os

2025-04-13 06:07:46.137887: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 06:07:46.368541: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 06:07:46.420689: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 06:07:46.438423: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-13 06:07:46.684433: I tensorflow/core/platform/cpu_feature_guar

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [None]:
# --- AWS / SageMaker handles used by the rest of the notebook ---

# Low-level boto3 clients for the S3 and SageMaker services.
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')

# Region name reported by a default boto3 session.
region = boto3.Session().region_name

# SageMaker SDK session and the IAM execution role of this environment.
sagemaker_session = Session()
role = sagemaker.get_execution_role()

In [None]:
# Copy everything under the model bucket into the current working directory.
# (Presumably this is where 'CustomModel' and 'int-mappings.json', read by
# later cells, come from — confirm against the bucket contents.)
model_bucket = 's3://fine-tuned-model-bert'
downloader = S3Downloader()
downloader.download(s3_uri=model_bucket, local_path='.')

In [3]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Tokenizer is built from the stock 'bert-base-uncased' checkpoint.
# NOTE(review): it is not loaded from 'CustomModel' — confirm the fine-tuned
# model was trained with this same vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the sequence classifier from 'CustomModel' and place it on `device`.
model_2 = BertForSequenceClassification.from_pretrained('CustomModel').to(device)

In [6]:
# Mirror the book-text bucket into the local 'book-text-info' directory.
# The call returns the list of downloaded file paths, shown as the cell output.
quote_bucket = 's3://book-text-info'
downloader = S3Downloader()
downloader.download(s3_uri=quote_bucket, local_path='book-text-info')

['book-text-info/The-Bible/The-Bible-book-audio-text.txt',
 'book-text-info/The-Bible/The-Bible-character-dialogue.txt',
 'book-text-info/alice-in-wonderland/alice-in-wonderland-book-audio-text-tagged.txt',
 'book-text-info/alice-in-wonderland/alice-in-wonderland-book-audio-text.txt',
 'book-text-info/alice-in-wonderland/alice-in-wonderland-character-dialogue.txt',
 'book-text-info/alice-in-wonderland/alice-in-wonderland-person-quotes.json',
 'book-text-info/chapter2/chapter2-book-audio-text-tagged.txt',
 'book-text-info/chapter2/chapter2-book-audio-text.txt',
 'book-text-info/chapter2/chapter2-character-dialogue.txt',
 'book-text-info/chapter4/chapter4-book-audio-text-tagged.txt',
 'book-text-info/chapter4/chapter4-book-audio-text.txt',
 'book-text-info/chapter4/chapter4-character-dialogue.txt',
 'book-text-info/chapter4/chapter4-person-quotes.json',
 'book-text-info/chapter4/working-voices.json',
 'book-text-info/example_book/example_audio_text.txt',
 'book-text-info/example_book/exa

In [7]:
# Book being processed; its directory name doubles as the file-name prefix.
book_path = Path('book-text-info/frankenstein')

# Dialogue file for this book: '<book>/<book>-character-dialogue.txt'.
dialogue = book_path.joinpath(f'{book_path.name}-character-dialogue.txt')

# Read the file as a list of raw lines (trailing newlines are kept).
# The encoding is given explicitly so that decoding of the book text does not
# depend on the locale of the machine running the notebook.
with open(dialogue, encoding='utf-8') as f:
    lines = f.readlines()

In [8]:
# Build {speaker name: [quote, ...]} from the raw dialogue lines.
#
# Each line is split at its first ':' into a speaker name and the spoken text;
# every span enclosed in a pair of double quotes in that text becomes one
# quote. (Presumably lines look like `Name: "first quote" ... "second quote"`
# — confirm against the dialogue files.)
#
# Compared with a word-by-word scan, extracting the quoted spans directly:
#   * keeps the first and last word of a quote when the quote mark is attached
#     to the word (e.g. `"Hello there"`),
#   * handles single-word quotes such as `"Yes"` without losing track of
#     whether we are inside or outside a quote,
#   * ignores narration that sits outside the quote marks.
# An unterminated trailing quote is dropped.
QUOTE_PATTERN = re.compile(r'"([^"]*)"')

person_quotes = {}
for line in lines:
    # Only the first ':' is the separator; later colons belong to the text.
    name, separator, spoken = line.partition(':')
    if not separator:
        # Blank or malformed line without a speaker prefix: nothing to record.
        continue
    name = name.strip()

    quotes = []
    for raw_quote in QUOTE_PATTERN.findall(spoken):
        # Collapse runs of whitespace to single spaces and trim the ends.
        quote = ' '.join(raw_quote.split())
        if quote:  # skip empty `""` pairs; there is nothing to classify
            quotes.append(quote)

    # A speaker may appear on many lines; accumulate all of their quotes.
    person_quotes.setdefault(name, []).extend(quotes)

In [9]:
# Parse 'int-mappings.json' into `mappings`. A later cell indexes it with
# str(class index) to obtain the tone name for a predicted class.
mappings = json.loads(Path('int-mappings.json').read_text())

In [10]:
# Classify every quote and wrap it in a <google:style> tag naming its tone.
#
# Quotes are pushed through the model in fixed-size mini-batches so that a
# character with a very large number of quotes cannot exhaust memory in a
# single forward pass.
#
# NOTE: the class is *sampled* from the softmax distribution with
# torch.multinomial rather than taken as the argmax, so the assigned tones
# vary from run to run unless the torch RNG is seeded.
# NOTE(review): confirm that sampling (not argmax) is the intended behaviour.
#
# NOTE: `person_quotes` is overwritten in place, so running this cell twice
# without re-running the parsing cell wraps the already-tagged quotes again.
BATCH_SIZE = 32  # quotes per forward pass

for name, quotes in person_quotes.items():
    if not quotes:
        continue

    sampled_classes = []
    for offset in range(0, len(quotes), BATCH_SIZE):
        batch = quotes[offset:offset + BATCH_SIZE]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            logits = model_2(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
        # Draw one class index per quote from its predicted distribution.
        drawn = torch.multinomial(probabilities.cpu(), 1).reshape(-1)
        sampled_classes.extend(drawn.tolist())

    # TODO: Add in TTS model name here
    person_quotes[name] = [
        f'<google:style name="{mappings[str(cls)]}">{quote}</google:style>'
        for cls, quote in zip(sampled_classes, quotes)
    ]

In [11]:
# Serialise the per-speaker quotes next to the book's other files, as
# '<book>/<book>-person-quotes.json'.
book_quotes = book_path / f'{book_path.name}-person-quotes.json'
with book_quotes.open('w') as out_file:
    out_file.write(json.dumps(person_quotes))

In [12]:
# Upload the quotes JSON to the book's own prefix in the book-text bucket.
# `quote_bucket` is rebound here from the bucket root to that prefix.
# The call returns the S3 URI of the uploaded object, shown as the cell output.
quote_bucket = f's3://book-text-info/{book_path.name}'
uploader = S3Uploader()
uploader.upload(local_path=book_quotes, desired_s3_uri=quote_bucket)

's3://book-text-info/frankenstein/frankenstein-person-quotes.json'