In [None]:
import base64
import requests

In [None]:
# Read the Google Cloud API key from a local file kept outside the notebook.
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open("/Users/stevie/repos/lingo_kit_data/utils/google_cloud_api_key.txt") as api_key_file:
    API_KEY = api_key_file.read().strip()

In [None]:
# Fetch the catalogue of voices offered by the Text-to-Speech API.
# The key travels in a header rather than the query string so it cannot end up
# in an exception message (requests includes the URL in HTTPError text).
voices_response = requests.get(
    "https://texttospeech.googleapis.com/v1/voices",
    headers={"X-Goog-Api-Key": API_KEY},
    timeout=30,  # same timeout as the synthesis call; never hang the kernel
)
# Fail here with a clear HTTP error instead of a KeyError in a later cell.
voices_response.raise_for_status()
voices = voices_response.json()
# Summarise instead of dumping the whole catalogue; later cells show the
# filtered subsets that are actually of interest.
print(f"{len(voices.get('voices', []))} voices available")

In [None]:
# Split the catalogue by locale: keep every voice that lists the locale among
# its language codes.
it_voices = [
    voice_info
    for voice_info in voices["voices"]
    if "it-IT" in voice_info["languageCodes"]
]
en_voices = [
    voice_info
    for voice_info in voices["voices"]
    if "en-US" in voice_info["languageCodes"]
]
# Cell output: (number of Italian voices, number of US-English voices).
(len(it_voices), len(en_voices))

In [None]:
# Narrow each locale down to the voices whose name contains "Neural2".
# (To audition the Chirp voices instead, match on "Chirp" here.)
neural_it_voices = [
    voice_info for voice_info in it_voices if "Neural2" in voice_info["name"]
]
neural_en_voices = [
    voice_info for voice_info in en_voices if "Neural2" in voice_info["name"]
]
# Cell output: (number of Italian Neural2 voices, number of English Neural2 voices).
(len(neural_it_voices), len(neural_en_voices))

In [None]:
# Display the en-US voices whose name contains "Neural2" for manual inspection.
neural_en_voices

In [None]:
# Display the it-IT voices whose name contains "Neural2" for manual inspection.
neural_it_voices

In [None]:
# REST endpoint used by synthesize_word; the API key is passed as a query parameter.
endpoint = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={API_KEY}"

# Voice name chosen for each language / gender combination.
VOICES = {
    "english": dict(male="en-US-Neural2-D", female="en-US-Neural2-C"),
    "italian": dict(male="it-IT-Neural2-F", female="it-IT-Neural2-A"),
}

def ssml_single_word(word, rate, pitch, pause_ms, slash_pause_ms):
    """Wrap a word or short phrase in an SSML document.

    The text is stripped, XML-escaped, and every "/" is turned into an SSML
    pause. A trailing period is appended when the text does not already end
    in ".", "!" or "?", to encourage natural sentence prosody.

    Args:
        word: Text to speak. "/" separates alternatives and becomes a pause.
        rate: Value for the ``rate`` attribute of ``<prosody>``; ``None``
            omits the attribute.
        pitch: Value for the ``pitch`` attribute of ``<prosody>``; ``None``
            omits the attribute.
        pause_ms: Length in milliseconds of the pause before and after the text.
        slash_pause_ms: Length in milliseconds of the pause replacing each "/".

    Returns:
        The SSML document as a string.

    Raises:
        ValueError: If ``word`` is empty or contains only whitespace.
    """
    # Local import keeps this helper self-contained within its cell.
    from xml.sax.saxutils import escape

    text = word.strip()
    if not text:
        # Previously this surfaced as an opaque IndexError from text[-1].
        raise ValueError("word must contain at least one non-whitespace character")

    # Escape &, < and > BEFORE inserting markup so that characters in the
    # input cannot break the SSML, while the <break/> tags added below stay intact.
    text = escape(text)
    text = text.replace("/", f'<break time="{slash_pause_ms}ms"/>')
    if not text.endswith((".", "!", "?")):
        text += "."

    # Only emit the prosody attributes that were actually supplied, so a
    # missing value never renders as the literal string "None".
    prosody_attrs = "".join(
        f' {attr_name}="{attr_value}"'
        for attr_name, attr_value in (("rate", rate), ("pitch", pitch))
        if attr_value is not None
    )

    # The indentation mirrors the earlier triple-quoted template so that the
    # output stays byte-identical for inputs that were already valid.
    outer, inner = " " * 12, " " * 16
    pause_tag = f'<break time="{pause_ms}ms"/>'
    sentence = f"<p><s>{text}</s></p>"
    if prosody_attrs:
        content = [
            f"{outer}<prosody{prosody_attrs}>",
            f"{inner}{sentence}",
            f"{outer}</prosody>",
        ]
    else:
        # With no attributes at all, leave out the <prosody> wrapper entirely.
        content = [f"{outer}{sentence}"]

    speech_ssml = "\n".join(
        ["<speak>", f"{outer}{pause_tag}", *content, f"{outer}{pause_tag}", " " * 8 + "</speak>"]
    )
    return speech_ssml

def synthesize_word(word, voice_name, lang, speaking_rate, outfile):
    """Synthesize ``word`` via the Text-to-Speech REST API and save it as an MP3.

    The SSML is built with ``ssml_single_word`` and POSTed to the module-level
    ``endpoint``; the base64 audio in the response is decoded and written to
    ``outfile``.

    Args:
        word: Text to speak (passed through to ``ssml_single_word``).
        voice_name: Voice name sent as ``voice.name``.
        lang: Language code sent as ``voice.languageCode``; must be "en-US"
            or "it-IT".
        speaking_rate: Used as the SSML prosody rate and, when not ``None``,
            also as ``audioConfig.speakingRate``.
        outfile: Path of the MP3 file to write. Its directory must already exist.

    Returns:
        ``outfile``, unchanged.

    Raises:
        AssertionError: If ``lang`` is not a supported language code.
        requests.HTTPError: If the API answers with an error status.
    """
    # Right now only these fixed settings are supported; the speaking rate is
    # the only knob exposed to callers.
    ssml_pitch = "-1st"
    pause_ms = 120
    slash_pause_ms = 500
    assert lang in ("en-US", "it-IT"), f"unsupported language code: {lang!r}"

    ssml = ssml_single_word(
        word,
        rate=speaking_rate,
        pitch=ssml_pitch,
        pause_ms=pause_ms,
        slash_pause_ms=slash_pause_ms,
    )

    # Optional global audioConfig tweak on top of the SSML prosody.
    # NOTE(review): the rate is applied both in <prosody> and here, so the two
    # presumably compound — confirm that this is intended.
    audio_cfg = {"audioEncoding": "MP3"}
    if speaking_rate is not None:
        audio_cfg["speakingRate"] = speaking_rate

    payload = {
        "input": {"ssml": ssml},
        "voice": {"languageCode": lang, "name": voice_name},
        "audioConfig": audio_cfg,
    }

    r = requests.post(endpoint, json=payload, timeout=30)
    r.raise_for_status()
    # Decode before opening the file so a malformed response cannot leave an
    # empty or truncated MP3 behind.
    audio_bytes = base64.b64decode(r.json()["audioContent"])
    with open(outfile, "wb") as f:
        f.write(audio_bytes)
    return outfile

In [None]:
import os
# Speaking rate used for each language when generating samples.
english_speaking_rate = 0.92
italian_speaking_rate = 0.7

# VOICES is not redefined here: the identical mapping is already defined in the
# cell that declares the synthesis helpers, and this cell does not use it.

# Candidate voices to audition for each language.
# "en-US-Neural2-A" is deliberately left out (original note: "NERRRD").
english_voices = [
    "en-US-Neural2-C",
    "en-US-Neural2-D",
    "en-US-Neural2-E",
]

italian_voice = [
    "it-IT-Neural2-A",
    "it-IT-Neural2-E",
    "it-IT-Neural2-F",
]

# Sample English phrases: short function words first, then full sentences and
# "/"-separated alternatives.
english_terms = [
    "the", "a", "and", "or", "I", "you",
    "you (plural)",
    "me", "if",
    "nice (plural) (feminine)",
    "where is the nearest bathroom?",
    "I am going to the bank with my friend",
    "is it big? / is it large?",
    "it is big / it is large",
]

# Sample Italian phrases, organised the same way.
italian_terms = [
    "il", "la", "e", "o", "io", "tu", "me", "un", "se",
    "dove è il bagno più vicino?",
    "io vado in banca con il mio amico",
    "è grande? / è grosso?",
    "è grande / è grosso",
]

output_dir = "tts_samples"
# Upper bound on how many samples a single run synthesizes (each one is a
# billable API call). 1 reproduces the earlier "first sample only" behaviour;
# set to None to generate the full language x voice x term matrix.
MAX_SAMPLES = 1

# Flatten the matrix into one job list. The comprehension keeps its loop
# variables local, so the `voices` API response from the earlier cell is no
# longer overwritten by this cell.
sample_jobs = [
    (lang, voice, term, speaking_rate)
    for lang, lang_voices, lang_terms, speaking_rate in [
        ("en-US", english_voices, english_terms, english_speaking_rate),
        ("it-IT", italian_voice, italian_terms, italian_speaking_rate),
    ]
    for voice in lang_voices
    for term in lang_terms
]

# Existing files are overwritten on every run.
for lang, voice, term, speaking_rate in sample_jobs[:MAX_SAMPLES]:
    # "/" cannot appear in a file name, so it is spelled out.
    outfile = f"{output_dir}/{lang}/{voice}/speaking_rate={speaking_rate:.2f}/{term.replace('/', 'SLASH')}.mp3"
    os.makedirs(os.path.dirname(outfile), exist_ok=True)
    print(f"Generating {outfile}")
    # Pass the rate that belongs to this language; previously the English rate
    # was always sent, even for Italian samples.
    synthesize_word(term, outfile=outfile, lang=lang, voice_name=voice, speaking_rate=speaking_rate)