In [14]:
from google.cloud import texttospeech
import pandas as pd
import os
import time

In [None]:
data_dir = 'data'

save_dir = 'data/audio'
df_path = 'data/dataframe.csv'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

if not os.path.exists(df_path):
    df = pd.DataFrame(columns=['text_hash', 'text', 'audio_file', 'synthesis_time', 'voice_name', 'speaking_rate', 'pitch'])
else:
    df = pd.read_csv(df_path)

In [None]:
VOICES = {
    'it': {
        'male': 'it-IT-Wavenet-D',
        'female': 'it-IT-Wavenet-C',
    },
    'en': {
        'male': 'en-US-Wavenet-D',
        'female': 'en-US-Wavenet-C',
    },
}

class TextToSpeech:

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self.voice = texttospeech.VoiceSelectionParams(language_code="it-IT")
        self.audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

    def synthesize(self, text, speaking_rate=0.75, voice_name="it-IT-Wavenet-D", pitch=0):
        self.voice.name = voice_name
        self.audio_config.speaking_rate = speaking_rate
        self.audio_config.pitch = pitch
        synthesis_input = texttospeech.SynthesisInput(text=text)

        # hash text
        text_hash = hash(f"{text}{voice_name}{speaking_rate}{pitch}")

        audio_file = f'{save_dir}/{text_hash}.mp3'

        # check if text already exists
        if df[df['text'] == text].shape[0] > 0:
            s_df = df[df['text'] == text]
            assert(s_df.shape[0] == 1)
            row = s_df.iloc[0]
            assert(row['text_hash'] == text_hash)
            assert(row['audio_file'] == audio_file)

        else:
            print("synthesizing...")
            # Synthesize speech
            t0 = time.perf_counter()
            response = self.client.synthesize_speech(input=synthesis_input, voice=self.voice, audio_config=self.audio_config)
            t1 = time.perf_counter()
            synthesis_time = t1 - t0

            # save audio
            with open(audio_file, "wb") as out:
                out.write(response.audio_content) 

            # store in dataframe
            df.loc[df.shape[0]] = [text_hash, text, audio_file, synthesis_time, voice_name, speaking_rate, pitch]
            row = df.iloc[-1]

            # TODO: put this in destructor
            df.to_csv(df_path, index=False)

        return dict(row)

In [17]:
tts = TextToSpeech()

In [18]:
ret = tts.synthesize('Ciao, come stai?')

In [19]:
ret

{'text_hash': np.int64(884650908637062659),
 'text': 'Ciao, come stai?',
 'audio_file': 'data/audio/884650908637062659.mp3',
 'synthesis_time': np.float64(1.5108500929927686),
 'voice_name': 'it-IT-Wavenet-D',
 'speaking_rate': np.float64(0.75),
 'pitch': np.int64(0)}