In [1]:
from google.cloud import texttospeech
import pandas as pd
import os
import time
# for guid generation
import uuid



In [2]:
# Everything this notebook persists lives under data_dir: the rendered MP3
# files (save_dir) and the CSV log describing them (df_path).
data_dir = 'data'

# Derived from data_dir so the three locations cannot drift apart.
save_dir = f'{data_dir}/audio'
df_path = f'{data_dir}/dataframe.csv'

# exist_ok=True replaces the separate exists() check and its check-then-create gap.
os.makedirs(save_dir, exist_ok=True)

# Resume the synthesis log written by a previous session when there is one;
# otherwise start an empty log with the expected columns.
if os.path.exists(df_path):
    df = pd.read_csv(df_path)
else:
    df = pd.DataFrame(columns=['text_hash', 'text', 'audio_file', 'synthesis_time', 'voice_name', 'speaking_rate', 'pitch'])

In [None]:
# Wavenet voice names keyed by language ('it', 'en') and then by gender.
# Not referenced by the code visible in this notebook; kept as a lookup table
# for the ``voice_name`` argument of TextToSpeech.synthesize.
VOICES = {
    'it': {'male': 'it-IT-Wavenet-D', 'female': 'it-IT-Wavenet-C'},
    'en': {'male': 'en-US-Wavenet-D', 'female': 'en-US-Wavenet-C'},
}

class TextToSpeech:
    """Render text with Google Cloud Text-to-Speech and log every rendering.

    The class relies on three notebook-level globals defined in the
    configuration cell:

    * ``df``       -- the log table, one row per (text, voice, rate, pitch),
    * ``save_dir`` -- directory the MP3 files are written to,
    * ``df_path``  -- CSV file the log is persisted to after each new row.

    A request whose parameters are already in the log is served from the log
    instead of calling the API again.
    """

    def __init__(self):
        """Create the API client plus the reusable voice and MP3 audio settings."""
        self.client = texttospeech.TextToSpeechClient()
        self.voice = texttospeech.VoiceSelectionParams(language_code="it-IT")
        self.audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

    @staticmethod
    def _request_guid(text, voice_name, speaking_rate, pitch):
        """Return a stable string id for one combination of request parameters."""
        # uuid5 is a deterministic digest of its input. The built-in hash() of
        # a str is salted per interpreter process, so it must not be part of
        # the key or the id (and the file name) would change on every kernel
        # restart. repr() of a tuple keeps the fields unambiguously separated.
        key = repr((text, voice_name, float(speaking_rate), float(pitch)))
        # Stored as str so the value compares equal after a CSV round trip.
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, key))

    @staticmethod
    def _find_cached(text, voice_name, speaking_rate, pitch):
        """Return the first log row matching all four parameters, or None."""
        matches = df[
            (df['text'] == text)
            & (df['voice_name'] == voice_name)
            & (df['speaking_rate'] == speaking_rate)
            & (df['pitch'] == pitch)
        ]
        return None if matches.empty else matches.iloc[0]

    def _render(self, text, voice_name, speaking_rate, pitch, audio_file):
        """Call the API, write the MP3 to ``audio_file``, return the call time in seconds."""
        self.voice.name = voice_name
        # NOTE(review): the API is expected to reject a voice name whose locale
        # differs from language_code, so take the locale from names shaped
        # like "en-US-Wavenet-D" -- confirm against the VoiceSelectionParams docs.
        name_parts = voice_name.split("-")
        if len(name_parts) >= 3:
            self.voice.language_code = "-".join(name_parts[:2])
        self.audio_config.speaking_rate = speaking_rate
        self.audio_config.pitch = pitch
        synthesis_input = texttospeech.SynthesisInput(text=text)

        t0 = time.perf_counter()
        response = self.client.synthesize_speech(input=synthesis_input, voice=self.voice, audio_config=self.audio_config)
        elapsed = time.perf_counter() - t0

        # save_dir may have been removed since the configuration cell ran.
        os.makedirs(os.path.dirname(audio_file) or ".", exist_ok=True)
        with open(audio_file, "wb") as out:
            out.write(response.audio_content)
        return elapsed

    def synthesize(self, text, speaking_rate=0.75, voice_name="it-IT-Wavenet-D", pitch=0):
        """Return the log row for ``text`` rendered with the given settings.

        Args:
            text: The text to speak.
            speaking_rate: Value assigned to ``AudioConfig.speaking_rate``.
            voice_name: Value assigned to ``VoiceSelectionParams.name``.
            pitch: Value assigned to ``AudioConfig.pitch``.

        Returns:
            A dict with the keys ``text_hash``, ``text``, ``audio_file``,
            ``synthesis_time``, ``voice_name``, ``speaking_rate`` and ``pitch``.

        Side effects:
            On a cache miss the API is called, the MP3 is written below
            ``save_dir``, a row is appended to ``df`` and ``df`` is saved to
            ``df_path``. On a cache hit whose audio file is missing, the file
            is rendered again at the recorded path and the log is left as is.
        """
        cached = self._find_cached(text, voice_name, speaking_rate, pitch)
        if cached is not None:
            if not os.path.exists(cached['audio_file']):
                # The log knows this request but the audio is gone: restore it.
                print("synthesizing...")
                self._render(text, voice_name, speaking_rate, pitch, cached['audio_file'])
            return dict(cached)

        print("synthesizing...")
        guid = self._request_guid(text, voice_name, speaking_rate, pitch)
        audio_file = f'{save_dir}/{guid}.mp3'
        synthesis_time = self._render(text, voice_name, speaking_rate, pitch, audio_file)

        # store in dataframe
        df.loc[df.shape[0]] = [guid, text, audio_file, synthesis_time, voice_name, speaking_rate, pitch]
        row = df.iloc[-1]

        # TODO: put this in destructor
        df.to_csv(df_path, index=False)

        return dict(row)

In [4]:
# Build the wrapper once; the cells below all reuse this instance (and the
# TextToSpeechClient its constructor creates).
tts = TextToSpeech()

In [5]:
phrase = 'Ciao, come stai?'
# Render the phrase at each speaking rate in turn; ``ret`` keeps the last result.
for rate in (0.5, 0.75):
    ret = tts.synthesize(phrase, speaking_rate=rate)

synthesizing...
synthesizing...


In [6]:
phrase = 'Quello laggiù'
# Render the phrase at each speaking rate in turn; ``ret`` keeps the last result.
for rate in (0.5, 0.75):
    ret = tts.synthesize(phrase, speaking_rate=rate)

synthesizing...
synthesizing...


In [7]:
phrase = 'vorrei un tavolo per due vicino alla finestra'
# Only the 0.7 speaking rate is rendered for this phrase; the 0.5 and 0.75
# renderings used for the phrases above are left disabled here.
ret = tts.synthesize(phrase, speaking_rate=0.7)

synthesizing...


In [8]:
# Show the synthesis log accumulated by the cells above.
df

Unnamed: 0,text_hash,text,audio_file,synthesis_time,voice_name,speaking_rate,pitch
0,0d68b456-1c09-5d43-8f90-45c9c580fb28,"Ciao, come stai?",data/audio/0d68b456-1c09-5d43-8f90-45c9c580fb2...,0.596022,it-IT-Wavenet-D,0.5,0
1,b731eb20-72bf-5568-a733-4a8049306205,"Ciao, come stai?",data/audio/b731eb20-72bf-5568-a733-4a804930620...,0.242133,it-IT-Wavenet-D,0.75,0
2,5d433251-eff5-5c5d-a532-5c7f55ff81b8,Quello laggiù,data/audio/5d433251-eff5-5c5d-a532-5c7f55ff81b...,0.210975,it-IT-Wavenet-D,0.5,0
3,a185cefb-0cc3-5308-9a11-aeb150d4726f,Quello laggiù,data/audio/a185cefb-0cc3-5308-9a11-aeb150d4726...,0.241175,it-IT-Wavenet-D,0.75,0
4,e91debbd-78a4-590b-b396-3eb0f1f03260,vorrei un tavolo per due vicino alla finestra,data/audio/e91debbd-78a4-590b-b396-3eb0f1f0326...,0.22745,it-IT-Wavenet-D,0.7,0
