In [1]:
from google.cloud import texttospeech
import pandas as pd
import os
import time
# for guid generation
import uuid



In [2]:
data_dir = 'data'

save_dir = 'data/audio'
df_path = 'data/dataframe.csv'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

if not os.path.exists(df_path):
    df = pd.DataFrame(columns=['text_hash', 'text', 'audio_file', 'synthesis_time', 'voice_name', 'speaking_rate', 'pitch'])
else:
    df = pd.read_csv(df_path)

In [3]:
VOICES = {
    'it': {
        'male': 'it-IT-Wavenet-D',
        'female': 'it-IT-Wavenet-C',
    },
    'en': {
        'male': 'en-US-Wavenet-D',
        'female': 'en-US-Wavenet-C',
    },
}

class TextToSpeech:

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self.voice = texttospeech.VoiceSelectionParams(language_code="it-IT")
        self.audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

    def synthesize(self, text, speaking_rate=0.75, voice_name="it-IT-Wavenet-D", pitch=0):
        self.voice.name = voice_name
        self.audio_config.speaking_rate = speaking_rate
        self.audio_config.pitch = pitch
        synthesis_input = texttospeech.SynthesisInput(text=text)

        # hash text
        guid = uuid.uuid5(uuid.NAMESPACE_DNS, str(hash(f"{text}{voice_name}{speaking_rate:.2f}{pitch}")))

        audio_file = f'{save_dir}/{guid}.mp3'

        # check if text already exists
        if df[(df['text'] == text) & (df['speaking_rate'] == speaking_rate)].shape[0] > 0:
            s_df = df[df['text'] == text]
            assert(s_df.shape[0] == 1)
            row = s_df.iloc[0]
            assert(row['guid'] == guid)
            assert(row['audio_file'] == audio_file)

        else:
            print("synthesizing...")
            # Synthesize speech
            t0 = time.perf_counter()
            response = self.client.synthesize_speech(input=synthesis_input, voice=self.voice, audio_config=self.audio_config)
            t1 = time.perf_counter()
            synthesis_time = t1 - t0

            # save audio
            with open(audio_file, "wb") as out:
                out.write(response.audio_content) 

            # store in dataframe
            df.loc[df.shape[0]] = [guid, text, audio_file, synthesis_time, voice_name, speaking_rate, pitch]
            row = df.iloc[-1]

            # TODO: put this in destructor
            df.to_csv(df_path, index=False)

        return dict(row)

In [4]:
tts = TextToSpeech()

In [5]:
phrase = 'Ciao, come stai?'
ret = tts.synthesize(phrase, speaking_rate=0.5)
ret = tts.synthesize(phrase, speaking_rate=0.75)

synthesizing...
synthesizing...


In [6]:
phrase = 'Quello laggiù'
ret = tts.synthesize(phrase, speaking_rate=0.5)
ret = tts.synthesize(phrase, speaking_rate=0.75)

synthesizing...
synthesizing...


In [9]:
phrase = 'vorrei un tavolo per due vicino alla finestra'
# ret = tts.synthesize(phrase, speaking_rate=0.5)
# ret = tts.synthesize(phrase, speaking_rate=0.75)
ret = tts.synthesize(phrase, speaking_rate=0.7)

synthesizing...


In [10]:
df

Unnamed: 0,text_hash,text,audio_file,synthesis_time,voice_name,speaking_rate,pitch
0,3b078a08-8201-5eee-a5df-b19d0b4aebe1,"Ciao, come stai?",data/audio/3b078a08-8201-5eee-a5df-b19d0b4aebe...,0.50014,it-IT-Wavenet-D,0.5,0
1,65ab337b-94fb-53b2-8309-597130861e84,"Ciao, come stai?",data/audio/65ab337b-94fb-53b2-8309-597130861e8...,0.297484,it-IT-Wavenet-D,0.75,0
2,8483293a-b41c-5a43-b594-5113e626fc0a,Quello laggiù,data/audio/8483293a-b41c-5a43-b594-5113e626fc0...,0.195614,it-IT-Wavenet-D,0.5,0
3,f8de8966-2730-5b0d-9e70-744d0c84ebee,Quello laggiù,data/audio/f8de8966-2730-5b0d-9e70-744d0c84ebe...,0.390596,it-IT-Wavenet-D,0.75,0
4,67e7fafa-f2c8-5a94-be40-17c51baf648a,vorrei un tavolo per due vicino alla finestra,data/audio/67e7fafa-f2c8-5a94-be40-17c51baf648...,0.210947,it-IT-Wavenet-D,0.5,0
5,392634f3-e8a9-5570-b37b-6f984cb7015e,vorrei un tavolo per due vicino alla finestra,data/audio/392634f3-e8a9-5570-b37b-6f984cb7015...,0.255181,it-IT-Wavenet-D,0.75,0
6,69b1616c-ae79-5a21-a4a1-abd1bacc1aed,vorrei un tavolo per due vicino alla finestra,data/audio/69b1616c-ae79-5a21-a4a1-abd1bacc1ae...,0.350979,it-IT-Wavenet-D,0.7,0
