In [1]:
## Use Azure TTS service to create 3 wav files - one for each input sentence
## Specify the:
##      1) short name of the voice type required
##      2) text sentences
##      3) Location to save audio files and the file names
##      4) Location to save the file to be used by STT module

## If all the conversions are successful, the final file for the STT module is created.
##      Its file name is derived automatically from the first wav file name specified.

In [2]:
import json
import os
import time
from datetime import datetime
from random import randint
from xml.etree import ElementTree

import requests

In [3]:
class TextToSpeech(object):
    """Small client for the Azure Text-to-Speech REST endpoints used in this notebook.

    Usage: construct with a subscription key, call ``get_token()``, set the
    sentence with ``set_text_to_convert()`` and then call ``save_audio()``.

    Args:
        subscription_key: Azure subscription key that is exchanged for an access token.
        region: Azure region prefix used to build the endpoint host names
            (defaults to 'westeurope', the region this notebook was written against).
    """

    # (connect, read) timeouts in seconds so that a stalled connection cannot
    # block the notebook kernel indefinitely.
    REQUEST_TIMEOUT = (10, 60)
    # Total number of POST attempts save_audio() makes while the service answers 503.
    MAX_503_ATTEMPTS = 3

    def __init__(self, subscription_key, region='westeurope'):
        self.subscription_key = subscription_key
        self.region = region
        self.text = None   ##  the text to be converted to audio -- defaulting the value to None
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.access_token = None
        self.access_token_startTime = None

    #The TTS endpoint requires an access token. This method exchanges your
    #subscription key for an access token that is valid for ten minutes.
    def get_token(self):
        """Exchange the subscription key for an access token and store it on the instance.

        Sets ``access_token`` and ``access_token_startTime``.

        Raises:
            requests.HTTPError: if the token endpoint answers with a 4xx/5xx status.
                Without this check the error body would be stored as the token.
        """
        fetch_token_url = f"https://{self.region}.api.cognitive.microsoft.com/sts/v1.0/issueToken"
        headers = {
            'Ocp-Apim-Subscription-Key': self.subscription_key
        }
        response = requests.post(fetch_token_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail before touching the stored token so a previously valid token is kept.
        response.raise_for_status()
        self.access_token = str(response.text)
        self.access_token_startTime = time.strftime("%Y%m%d-%H%M%S")

    def _auth_header(self):
        """Return the 'Bearer <token>' header value, fetching a token on first use."""
        # Callers that skipped get_token() would otherwise hit a TypeError when
        # 'Bearer ' is concatenated with None.
        if self.access_token is None:
            self.get_token()
        return 'Bearer ' + self.access_token

    def save_audio(self, inVoiceTypeShortName = None, _outWavFile = None):
        """Synthesise ``self.text`` with the given voice and write it to a wav file.

        Args:
            inVoiceTypeShortName: the ShortName of the voice type to be used.
            _outWavFile: path where the audio file is saved.

        Returns:
            The HTTP status code of the last synthesis request (200 if all good).
            The file is written only when the status code is 200.
        """
        constructed_url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1"
        headers = {
            'Authorization': self._auth_header(),
            'Content-Type': 'application/ssml+xml',
            'X-Microsoft-OutputFormat': 'riff-16khz-16bit-mono-pcm', # Deepspeech needs 16kHz, mono, PCM encoded wav files
            'User-Agent': 'ThesisWorkSpeech1'
        }

        # Build the SSML document; ElementTree takes care of escaping the text.
        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
        xml_body = ElementTree.Element('speak', version='1.0')
        xml_body.set(xml_lang, 'en-us')
        voice = ElementTree.SubElement(xml_body, 'voice')
        voice.set(xml_lang, 'en-US')
        voice.set('name', inVoiceTypeShortName)
        voice.text = self.text
        body = ElementTree.tostring(xml_body)

        ## sometimes azure response = 503
        ##    Solution: make the API call up to MAX_503_ATTEMPTS times with a random delay
        for attempt in range(1, self.MAX_503_ATTEMPTS + 1):
            response = requests.post(
                constructed_url, headers=headers, data=body, timeout=self.REQUEST_TIMEOUT
            )
            # Do not sleep after the final attempt: nothing is retried afterwards.
            if response.status_code != 503 or attempt == self.MAX_503_ATTEMPTS:
                break
            sleep503Time = randint(5, 10)
            print(f"\nAzure response = 503, so sleeping for {sleep503Time} seconds.\n")
            time.sleep(sleep503Time)

        # if the conversion is successful then status code will be 200.
        if response.status_code == 200:
            with open(_outWavFile, 'wb') as audio:
                audio.write(response.content)

        return response.status_code

    def get_voices_list(self):
        """Fetch the voices list, print it (or an error hint) and return the raw response text."""
        constructed_url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
        headers = {
            'Authorization': self._auth_header(),
        }
        response = requests.get(constructed_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code == 200:
            print(f"\nAvailable voices:\n{response.text}")
        else:
            print(f"\nStatus code: {str(response.status_code)}\nSomething went wrong. Check your subscription key and headers.\n")
        return response.text

    def set_text_to_convert(self, inText):
        """Store ``inText`` as the text to synthesise.

        Returns:
            True if ``inText`` is a ``str`` and was stored, otherwise False
            (the previously stored text is left untouched).
        """
        if isinstance(inText, str):
            self.text = inText
            return True
        return False

In [4]:
## Create the object for the TTS processing.
## The subscription key is read from the environment instead of being written
## into the notebook, so it is not saved or shared together with the file.
subscription_key = os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY")
if not subscription_key:
    raise RuntimeError(
        "Set the AZURE_TTS_SUBSCRIPTION_KEY environment variable to your Azure "
        "subscription key before running this cell."
    )
app = TextToSpeech(subscription_key)  ## key 1, RG=RGMasterThesis1, Resource=RGMasterThesis1
app.get_token()
## Only the token start time is printed; the token itself is not, because
## cell outputs are stored with the notebook.
print(f"\nAccess Token Start Time = {app.access_token_startTime}\n")


Access Token Start Time = 20200806-033559



In [5]:
## Checked the types of voices that I can use - as of 05.08.2020
## view the list of short names available
#response_info = app.get_voices_list()
#print(f"{response_info}")

#voice_types_list_ALL = json.loads(response_info)

#english_short_names_list = []
#for voice_info in voice_types_list_ALL:
#    if voice_info["Locale"].startswith(r"en-") and voice_info["VoiceType"] == r"Standard":
#        english_short_names_list.append(voice_info["ShortName"])
#print(f"{english_short_names_list}")


## short names of voice types - English i.e. Locale starts with "en-", AND
##                              Voicetype=Standard
#voiceTypeShortNameList = [
#    'en-AU-Catherine', 'en-AU-HayleyRUS', 'en-CA-HeatherRUS',
#    'en-CA-Linda', 'en-GB-George', 'en-GB-HazelRUS',
#    'en-GB-Susan', 'en-IE-Sean', 'en-IN-Heera',
#    'en-IN-PriyaRUS', 'en-IN-Ravi', 'en-US-AriaRUS',
#    'en-US-BenjaminRUS', 'en-US-GuyRUS', 'en-US-ZiraRUS']

In [6]:
## Lookup table: friendly voice name -> Azure voice ShortName.
## Covers the English voices (Locale starts with "en-") whose
## VoiceType is Standard, as listed by the service on 05.08.2020.
voiceTypeShortNames = dict(
    # Australia
    Catherine="en-AU-Catherine",
    Hayley="en-AU-HayleyRUS",
    # Canada
    Heather="en-CA-HeatherRUS",
    Linda="en-CA-Linda",
    # Great Britain
    George="en-GB-George",
    Hazel="en-GB-HazelRUS",
    Susan="en-GB-Susan",
    # Ireland
    Sean="en-IE-Sean",
    # India
    Heera="en-IN-Heera",
    Priya="en-IN-PriyaRUS",
    Ravi="en-IN-Ravi",
    # United States
    Aria="en-US-AriaRUS",
    Benjamin="en-US-BenjaminRUS",
    Guy="en-US-GuyRUS",
    Zira="en-US-ZiraRUS",
)

In [7]:
## Run configuration for the conversion cell below.

## Folder that receives the text file listing the created wavs (input for the STT module).
stt_module_file_save_location = "/home/rohit/PyWDUbuntu/thesis/combined_execution/SttTranscribe"

## Folder that receives the generated wav files, and one file name per sentence.
outWavFilesPath = "/home/rohit/PyWDUbuntu/thesis/combined_execution/audio/wavs/"
outWavFilesNames = [
    "st_12_AM_George_file1.wav",
    "st_12_AM_George_file2.wav",
    "st_12_AM_George_file3.wav",
]

## Sentences to synthesise; entry i is written to outWavFilesNames[i].
textToConvert = [
    "Make me a story about persons sitting at a table. They are playing cards.",
    "I want a story about a car on the road. A child plays with a toy.",
    "Generate a story about persons walking on the street. A truck is on the road.",
]

## Azure ShortName of the voice used for every sentence.
voiceTypeShortName = voiceTypeShortNames["George"]

In [8]:
## Convert every sentence to a wav file, stopping at the first failure.
## If every conversion returned 200, write the list of created wav files
## to a text file for the STT module.
saved_files = []   # full paths of the wav files created so far
resp_codes = []    # one status code per attempted conversion (-1 = text rejected)

# Make sure the output folder ends with a separator so file names can be appended.
# An empty path is left as-is instead of raising IndexError on [-1].
if outWavFilesPath and not outWavFilesPath.endswith('/'):
    outWavFilesPath += '/'

# Fail before any API call is made if a wav file has no sentence to go with it
# (previously this surfaced as an IndexError in the middle of the run).
if len(textToConvert) < len(outWavFilesNames):
    raise ValueError(
        f"Need one text per wav file: got {len(textToConvert)} texts "
        f"for {len(outWavFilesNames)} file names."
    )

for idx, (each_filename, each_text) in enumerate(zip(outWavFilesNames, textToConvert)):
    print(f"\n\n----------------  Processing file {idx+1}  ----------------")
    if not app.set_text_to_convert(each_text):
        print("ERROR setting text")
        resp_codes.append(-1)
        break

    outFile = outWavFilesPath + each_filename
    appSaveAudioRespCode = app.save_audio( voiceTypeShortName, outFile )
    resp_codes.append(appSaveAudioRespCode)
    if appSaveAudioRespCode != 200:
        print(f"\n\nERROR audio conversion. Azure response Status Code = {appSaveAudioRespCode}\n\n")
        break

    print(f"\nSUCCESS: created: {outFile}\nfor text:\n{each_text}")
    saved_files.append(outFile)

# `resp_codes and ...`: all() of an empty list is True, which would otherwise
# reach outWavFilesNames[0] with nothing converted.
if resp_codes and all(each_code == 200 for each_code in resp_codes):
    # Drop the trailing "_fileN.wav" part of the first wav name. Splitting on the
    # last underscore also works for two-digit file numbers, unlike a fixed [:-10] slice.
    stt_list_stem = outWavFilesNames[0].rsplit('_', 1)[0]
    file_for_stt_input = '/'.join([
        stt_module_file_save_location,
        stt_list_stem + ".txt"
    ])
    with open(file_for_stt_input, "w") as sttfile:
        sttfile.write("\n".join(saved_files))
        sttfile.write("\n")
    print(f"\n\n\nFile for STT module created: {file_for_stt_input}")



----------------  Processing file 1  ----------------

SUCCESS: created: /home/rohit/PyWDUbuntu/thesis/combined_execution/audio/wavs/st_12_AM_George_file1.wav
for text:
Make me a story about persons sitting at a table. They are playing cards.


----------------  Processing file 2  ----------------

SUCCESS: created: /home/rohit/PyWDUbuntu/thesis/combined_execution/audio/wavs/st_12_AM_George_file2.wav
for text:
I want a story about a car on the road. A child plays with a toy.


----------------  Processing file 3  ----------------

SUCCESS: created: /home/rohit/PyWDUbuntu/thesis/combined_execution/audio/wavs/st_12_AM_George_file3.wav
for text:
Generate a story about persons walking on the street. A truck is on the road.



File for STT module created: /home/rohit/PyWDUbuntu/thesis/combined_execution/SttTranscribe/st_12_AM_George.txt
