## Take a sample ParlAI transcript

In [None]:
# Raw ParlAI self-chat output: one utterance per line, each prefixed with the
# bracketed name of the agent that generated it.
transcript = """[TransformerGenerator_1]: hi i am from the city how are you ? i live in california . where are you from ?
   [TransformerGenerator_2]: i ' d love to visit cali . i ' ve never been there . i am 56 .
[TransformerGenerator_1]: i love california its beautiful ! i ' m in my mid 50 ' s . where do you live ?
   [TransformerGenerator_2]: i live on the west coast . i used too be pretty , but i am not anymore . you ?
[TransformerGenerator_1]: i live near san francisco . what do you do for fun ? i am an alcoholic but i love it !
   [TransformerGenerator_2]: i love to fantasize . i have dreamed of being the world and taking over . do you ?
[TransformerGenerator_1]: i do not ! i love the outdoors . i love being in the mountains ! what are you up to ?
   [TransformerGenerator_2]: i was a painter , now i work as a housemaid . you ever been to cali ?
[TransformerGenerator_1]: i have not , but my family would love to go . i grew on a very small farm .
   [TransformerGenerator_2]: that ' s nice . where did you grow the food ? do you have any hobbies ?"""

# Translation table that deletes the square brackets around the agent name.
_BRACKETS = str.maketrans("", "", "[]")

# Strip the brackets, swap the computer generated agent name for "Speaker" and
# trim surrounding whitespace, giving lines of the form "Speaker_N: <utterance>".
processed_transcripts = [
    raw_line.translate(_BRACKETS).replace("TransformerGenerator", "Speaker").strip()
    for raw_line in transcript.split("\n")
]


## pyttsx3
- Cannot synthesize a list of strings sequentially? The issue was replicated with a script, so it is not a notebook issue.
- Cannot supply a "complex" output directory, e.g. one nested within another directory.
- Can only process a list of two strings sequentially.
- Output audio files do not play? (when generated from a script or in sequence)
- Tested with a variety of different transcript outputs.

In [None]:
pyttsx3==2.90

# additionally require libspeak for pyttsx3
!apt-get update -y
!apt-get install -y libespeak-dev

# aplay
!apt-get install alsa-utils -y

In [None]:
# Refresh the apt package index, then install the espeak and libespeak1 packages.
!apt update && apt install espeak libespeak1 -y

In [None]:
pip install pyttsx3
pip list | grep 'pytt'

In [None]:
import pyttsx3

# Create the pyttsx3 engine instance that the following pyttsx3 cells reuse.
engine = pyttsx3.init()


In [None]:
# Smoke test: queue a short phrase to be rendered into a file, then let the
# engine work through its queue.
smoke_test_phrase = "Hello World"
smoke_test_path = "test.mp3"
engine.save_to_file(smoke_test_phrase, smoke_test_path)
engine.runAndWait()


In [None]:
# Speak the first utterance aloud. Split on the first colon only, so an
# utterance that itself contains ":" is not cut short after the speaker label.
first_utterance = processed_transcripts[0].split(":", 1)[1].strip()
engine.say(first_utterance)
engine.runAndWait()


In [None]:
# potentially choose a random persona for each chat transcript?
voices = engine.getProperty("voices")


def _speaks_english(voice):
    """Return True when any of the voice's language entries contains "en"."""
    return any("en" in lang for lang in voice.languages)


# Keep only the voices whose language list mentions English.
english_voices = list(filter(_speaks_english, voices))

# Uncomment to inspect the attributes of each candidate voice:
# for voice in english_voices:
#     print("\nVoice:")
#     print("ID: %s" % voice.id)
#     print("Name: %s" % voice.name)
#     print("Age: %s" % voice.age)
#     print("Gender: %s" % voice.gender)
#     print("Languages Known: %s" % voice.languages)


In [None]:
SPEECH_RATE = 130  # slow it down
PREFERRED_VOICE_INDEX = 5  # position of the chosen voice within english_voices
SAMPLE_UTTERANCE = (
    "i love california its beautiful ! i ' m in my mid 50 ' s . where do you live ?"
)

engine.setProperty("rate", SPEECH_RATE)

# Fall back to the last available English voice (or keep the engine's current
# voice when none were found) instead of raising IndexError when fewer than
# six English voices are present.
if english_voices:
    voice_index = min(PREFERRED_VOICE_INDEX, len(english_voices) - 1)
    engine.setProperty("voice", english_voices[voice_index].id)  # choose a voice

# engine.say(SAMPLE_UTTERANCE)
# engine.runAndWait()

# Render the sample utterance to a file with the settings chosen above.
engine.save_to_file(SAMPLE_UTTERANCE, "test.mp3")
engine.runAndWait()


In [None]:
from pathlib import Path

# NOTE(review): "/path/to/call" looks like a placeholder - point it at a
# writable directory before running.
output_dir = Path("/path/to/call")
# exist_ok=True already makes mkdir a no-op for an existing directory, so no
# separate exists() check (or conditional expression) is needed.
output_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Preview the per-utterance file names; the synthesis itself stays disabled
# (see the commented-out calls at the end of the loop body).
for idx, line in enumerate(processed_transcripts):
    # format text/save file
    # Each line looks like "Speaker_N: <utterance>"; split once so a ": "
    # inside the utterance stays part of the text.
    speaker, text = line.split(": ", 1)
    text = text.lower()
    file_name = f"{idx}_{speaker.lower()}.mp3"
    #     save_path = output_dir / file_name
    print(file_name)

    # synthesize, save locally? cannot save to "complex" dir?
    #     engine.save_to_file(text, file_name)
    #     engine.runAndWait()


## Google TTS
- https://github.com/pndurette/gTTS
- Risk of IP banning? Usage limits are unclear: https://github.com/pndurette/gTTS/discussions/293
- Customizable text pre-processors, which can, for example, provide pronunciation corrections
- Better-formed audio, e.g. with respect to pauses between sentences
- Outputs play outside of main application!

In [None]:
from pathlib import Path

import gtts
import IPython

# Take the text after the speaker label; split on the first colon only so a
# colon inside the utterance does not truncate it.
sample_text = processed_transcripts[0].split(":", 1)[1].strip()

# Make sure the destination directory exists before saving into it.
sample_path = Path("./output/synth_calls/gtts_sample.mp3")
sample_path.parent.mkdir(parents=True, exist_ok=True)

tts = gtts.gTTS(sample_text, lang="en", tld="com.au", slow=True)
tts.save(str(sample_path))

# Play back the file that was just written. The consolidated stereo wav is
# only produced by the last cell of the notebook, so it cannot be played here
# on a fresh run.
IPython.display.Audio(filename=str(sample_path))


In [None]:
from pathlib import Path

# The consolidated stereo file is written by the final cell of this notebook,
# so on a top-to-bottom run it may not exist yet; only embed the player when
# the file is actually there.
consolidated_path = Path(
    "./output/synth_calls/sample_transcript/consolidated_final.wav"
)
if consolidated_path.exists():
    IPython.display.display(IPython.display.Audio(filename=str(consolidated_path)))
else:
    print(f"{consolidated_path} not found - run the consolidation cell first.")


In [None]:
import time
from pathlib import Path

import gtts

# store temp TTS utterance audio
output_dir = Path("./output/synth_calls/sample_transcript")
# exist_ok=True makes this a no-op when the directory is already present.
output_dir.mkdir(parents=True, exist_ok=True)

# TTS for each utterance
for idx, utterance in enumerate(transcript.split("\n")):
    time.sleep(1)  # pause between requests - ip ban?
    # Lines alternate between the two agents: even positions are speaker 1,
    # odd positions are speaker 2.
    speaker = "speaker_1" if idx % 2 == 0 else "speaker_2"
    # Drop the "[name]:" prefix; split once so colons inside the utterance survive.
    formatted_utterance = utterance.split(":", 1)[1].strip()
    output_file = output_dir / f"{speaker}_utterance_{idx}.mp3"
    tts = gtts.gTTS(formatted_utterance, lang="en", tld="com.au", slow=True)
    tts.save(str(output_file))


In [None]:
!pip install pydub
!pip list | grep 'pydub'

In [None]:
import librosa
import pandas as pd
from pydub import AudioSegment

audio_fragment_records = []
for file in output_dir.glob("./*.mp3"):
    # collate utterance audio files into raw samples
    y, _ = librosa.load(str(file))  # FYI: assigns default sample rate
    audio_fragment_records.append(
        {
            "file": file.name,
            "sample_array": y,
            "sample_array_shape": y.shape[0],
            # Files are named "<speaker>_utterance_<idx>.mp3". Parse the whole
            # trailing number: reading only its first character would turn
            # index 10 into 1 and break both ordering and channel assignment.
            "sequence_idx": int(file.stem.rsplit("_", 1)[-1]),
        }
    )

if not audio_fragment_records:
    # Fail with a clear message rather than an opaque pandas KeyError below.
    raise FileNotFoundError(f"no .mp3 utterance files found in {output_dir}")

audio_fragments = (
    pd.DataFrame(audio_fragment_records)
    # glob order is not relied upon; restore the conversation order explicitly
    .sort_values("sequence_idx")
    # speaker as channel: even index -> channel 1, odd index -> channel 2
    .assign(channel=lambda x: x.sequence_idx.mod(2).add(1))
    .reset_index(drop=True)
)


In [None]:
import numpy as np
import soundfile
from pydub import AudioSegment

# pad channel 1/2 chunks to ensure an interleaving pattern: while one channel
# carries an utterance, the other gets silence of exactly the same length
channel_1_segments = []
channel_2_segments = []
for _, fragment in audio_fragments.iterrows():
    samples = fragment.sample_array
    silence = np.zeros(samples.shape[0], dtype=np.float32)

    if fragment.channel == 1:
        channel_1_segments.append(samples)
        channel_2_segments.append(silence)
    else:
        # odd indices are channel 2
        channel_1_segments.append(silence)
        channel_2_segments.append(samples)

# temp save for channel 1/2 audio - saves as mono
# NOTE(review): 22050 is presumably the rate the fragments were loaded at -
# confirm it matches the loading cell.
default_sr = 22050
channel_1_padded = np.concatenate(channel_1_segments)
soundfile.write(output_dir / "channel_1_temp.wav", channel_1_padded, default_sr)

channel_2_padded = np.concatenate(channel_2_segments)
soundfile.write(output_dir / "channel_2_temp.wav", channel_2_padded, default_sr)

# consolidate into an interleaving, channel separated source
left_channel = AudioSegment.from_wav(output_dir / "channel_1_temp.wav")
right_channel = AudioSegment.from_wav(output_dir / "channel_2_temp.wav")

stereo_sound = AudioSegment.from_mono_audiosegments(left_channel, right_channel)
# export() defaults to format="mp3"; request wav explicitly so the contents of
# the file match its .wav extension.
stereo_sound.export(output_dir / "consolidated_final.wav", format="wav")
