In [59]:
import pandas as pd
import srsly
from pathlib import Path
import time
from uuid import uuid4
import gtts
import librosa
import numpy as np
import soundfile
from pydub import AudioSegment
import shutil

from IPython.core.interactiveshell import InteractiveShell
# Display every top-level expression result in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# Show up to 100 characters per DataFrame cell before pandas truncates the text.
pd.set_option("display.max_colwidth", 100)

In [70]:
def format_episode(episode_raw):
    """Flatten one raw self-chat episode into a per-utterance DataFrame.

    Parameters
    ----------
    episode_raw : dict
        Must contain a ``'dialog'`` key holding an iterable of turns, where
        each turn is an iterable of dicts with at least ``'id'`` and
        ``'text'`` keys.

    Returns
    -------
    pandas.DataFrame
        One row per utterance, in dialog order, with columns
        ``exchange_index`` (0-based position), ``id``, ``text`` and
        ``speaker`` (the integer found after the first underscore of ``id``).
        An episode without utterances yields an empty frame that still has
        these four columns.

    Raises
    ------
    IndexError, ValueError
        If an ``id`` has no underscore-separated second part, or that part
        is not an integer.
    """
    # NOTE(review): ids presumably look like "<agent>_<n>" — confirm upstream.
    records = [
        {
            'id': utterance['id'],
            'text': utterance['text'],
            'speaker': int(utterance['id'].split('_')[1]),
        }
        for turn in episode_raw['dialog']
        for utterance in turn
    ]

    # Explicit columns keep the schema stable even when there are no records
    # (a bare pd.DataFrame([]) would have no `id` column to derive from).
    return (pd.DataFrame(records, columns=['id', 'text', 'speaker'])
            .reset_index()
            .rename(columns={'index': 'exchange_index'})
            )


def synthesize_tts_episode(episode_df, output_dir, tld_by_speaker=None, delay=1):
    """Synthesize one mp3 per utterance of an episode with gTTS.

    Parameters
    ----------
    episode_df : pandas.DataFrame
        One row per utterance with ``exchange_index``, ``speaker`` and
        ``text`` columns (as produced by ``format_episode``).
    output_dir : pathlib.Path
        Existing directory; files are written as
        ``<exchange_index>_speaker_<speaker>.mp3``.
    tld_by_speaker : dict, optional
        Maps a speaker number to the gTTS ``tld`` used for that speaker's
        voice. Defaults to ``{1: 'com', 2: 'ca'}``.
    delay : float, optional
        Seconds to sleep before each request (default 1).

    Raises
    ------
    ValueError
        If any row has a speaker without an entry in ``tld_by_speaker``.
        Raised before any audio is requested, so no partial output is written.
    """
    if tld_by_speaker is None:
        tld_by_speaker = {1: 'com', 2: 'ca'}

    rows = [row for _, row in episode_df.iterrows()]

    # Validate up front: previously an unexpected speaker left `tts` unbound
    # (first row) or pointing at the previous row's audio, which was then
    # saved under the new file name.
    unknown = sorted({row.speaker for row in rows} - set(tld_by_speaker))
    if unknown:
        raise ValueError(
            "no TTS voice configured for speaker(s): "
            + ", ".join(str(speaker) for speaker in unknown))

    for row in rows:
        time.sleep(delay)  # throttle requests; prevent IP banning?
        save_path = output_dir / f"{row.exchange_index}_speaker_{row.speaker}.mp3"

        # a different tld per speaker gives each one a distinguishable voice
        tts = gtts.gTTS(row.text, lang='en',
                        tld=tld_by_speaker[row.speaker], slow=True)
        tts.save(save_path)


def get_utterance_df(output_dir):
    """Load every synthesized mp3 under ``output_dir`` into a DataFrame.

    Files are expected to be named ``<exchange_index>_speaker_<speaker>.mp3``
    and are returned in *numeric* exchange order. A plain string sort would
    place ``10_...`` before ``2_...`` and scramble the dialog once an episode
    has ten or more utterances.

    Parameters
    ----------
    output_dir : pathlib.Path
        Directory searched recursively for ``*.mp3`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``file`` (file name), ``sample_array`` (samples returned by
        ``librosa.load``), ``sample_array_shape`` (length of that array) and
        ``channel`` (first digit after the last underscore of the file name).
        Empty, but with the same columns, when no mp3 is found.

    Raises
    ------
    ValueError
        If an mp3 file name does not start with an integer exchange index.
    """
    def exchange_order(mp3_path):
        # numeric exchange index first; full path breaks ties deterministically
        return int(mp3_path.name.split('_')[0]), mp3_path.as_posix()

    audio_file_records = []
    for mp3_path in sorted(output_dir.rglob('./*.mp3'), key=exchange_order):
        # librosa resamples to its default rate; the returned rate is not needed here
        samples, _ = librosa.load(mp3_path.as_posix())
        audio_file_records.append(
            {'file': mp3_path.name,
             'sample_array': samples,
             'sample_array_shape': samples.shape[0]})

    # Explicit columns keep the schema intact when the directory has no mp3s.
    return (pd.DataFrame(audio_file_records,
                         columns=['file', 'sample_array', 'sample_array_shape'])
            .assign(channel=lambda x: x.file.apply(lambda y: int(y.split('_')[-1][0])))
            .reset_index(drop=True)
            )


def pad_and_collate_channel_audio(utterance_df, output_dir, sample_rate=22050):
    """Write one mono wav per speaker channel, padded so utterances interleave.

    For every utterance, the speaking channel receives the samples and the
    other channel receives the same number of zero samples, so both channel
    files end up with identical length and the speakers never overlap.

    Parameters
    ----------
    utterance_df : pandas.DataFrame
        Rows in playback order with ``file`` and ``sample_array`` columns.
        When a ``channel`` column (values 1 or 2) is present it decides the
        target channel; otherwise even index labels go to channel 1 and odd
        ones to channel 2.
    output_dir : pathlib.Path
        Directory receiving ``channel_1_temp.wav`` and ``channel_2_temp.wav``.
    sample_rate : int, optional
        Sample rate written to both files. The default, 22050, is the rate
        ``librosa.load`` resamples to when no ``sr`` is given.

    Raises
    ------
    ValueError
        If ``utterance_df`` is empty or a ``channel`` value is not 1 or 2.
    """
    if len(utterance_df) == 0:
        # np.concatenate([]) would fail later with a far less helpful message
        raise ValueError("utterance_df is empty; there is no audio to collate")

    # Prefer the channel parsed from the file name over row parity: parity
    # silently mis-assigns audio when speakers do not strictly alternate or
    # when speaker 2 opens the dialog.
    route_by_column = 'channel' in utterance_df.columns

    channel_1_segments = []
    channel_2_segments = []
    for idx, e in utterance_df.iterrows():
        is_even = idx % 2 == 0
        # progress trace of the row's index parity
        print(f"{e.file} is {'even' if is_even else 'odd'}")

        if route_by_column:
            channel = int(e.channel)
        else:
            channel = 1 if is_even else 2
        if channel not in (1, 2):
            raise ValueError(
                f"unsupported channel {channel} for {e.file}; expected 1 or 2")

        # equivalent-length silence for whichever channel is not speaking
        silence = np.zeros(e.sample_array.shape[0], dtype=np.float32)
        if channel == 1:
            channel_1_segments.append(e.sample_array)
            channel_2_segments.append(silence)
        else:
            channel_2_segments.append(e.sample_array)
            channel_1_segments.append(silence)

    # temp save for channel 1/2 audio - saves as mono
    channel_1_padded = np.concatenate(channel_1_segments)
    soundfile.write(output_dir / 'channel_1_temp.wav',
                    channel_1_padded, sample_rate)

    channel_2_padded = np.concatenate(channel_2_segments)
    soundfile.write(output_dir / 'channel_2_temp.wav',
                    channel_2_padded, sample_rate)


def collate_channel_audio(output_dir):
    """Merge the two temporary mono channel files into one stereo wav.

    Reads ``channel_1_temp.wav`` (left) and ``channel_2_temp.wav`` (right)
    from ``output_dir`` and writes ``<output_dir.name>_final.wav`` next to
    them.

    Parameters
    ----------
    output_dir : pathlib.Path
        Directory holding the two channel files; also receives the result.
    """
    left_channel = AudioSegment.from_wav(output_dir / 'channel_1_temp.wav')
    right_channel = AudioSegment.from_wav(output_dir / 'channel_2_temp.wav')

    # combine the mono segments into the left/right channels of one stereo segment
    stereo_sound = AudioSegment.from_mono_audiosegments(
        left_channel, right_channel)

    # export() defaults to format='mp3'; without an explicit format the
    # "*_final.wav" file would not contain WAV data.
    exported = stereo_sound.export(
        output_dir / f"{output_dir.name}_final.wav", format='wav')
    # export() hands back the open output file object; close it so the
    # handle is not leaked.
    exported.close()

In [61]:
# Root folder; each synthesized episode gets its own sub-directory beneath it.
base_output_dir = Path(
    '/home/samhardyhey/otso-rand/experiment_artefacts/stt_adjacent_processing/synthesis/'
)

# All self-chat episodes, read eagerly from the jsonl file into a list.
self_chat = [
    *srsly.read_jsonl(
        '/home/samhardyhey/otso-rand/experiment_artefacts/stt_data_synthesis/exp_a/TransformerGenerator_1_TransformerGenerator_2_selfchat.jsonl'
    )
]

In [71]:
# for episode in self_chat:
# Only the first episode is processed here. The raw record keeps its own name
# so that `episode` always refers to the formatted DataFrame.
episode_raw = self_chat[0]
# format episode, keeping just the first 8 utterances
episode = format_episode(episode_raw).head(8)

# create unique output dir
episode_id = uuid4().hex
output_dir = base_output_dir / episode_id
# exist_ok/parents already make this call safe to repeat; no exists() check needed
output_dir.mkdir(exist_ok=True, parents=True)

# synthesize, save audio
synthesize_tts_episode(episode, output_dir)

# retrieve raw audio amplitude arrays
utterance_df = get_utterance_df(output_dir)



In [94]:
# Pad each speaker's utterances with silence and save one mono wav per channel
# (channel_1_temp.wav / channel_2_temp.wav) inside output_dir.
pad_and_collate_channel_audio(utterance_df, output_dir)

# Merge the two channel files into the stereo "<episode id>_final.wav".
collate_channel_audio(output_dir)

0_speaker_1.mp3 is even
1_speaker_2.mp3 is odd
2_speaker_1.mp3 is even
3_speaker_2.mp3 is odd
4_speaker_1.mp3 is even
5_speaker_2.mp3 is odd
6_speaker_1.mp3 is even
7_speaker_2.mp3 is odd


In [97]:
# Display the directory holding this episode's synthesized audio.
output_dir

PosixPath('/home/samhardyhey/otso-rand/experiment_artefacts/stt_adjacent_processing/synthesis/8c0bcaf407aa464d80c86d7fcfddb568')

In [30]:
from pydub import AudioSegment

In [37]:
# Load the per-channel and merged audio of one episode for manual inspection.
# NOTE(review): these paths are pinned to episode 28f812fb47584582b864b7655859ead1,
# which is not the `output_dir` displayed in the recorded output above — confirm
# which run this cell is meant to inspect.
channel_1 = AudioSegment.from_file('/home/samhardyhey/otso-rand/experiment_artefacts/stt_adjacent_processing/synthesis/28f812fb47584582b864b7655859ead1/channel_1_temp.wav')
channel_2 = AudioSegment.from_file('/home/samhardyhey/otso-rand/experiment_artefacts/stt_adjacent_processing/synthesis/28f812fb47584582b864b7655859ead1/channel_2_temp.wav')
merged = AudioSegment.from_file('/home/samhardyhey/otso-rand/experiment_artefacts/stt_adjacent_processing/synthesis/28f812fb47584582b864b7655859ead1/28f812fb47584582b864b7655859ead1_final.wav')

In [None]:
# put the final wavs in a single dir
episode_wav_dir = base_output_dir / 'episode_wavs'
# exist_ok/parents already make this call safe to repeat; no exists() check needed
episode_wav_dir.mkdir(exist_ok=True, parents=True)

for e in list(base_output_dir.rglob('./*.wav')):
    # Match on the file name only: testing the whole path would also pick up
    # the temp channel files if any parent directory contained "final".
    if not e.name.endswith('_final.wav'):
        continue
    # Files gathered by an earlier run are already in place.
    if e.parent == episode_wav_dir:
        continue
    shutil.move(e.as_posix(), (episode_wav_dir / e.name).as_posix())

## Potentially experiment with different TLDs for different voices

In [None]:
.google.com .google.ad .google.ae .google.com.af .google.com.ag .google.com.ai .google.al .google.am .google.co.ao .google.com.ar .google.as .google.at .google.com.au .google.az .google.ba .google.com.bd .google.be .google.bf .google.bg .google.com.bh .google.bi .google.bj .google.com.bn .google.com.bo .google.com.br .google.bs .google.bt .google.co.bw .google.by .google.com.bz .google.ca .google.cd .google.cf .google.cg .google.ch .google.ci .google.co.ck .google.cl .google.cm .google.cn .google.com.co .google.co.cr .google.com.cu .google.cv .google.com.cy .google.cz .google.de .google.dj .google.dk .google.dm .google.com.do .google.dz .google.com.ec .google.ee .google.com.eg .google.es .google.com.et .google.fi .google.com.fj .google.fm .google.fr .google.ga .google.ge .google.gg .google.com.gh .google.com.gi .google.gl .google.gm .google.gr .google.com.gt .google.gy .google.com.hk .google.hn .google.hr .google.ht .google.hu .google.co.id .google.ie .google.co.il .google.im .google.co.in .google.iq .google.is .google.it .google.je .google.com.jm .google.jo .google.co.jp .google.co.ke .google.com.kh .google.ki .google.kg .google.co.kr .google.com.kw .google.kz .google.la .google.com.lb .google.li .google.lk .google.co.ls .google.lt .google.lu .google.lv .google.com.ly .google.co.ma .google.md .google.me .google.mg .google.mk .google.ml .google.com.mm .google.mn .google.ms .google.com.mt .google.mu .google.mv .google.mw .google.com.mx .google.com.my .google.co.mz .google.com.na .google.com.ng .google.com.ni .google.ne .google.nl .google.no .google.com.np .google.nr .google.nu .google.co.nz .google.com.om .google.com.pa .google.com.pe .google.com.pg .google.com.ph .google.com.pk .google.pl .google.pn .google.com.pr .google.ps .google.pt .google.com.py .google.com.qa .google.ro .google.ru .google.rw .google.com.sa .google.com.sb .google.sc .google.se .google.com.sg .google.sh .google.si .google.sk .google.com.sl .google.sn .google.so .google.sm .google.sr .google.st 
.google.com.sv .google.td .google.tg .google.co.th .google.com.tj .google.tl .google.tm .google.tn .google.to .google.com.tr .google.tt .google.com.tw .google.co.tz .google.com.ua .google.co.ug .google.co.uk .google.com.uy .google.co.uz .google.com.vc .google.co.ve .google.vg .google.co.vi .google.com.vn .google.vu .google.ws .google.rs .google.co.za .google.co.zm .google.co.zw .google.cat

In [None]:
# Scratch copy of the speaker-1 synthesis call used in synthesize_tts_episode.
# NOTE(review): no top-level cell visible in this notebook binds `e` to an
# utterance row, so this cell presumably relies on leftover interactive state —
# confirm before running it.
tts = gtts.gTTS(e.text, lang='en', tld='com', slow=True)

In [None]:
# Scratch copy of the per-speaker voice selection in synthesize_tts_episode:
# speaker 1 uses the 'com' tld, speaker 2 the 'ca' tld.
# NOTE(review): no top-level cell visible in this notebook binds `e` to an
# utterance row, so this cell presumably relies on leftover interactive state —
# confirm before running it. Any other speaker value leaves `tts` untouched.
if e.speaker == 1:
    tts = gtts.gTTS(e.text, lang='en', tld='com', slow=True)
elif e.speaker == 2:
    tts = gtts.gTTS(e.text, lang='en', tld='ca', slow=True)

In [77]:
from gtts import gTTS
from io import BytesIO

# Smoke test: synthesize a fixed sentence, save it as mp3, then load the file
# back as an AudioSegment so it shows up as the cell's output.
temp_audio_path = '/home/samhardyhey/temp_audio.mp3'
tts = gtts.gTTS(
    'hello world my name is Sam',
    lang='en',
    tld='com',
    slow=True,
)
tts.save(temp_audio_path)
AudioSegment.from_file(temp_audio_path)