In [70]:
import argparse
import json
import shutil
import subprocess
import sys
import time
from typing import Iterator
import os
import requests
import audioop
from pydub import AudioSegment
from types import SimpleNamespace
import io
import torch
import numpy as np

In [71]:
import base64

def to_bytes(b64: str) -> bytes:
    return base64.b64decode(b64)

def from_bytes(byt: bytes) -> str:
    return base64.b64encode(byt).decode()

In [72]:

def convert_wav_chunk_to_ulaw_chunk(wav_chunk, sample_width=2): 
    # The sample_width parameter corresponds to the number of bytes used per sample, default is 2 for 16-bit audio
    
    if sample_width not in {1, 2, 4}:
        raise ValueError("sample_width must be 1, 2, or 4")
        
    # Convert the WAV audio chunk to u-Law encoding 
    try:
        ulaw_chunk = audioop.lin2ulaw(wav_chunk, sample_width) 
    except audioop.error as e:
        print(f"Error converting WAV chunk to u-Law: {e}")
        return None
    
    return ulaw_chunk


In [73]:

def is_installed(lib_name: str) -> bool:
    lib = shutil.which(lib_name)
    if lib is None:
        return False
    return True


In [74]:

def save(audio: bytes, filename: str) -> None:
    with open(filename, "wb") as f:
        f.write(audio)


In [75]:

def stream_ffplay(audio_stream, output_file, save=True):
    if not save:
        ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"]
    else:
        print("Saving to ", output_file)
        # ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", '-ar', '8000', output_file]
        ffplay_cmd = ["ffmpeg", "-probesize", "1024",'-c:a', 'pcm_mulaw', '-ar', '8000', output_file, "-i", '-']

    ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
    for chunk in audio_stream:
        if chunk is not None:
            ffplay_proc.stdin.write(chunk)

    # close on finish
    ffplay_proc.stdin.close()
    ffplay_proc.wait()


In [89]:

def tts(text, voice_id, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    start = time.perf_counter()
    payload = {}
    payload["text"] = text
    payload["voice_id"] = voice_id
    payload["language"] = language
    payload["stream_chunk_size"] = stream_chunk_size  # you can reduce it to get faster response, but degrade quality
    payload["add_wav_header"] = False
    res = requests.post(
        f"{server_url}/tts_stream",
        json=payload,   
        stream=True,
    )
    end = time.perf_counter()
    print(f"Time to make POST: {end-start}s", file=sys.stderr)

    if res.status_code != 200:
        print("Error:", res.text)
        sys.exit(1)

    first = True
    for chunk in res.iter_content(chunk_size=stream_chunk_size):
        if first:
            end = time.perf_counter()
            print(f"Time to first chunk: {end-start}s", file=sys.stderr)
            first = False
        if chunk:
            yield chunk

    print("⏱️ response.elapsed:", res.elapsed)



In [90]:

def get_speaker(ref_audio,server_url):
    files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
    response = requests.post(f"{server_url}/clone_speaker", files=files)
    return response.json()


In [104]:
server_url = os.getenv("SERVER_URL", "http://13.38.113.161:8000")

text = "Mon nom est Yoann, et je pense que c'est vraiment sympa de manger des crêpes, vous ne trouvez pas ?"
language = 'fr'
ref_file = None
stream_chunk_size = 300
file_counter = 0
output_file = "./test_outputs/output_french"
while os.path.isfile(output_file+str(file_counter)+'.wav'):
    file_counter += 1
output_file = output_file+str(file_counter)+'.wav'
voice_id = 'french_speaker3'

In [105]:
audio_stream = tts(
        text=text,
        voice_id=voice_id,
        language=language,
        server_url=server_url,
        stream_chunk_size=stream_chunk_size
    )
# stream_ffplay(audio_stream,args.output_file,bool(args.output_file))

def ulaw_to_segment(segment: bytes) -> AudioSegment:
    audio_data = audioop.ulaw2lin(segment, 2)
    audio_data = audioop.ratecv(audio_data, 2, 1, 8000, 16000, None)[0]
    return AudioSegment(audio_data, frame_rate=16000, channels=1, sample_width=2)

# Try to do a header
# with open('test.wav', 'wb') as wav_buff:
#     header = b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8data\x00\x00\x00\x00"
#     wav_buff.write(b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8data\x00\x00\x00\x00")
#     # wav_buff.write(b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8\x00\x00LIST\x1a\x00\x00\x00INFOISFT\r\x00\x00\x00Lavf61.4.100\x00\x00data\x00\x00\x00\x00")
#     datalen = 0
#     for i,chunk in enumerate(audio_stream):
#         datalen+= len(chunk)
#         wav_buff.write(chunk)
#     wav_buff.seek(4)
#     wav_buff.write(bytes(len(header)+datalen-4))
#     wav_buff.seek(len(header)-4)
#     wav_buff.write(bytes(datalen))
# wav_buff.close()

def postprocess(wav):
    """Post process the output waveform"""
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    wav = wav.clone().detach().cpu().numpy()
    wav = wav[None, : int(wav.shape[0])]
    wav = np.clip(wav, -1, 1)
    wav = (wav * 32767).astype(np.int16)
    return wav

chunks = []
for i,chunk in enumerate(audio_stream):
    print(chunk)
    chunks.append(chunk)
print(f"{len(chunks)} chunks")


# Si la conversion est faite à la fin il n'y a pas de problème

#     if i == 0:
#         wave_data = AudioSegment(chunk, frame_rate=24000,channels=1,sample_width=2)
#     else:
#         wave_data += AudioSegment(chunk, frame_rate=24000,channels=1,sample_width=2)
# AudioSegment.from_file(wave_data.export(out_f='test.wav',format="wav",codec='pcm_mulaw',parameters=["-ar","8000"]))


Time to make POST: 0.01607549993786961s


b'\x02\x00\xfa\xff\x00\x00\x02\x00\xfe\xff\x04\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\xff\xff\xff\xff\xff\xff\x00\x00\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x01\x00\x02\x00\x02\x00\x01\x00\x01\x00\x01\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xfe\xff\xfe\xff\xff\xff\xfe\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x

Time to first chunk: 3.393428400042467s


In [129]:
header = b'RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\xc0\x5d\x00\x00\x80\xbb\x00\x00\x02\x00\x10\x00data\x00\x00\x00\x01'
print(len(header))
chunks_header = []
for chunk in chunks:
    chunks_header.append(header+chunk)

44


In [130]:
def wav_to_ulaw(wav_file: io.BytesIO) -> bytes:
    segment = AudioSegment.from_wav(wav_file)
    segment = segment.set_frame_rate(8000)
    segment = segment.set_channels(1)
    segment = segment.set_sample_width(2)
    ulaw_audio = audioop.lin2ulaw(segment.raw_data, 2)
    return ulaw_audio


In [132]:
ulaw = []
for chunk in chunks_header:
    print(chunk)
    file = io.BytesIO(chunk)
    file.seek(0)
    print(file.read())
    segment = AudioSegment.from_file(file, format='wav')
    print(segment.raw_data)



b'RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\xc0]\x00\x00\x80\xbb\x00\x00\x02\x00\x10\x00data\x00\x00\x00\x01\x02\x00\xfa\xff\x00\x00\x02\x00\xfe\xff\x04\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\xff\xff\xff\xff\xff\xff\x00\x00\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x01\x00\x02\x00\x02\x00\x01\x00\x01\x00\x01\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xfe\xff\xfe\xff\xff\xff\xfe\xff\

In [97]:
for ulaw_chunk in ulaw:
    print(ulaw_chunk)

b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''
b''


In [11]:
for i,chunk in enumerate(chunks):
    if i == 0:
        wav_segment = AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
    else:
        wav_segment += AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
print('Wav data reconstructed')
wav_segment

Wav data reconstructed


In [12]:
for i,chunk in enumerate(chunks):
    if i == 0:
        wav_segment = AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
    else:
        wav_segment += AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
chunk_segment_ulaw = AudioSegment.from_file(wav_segment.export(format="wav",codec='pcm_mulaw',parameters=["-ar","8000"]))
print('Wav data reconsructed then converted to ulaw then reopened in audioSegment')
chunk_segment_ulaw

Wav data reconsructed then converted to ulaw then reopened in audioSegment


In [13]:
for i,chunk in enumerate(chunks):
    wav_segment = AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
    if i == 0:
        chunk_segment_ulaw = AudioSegment.from_file(wav_segment.export(format="wav",codec='pcm_mulaw',parameters=["-ar","8000"]))
    else:
        chunk_segment_ulaw += AudioSegment.from_file(wav_segment.export(format="wav",codec='pcm_mulaw',parameters=["-ar","8000"]))
print('Wav data converted to ulaw then reopened in audioSegment then reconstructed')
chunk_segment_ulaw

Wav data converted to ulaw then reopened in audioSegment then reconstructed


In [39]:
def wav_to_ulaw(wav_file: io.BytesIO) -> bytes:
    segment = AudioSegment.from_wav(wav_file)
    segment = segment.set_frame_rate(8000)
    segment = segment.set_channels(1)
    segment = segment.set_sample_width(2)
    ulaw_audio = audioop.lin2ulaw(segment.raw_data, 2)
    return ulaw_audio

for i,chunk in enumerate(chunks):
    if i == 0:
        wav_segment = AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
    else:
        wav_segment += AudioSegment(chunk,frame_rate=24000,channels=1,sample_width=2)
ulaw_bytes = wav_to_ulaw(wav_segment.export(format='wav'))
print('Wav data reconsructed then converted to ulaw with other method')
AudioSegment(ulaw_bytes, frame_rate=8000,channels=1,sample_width=2)


Wav data reconsructed then converted to ulaw with other method


In [19]:
ffmpeg = FFmpeg().option("y").input("pipe:0").output("output.wav")

buffer = io.BytesIO()
for i,chunk in enumerate(chunks):
    buffer.write(chunk)

@ffmpeg.on("progress")
def on_progress(progress: Progress):
    print(progress)

ffmpeg.execute(buffer)

FFmpegError: Error opening input files: Invalid data found when processing input