In [1]:
import argparse
import json
import shutil
import subprocess
import sys
import time
from typing import Iterator
import os
import requests
import audioop
from pydub import AudioSegment
from types import SimpleNamespace


In [2]:

def convert_wav_chunk_to_ulaw_chunk(wav_chunk, sample_width=2):
    """Encode a chunk of linear PCM audio as u-law.

    Args:
        wav_chunk: Bytes-like fragment of linear PCM samples.
        sample_width: Number of bytes per sample; must be 1, 2 or 4
            (the default, 2, is 16-bit audio).

    Returns:
        The u-law encoded bytes, or ``None`` when ``audioop`` rejects the
        chunk (the error is printed instead of being raised).

    Raises:
        ValueError: If ``sample_width`` is not 1, 2 or 4.
    """
    allowed_widths = {1, 2, 4}
    if sample_width not in allowed_widths:
        raise ValueError("sample_width must be 1, 2, or 4")

    try:
        return audioop.lin2ulaw(wav_chunk, sample_width)
    except audioop.error as exc:
        # Best effort: report the failure and let the caller skip this chunk.
        print(f"Error converting WAV chunk to u-Law: {exc}")
    return None


In [3]:

def is_installed(lib_name: str) -> bool:
    """Return True when ``shutil.which`` can resolve ``lib_name`` to an executable."""
    return shutil.which(lib_name) is not None


In [4]:

def save(audio: bytes, filename: str) -> None:
    """Write ``audio`` to ``filename`` in binary mode, replacing any existing file."""
    with open(filename, "wb") as sink:
        sink.write(audio)


In [5]:

def stream_ffplay(audio_stream, output_file, save=True):
    """Pipe an audio byte stream into an ffmpeg/ffplay child process.

    Args:
        audio_stream: Iterable of byte chunks; ``None`` entries are skipped.
        output_file: Path handed to ffmpeg when ``save`` is true (unused
            for playback).
        save: When true, run ffmpeg to write ``output_file``; otherwise run
            ffplay to play the stream.

    Raises:
        Whatever ``audio_stream`` or the pipe write raises. The child's stdin
        is still closed and the process is still waited on in that case.
    """
    if not save:
        ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"]
    else:
        print("Saving to ", output_file)
        # Earlier variant kept for reference:
        # ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", '-ar', '8000', output_file]
        # NOTE(review): "-i -" comes after the output path here, while ffmpeg
        # applies options to the next file named on the command line - confirm
        # this ordering is intended before relying on the saved file.
        ffplay_cmd = ["ffmpeg", "-probesize", "1024",'-c:a', 'pcm_mulaw', '-ar', '8000', output_file, "-i", '-']

    ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
    try:
        for chunk in audio_stream:
            if chunk is not None:
                ffplay_proc.stdin.write(chunk)
    finally:
        # Always release the pipe and reap the child, even if the stream
        # (e.g. a dropped HTTP response) raises halfway through.
        try:
            ffplay_proc.stdin.close()
        finally:
            ffplay_proc.wait()


In [72]:

def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    """Stream synthesized audio for ``text`` from the TTS server.

    Posts the speaker data together with the synthesis settings to
    ``{server_url}/tts_stream/ulaw`` and yields the response body as it
    arrives. Timing information is printed to stderr.

    Args:
        text: Text to synthesize.
        speaker: Mapping with the speaker data to send. It is copied into the
            request payload; the caller's object is not modified.
        language: Language code sent as ``language``.
        server_url: Base URL of the server (no trailing slash).
        stream_chunk_size: Sent as ``stream_chunk_size``; a smaller value
            gives a faster response but degrades quality.

    Yields:
        Non-empty byte chunks of the response body, requested 512 bytes at
        a time.

    Raises:
        SystemExit: If the server answers with a status other than 200.
    """
    start = time.perf_counter()
    # Build a fresh payload so repeated calls never leak the previous
    # request's settings into the caller's speaker dict.
    payload = {
        **speaker,
        "text": text,
        "language": language,
        "stream_chunk_size": stream_chunk_size,  # you can reduce it to get faster response, but degrade quality
        "add_wav_header": False,
    }
    res = requests.post(
        f"{server_url}/tts_stream/ulaw",
        json=payload,
        stream=True,
    )
    try:
        end = time.perf_counter()
        print(f"Time to make POST: {end-start}s", file=sys.stderr)

        if res.status_code != 200:
            print("Error:", res.text)
            sys.exit(1)

        first = True
        for chunk in res.iter_content(chunk_size=512):
            if first:
                end = time.perf_counter()
                print(f"Time to first chunk: {end-start}s", file=sys.stderr)
                first = False
            if chunk:
                yield chunk

        print("⏱️ response.elapsed:", res.elapsed)
    finally:
        # A streamed response keeps its connection until closed; release it
        # on normal completion, on error, and when the consumer stops early.
        res.close()



In [59]:

def get_speaker(ref_audio,server_url):
    """Upload a reference recording and return the server's speaker data.

    Args:
        ref_audio: Path of the reference audio file to upload.
        server_url: Base URL of the server (no trailing slash).

    Returns:
        The decoded JSON body of the ``/clone_speaker`` response.
    """
    # The context manager closes the reference file even if the upload raises.
    with open(ref_audio, "rb") as wav_handle:
        files = {"wav_file": ("reference.wav", wav_handle)}
        response = requests.post(f"{server_url}/clone_speaker", files=files)
    return response.json()


In [68]:
# --- Run configuration -------------------------------------------------------
server_url = os.getenv("SERVER_URL", "http://15.237.142.70:8000")
text = "Mon nom est Yoann, et je pense que c'est vraiment sympa de manger des crêpes, vous ne trouvez pas ?"
language = 'fr'
ref_file = None
stream_chunk_size = 30

# Pick the first "<prefix><n>.wav" path that does not exist yet.
file_counter = 0
output_file = "./test_outputs/output_french"
while os.path.isfile(f"{output_file}{file_counter}.wav"):
    file_counter += 1
output_file = f"{output_file}{file_counter}.wav"

# Bundle the settings the way a parsed command line would expose them.
args = SimpleNamespace(
    text=text,
    language=language,
    output_file=output_file,
    ref_file=ref_file,
    server_url=server_url,
    stream_chunk_size=stream_chunk_size,
)

# --- Speaker data ------------------------------------------------------------
# Start from the stored speaker file; a reference recording, when configured,
# replaces it with freshly computed data from the server.
speaker_file_path = os.path.join(os.path.curdir, "french_speaker3.json")

with open(speaker_file_path, "r") as file:
    speaker = json.load(file)

if args.ref_file is not None:
    print("Computing the latents for a new reference...")
    speaker = get_speaker(args.ref_file, args.server_url)

# Persist whichever speaker ended up selected.
with open('speaker.json', 'w') as f:
    json.dump(speaker, f)

In [83]:
# tts() is a generator function: this only builds the generator, and the
# request is sent when the stream is first iterated.
audio_stream = tts(
    text=args.text,
    speaker=speaker,
    language=args.language,
    server_url=args.server_url,
    stream_chunk_size=args.stream_chunk_size,
)
# stream_ffplay(audio_stream,args.output_file,bool(args.output_file))

def ulaw_to_segment(segment: bytes) -> AudioSegment:
    """Decode u-law bytes into a mono, 16-bit, 16 kHz ``AudioSegment``.

    The input is expanded to 16-bit linear PCM and then resampled from
    8000 Hz to 16000 Hz. No resampler state is carried over between calls.
    """
    pcm_8k = audioop.ulaw2lin(segment, 2)
    pcm_16k, _state = audioop.ratecv(pcm_8k, 2, 1, 8000, 16000, None)
    return AudioSegment(pcm_16k, frame_rate=16000, channels=1, sample_width=2)

# Disabled attempt: write a WAV header by hand, append the streamed chunks, then patch the size fields
# with open('test.wav', 'wb') as wav_buff:
#     header = b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8data\x00\x00\x00\x00"
#     wav_buff.write(b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8data\x00\x00\x00\x00")
#     # wav_buff.write(b"RIFF\x00\x00\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x02\x00\x00\x00fact\x04\x00\x00\x00\x16\xe8\x00\x00LIST\x1a\x00\x00\x00INFOISFT\r\x00\x00\x00Lavf61.4.100\x00\x00data\x00\x00\x00\x00")
#     datalen = 0
#     for i,chunk in enumerate(audio_stream):
#         datalen+= len(chunk)
#         wav_buff.write(chunk)
#     wav_buff.seek(4)
#     wav_buff.write(bytes(len(header)+datalen-4))
#     wav_buff.seek(len(header)-4)
#     wav_buff.write(bytes(datalen))
# wav_buff.close()


# If the conversion is done at the end, there is no problem.
for i, chunk in enumerate(audio_stream):
    # Dump each raw chunk as it arrives, preceded by a marker line.
    print('new chunk', chunk, sep='\n')
#     if i == 0:
#         wave_data = AudioSegment(chunk, frame_rate=24000,channels=1,sample_width=2)
#     else:
#         wave_data += AudioSegment(chunk, frame_rate=24000,channels=1,sample_width=2)
# AudioSegment.from_file(wave_data.export(out_f='test.wav',format="wav",codec='pcm_mulaw',parameters=["-ar","8000"]))


Time to make POST: 0.13507349998690188s


ChunkedEncodingError: Response ended prematurely

In [82]:
# Byte length of a sample RIFF/WAVE header, measured through the first two
# bytes that follow the "data" tag.
print(len(b'RIFF\x00,\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x00\x00fact\x04\x00\x00\x00\xab+\x00\x00LIST\x1a\x00\x00\x00INFOISFT\r\x00\x00\x00Lavf61.1.100\x00\x00data\xab+'))

90


In [50]:
# Re-encode a previously saved output with the pcm_mulaw codec at 8000 Hz.
# export() is called without a target path and its return value is used as a
# file object below.
f = AudioSegment.from_file("test_outputs/output_french.wav").export(format="wav",codec='pcm_mulaw',parameters=["-ar","8000"])
# Load the exported file back through pydub to compare with the raw bytes.
d = AudioSegment.from_file(f)
f.seek(0)
# Read the whole file. The content is binary, so readlines()[0] would stop at
# the first 0x0A byte and silently truncate the data.
data = f.read()
print(data)
print(d.raw_data)
print(len(d.raw_data))

print(len(data))
# Byte found len(d.raw_data) bytes before the end of the exported file.
print(data[len(data)-len(d.raw_data)])




b"RIFF\xfe\x08\x00\x00WAVEfmt \x12\x00\x00\x00\x07\x00\x01\x00@\x1f\x00\x00@\x1f\x00\x00\x01\x00\x08\x00\x00\x00fact\x04\x00\x00\x00\xaa\x08\x00\x00LIST\x1a\x00\x00\x00INFOISFT\r\x00\x00\x00Lavf61.4.100\x00\x00data\xaa\x08\x00\x00\xc1\xdb\xb4\xae\xa9\xb2\xe7\xdb\xc9\xba\xcdMIIg\xe7[\xff[g[MA882-/)%%\x1c\xcd\xb8M>S\xb0\xaf\xb0\xba\xdb\xb0\xae\xb2\xb2\xc5\xbe\xb8\xc5\xc1\xd3\xdb\xe7[[g\xe7[Sg\xe7\xe7I>E>66/-&\x1f8\xc5A/+0\xa6\xa3\xc1Sg\xaf\xa4\xad\xbc\xdb\xbe\xae\xb0\xc5\xc9\xd3\xcd\xc5\xd3\xff[[[g\xdb\xe7\xffMIIMS4//(IS6,)/%[\xa4\xbc\xd3\xcd\xc5\xa6\xa7\xb6\xc1\xbe\xae\xac\xb4\xc1\xd3\xbe\xbe\xd3[IMSA<M[[EI\xe7[<:A<80/(*'\x1f\x1a\xc1\x9f\xbe\xe7I\xb4\x9e\xa5\xb0\xd3\xb8\xa6\xac\xb0\xbe\xd3\xba\xc1\xcd[EME>AS[AI[[M:8620/--%(!\x1d\xa5\xaa\xba[:\xa5\xa2\xa8\xb0S\xaf\xa7\xab\xb2\xdb\xbe\xc1\xbe\xcd<MEE<A\xe7MM[g\xe7IE>>:42./,)\x1e0\xa3\xb0\xb0A\xff\x9e\xa4\xa7\xbe\xc1\xae\xaa\xa9\xc1\xe7\xc1\xc5\xc5[IESM<[\xe7SMM\xdbIME0>/0/*.#(\x1e\xc9\xa8\xb8\xdb[\xb2\xa7\xa4\xad\xc9\xb4\xac\xaa\xb0\xc1\x