In [1]:
import argparse
import json
import shutil
import subprocess
import sys
import time
from typing import Iterator
import os
import requests
import audioop
from pydub import AudioSegment
from types import SimpleNamespace


In [2]:

def convert_wav_chunk_to_ulaw_chunk(wav_chunk, sample_width=2): 
    # The sample_width parameter corresponds to the number of bytes used per sample, default is 2 for 16-bit audio
    
    if sample_width not in {1, 2, 4}:
        raise ValueError("sample_width must be 1, 2, or 4")
        
    # Convert the WAV audio chunk to u-Law encoding 
    try:
        ulaw_chunk = audioop.lin2ulaw(wav_chunk, sample_width) 
    except audioop.error as e:
        print(f"Error converting WAV chunk to u-Law: {e}")
        return None
    
    return ulaw_chunk


In [3]:

def is_installed(lib_name: str) -> bool:
    lib = shutil.which(lib_name)
    if lib is None:
        return False
    return True


In [4]:

def save(audio: bytes, filename: str) -> None:
    with open(filename, "wb") as f:
        f.write(audio)


In [5]:

def stream_ffplay(audio_stream, output_file, save=True):
    if not save:
        ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"]
    else:
        print("Saving to ", output_file)
        # ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", '-ar', '8000', output_file]
        ffplay_cmd = ["ffmpeg", "-probesize", "1024",'-c:a', 'pcm_mulaw', '-ar', '8000', output_file, "-i", '-']

    ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
    for chunk in audio_stream:
        if chunk is not None:
            ffplay_proc.stdin.write(chunk)

    # close on finish
    ffplay_proc.stdin.close()
    ffplay_proc.wait()


In [6]:

def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    start = time.perf_counter()
    speaker["text"] = text
    speaker["language"] = language
    speaker["stream_chunk_size"] = stream_chunk_size  # you can reduce it to get faster response, but degrade quality
    speaker["add_wav_header"] = False
    res = requests.post(
        f"{server_url}/tts_stream/ulaw",
        json=speaker,
        stream=True,
    )
    end = time.perf_counter()
    print(f"Time to make POST: {end-start}s", file=sys.stderr)

    if res.status_code != 200:
        print("Error:", res.text)
        sys.exit(1)

    first = True
    for chunk in res.iter_content(chunk_size=512):
        if first:
            end = time.perf_counter()
            print(f"Time to first chunk: {end-start}s", file=sys.stderr)
            first = False
        if chunk:
            yield chunk

    print("⏱️ response.elapsed:", res.elapsed)



In [7]:

def get_speaker(ref_audio,server_url):
    files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
    response = requests.post(f"{server_url}/clone_speaker", files=files)
    return response.json()


In [8]:
args = SimpleNamespace()
server_url = os.getenv("SERVER_URL", "http://15.237.142.70:8000")
text = "Mon nom est Yoann, et je pense que c'est vraiment sympa de manger des crêpes, vous ne trouvez pas ?"
language = 'fr'
ref_file = None
stream_chunk_size = 30
file_counter = 0
output_file = "./test_outputs/output_french"
while os.path.isfile(output_file+str(file_counter)+'.wav'):
    file_counter += 1
output_file = output_file+str(file_counter)+'.wav'

args.text = text
args.language = language
args.output_file = output_file
args.ref_file = ref_file
args.server_url = server_url
args.stream_chunk_size = stream_chunk_size


speaker_file_path = os.path.join(os.path.curdir,"french_speaker3.json")

with open(speaker_file_path, "r") as file:
    speaker = json.load(file)

if args.ref_file is not None:
    print("Computing the latents for a new reference...")
    speaker = get_speaker(args.ref_file, args.server_url)

In [15]:
import wave2
audio_stream = tts(
        args.text,
        speaker,
        args.language,
        args.server_url,
        args.stream_chunk_size
    )
# stream_ffplay(audio_stream,args.output_file,bool(args.output_file))

def ulaw_to_segment(segment: bytes) -> AudioSegment:
    audio_data = audioop.ulaw2lin(segment, 2)
    audio_data = audioop.ratecv(audio_data, 2, 1, 8000, 16000, None)[0]
    return AudioSegment(audio_data, frame_rate=16000, channels=1, sample_width=2)


with wave2.open("test2.ulaw", 'wb') as f:
    f.setcomptype('ULAW', 'CCITT G.711 u-law')
    f.setframerate(8000)
    f.setsampwidth(2)
    for i,chunk in enumerate(audio_stream):
        f.writeframes(chunk)

Exception ignored in: <function Wave_write.__del__ at 0x000001B4DC377E20>
Traceback (most recent call last):
  File "c:\Users\helpd\Documents\Dev\Diago\xtts-streaming-server\test\wave2.py", line 360, in __del__
    self.close()
  File "c:\Users\helpd\Documents\Dev\Diago\xtts-streaming-server\test\wave2.py", line 474, in close
    self._ensure_header_written(0)
  File "c:\Users\helpd\Documents\Dev\Diago\xtts-streaming-server\test\wave2.py", line 490, in _ensure_header_written
    raise Error('# channels not specified')
wave2.Error: # channels not specified


AttributeError: __enter__