https://github.com/Azure-Samples/aoai-realtime-audio-sdk

In [9]:
import asyncio
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
#from rtclient.models import UserMessageItem, InputTextContentPart, NoTurnDetection, ServerVAD, RTResponse
from rtclient import RTClient, RTResponse, ServerVAD, NoTurnDetection, UserMessageItem, InputTextContentPart,InputAudioTranscription
import numpy as np
import os
import sounddevice as sd
import base64
import time
import collections
import queue
load_dotenv(override=True)


# Azure OpenAI connection settings, read from the process environment
# (load_dotenv above may have populated it). os.getenv returns None for a
# variable that is not set, so these may be None.
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")





In [2]:
async def simple_text():
    """Run an interactive, text-only chat loop against the realtime endpoint.

    Each prompt read from standard input is sent as a user message and the
    text parts of the model's reply are printed. Typing "stop" (any letter
    case, surrounding whitespace ignored) ends the conversation.
    """
    key_credential = AzureKeyCredential(azure_openai_api_key)

    # The async context manager owns the connection. Leaving the block is
    # expected to close the client, so the loop just breaks instead of calling
    # close() a second time from inside the block.
    async with RTClient(
        url=azure_openai_endpoint,
        key_credential=key_credential,
        azure_deployment=azure_openai_deployment,
    ) as client:
        # Text modality only; with NoTurnDetection a reply is requested
        # explicitly through generate_response() below.
        await client.configure(modalities={"text"}, turn_detection=NoTurnDetection())

        while True:
            user_input = input("You: ")

            if user_input.strip().lower() == "stop":
                print("Conversation ended.")
                break

            message = UserMessageItem(content=[InputTextContentPart(text=user_input)])
            await client.send_item(item=message)

            response = await client.generate_response()

            async for item in response:
                if item.type != "message":
                    continue
                async for part in item:
                    if part.type != "text":
                        continue
                    # Collect the streamed fragments, then join once.
                    text = "".join([chunk async for chunk in part.text_chunks()])
                    print(f"Received text: {text}")



In [3]:
# Run the text chat demo. Top-level await is used here the same way the later
# cells do it for Jupyter, instead of asyncio.run().
await simple_text()

You:  stop


Conversation ended.


In [11]:
async def process_response(client):
    """Wait for the next model response on *client* and play its audio.

    Events are read from ``client.events()`` until a ``response`` event
    arrives. An ``input_audio`` event seen on the way is awaited first
    (presumably this completes once its transcript is available -- confirm
    against the SDK). Every audio part of the response is then buffered
    completely and played with ``play_audio``.

    The function returns after one response has been handled, so the caller
    can start the next turn. Errors are printed, not raised, as before.
    """
    print("in process response")
    try:
        response = None
        # A single events() iterator is used for the whole wait instead of
        # creating a fresh generator for every event.
        async for event in client.events():
            if event.type == "input_audio":
                print("input_item")
                await event
                if not event.transcript:
                    print("Warning: input audio item has no transcript.")
            elif event.type == "response":
                print("response")
                response = event
                break

        if response is None:
            print("Event stream ended before a response was received.")
            return

        print(f"Response ID: {response.id}")
        print(f"Response Status: {response.status}")
        print(f"Response Status Details: {response.status_details}")
        print(f"Response Output: {response.output}")
        print(f"Response Usage: {response.usage}")

        async for item in response:
            if item.type != "message":
                continue
            async for part in item:
                print(part.type)
                if part.type != "audio":
                    continue

                audio_data = b"".join([chunk async for chunk in part.audio_chunks()])

                # int16 samples are two bytes wide; np.frombuffer rejects a
                # buffer with a dangling byte.
                if len(audio_data) % 2 != 0:
                    print("Warning: Misaligned buffer size, trimming extra byte.")
                    audio_data = audio_data[:-1]

                play_audio(np.frombuffer(audio_data, dtype=np.int16))
    except Exception as e:
        print(e)

In [12]:
# Helper function to record audio from microphone
async def record_and_send_audio(client, duration=5, fs=24000):
    """Record a fixed-length clip from the microphone, send it and commit it.

    Parameters:
    client: connected realtime client providing ``send_audio`` and ``commit_audio``.
    duration: recording length in seconds.
    fs: sampling rate in Hz.

    Note: ``sd.wait()`` blocks the calling thread until the recording is
    complete, so nothing else runs on the event loop in the meantime.
    """
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")

    # Flatten (frames, 1) to (frames,) and hand over raw PCM16 bytes, the same
    # form stream_audio() passes to send_audio().
    await client.send_audio(np.squeeze(audio).tobytes())
    await client.commit_audio()
    




# State shared between the input-stream callback and the recording loop.
pre_roll_buffer = collections.deque(maxlen=20)  # most recent 20 captured blocks
last_speech_time = time.time()  # wall-clock time of the last loud block
human_speaking = False  # True while the latest block was loud


def detect_speech(indata, frames, time1, status):
    """Input-stream callback: classify one captured block as speech or not.

    Every block is copied into ``pre_roll_buffer``. A block counts as speech
    when ten times the Euclidean norm of its samples exceeds 100000; in that
    case ``last_speech_time`` is refreshed. ``human_speaking`` always reflects
    the most recent block only. ``frames``, ``time1`` and ``status`` are part
    of the callback signature but are not used.
    """
    global human_speaking, last_speech_time

    pre_roll_buffer.append(indata.copy())

    loudness = 10 * np.linalg.norm(indata)
    is_speech = bool(loudness > 100000)
    if is_speech:
        print("Speech detected!")
        last_speech_time = time.time()
    human_speaking = is_speech

# Stream audio, collating and sending in real-time
async def record_and_send_audio_streaming(client, fs=24000, speech_timeout=2.0, idle_timeout=3.0):
    """Stream microphone audio to *client* while speech is detected, then commit.

    An input stream with ``detect_speech`` as callback keeps ``human_speaking``,
    ``last_speech_time`` and ``pre_roll_buffer`` up to date. While speech is
    flagged, half-second chunks are recorded and sent; the first chunk of a
    burst is prefixed with the buffered pre-roll audio.

    Parameters:
    client: connected realtime client providing ``send_audio`` and ``commit_audio``.
    fs: sampling rate in Hz.
    speech_timeout: seconds without speech after which an active burst is ended.
    idle_timeout: seconds without speech after which recording stops altogether.
    """
    print("Recording audio...")
    global human_speaking, last_speech_time
    last_speech_time = time.time()

    with sd.InputStream(samplerate=fs, channels=1, dtype='int16', callback=detect_speech):
        while True:
            if human_speaking:
                print("Sending audio...")

                # Snapshot the deque in a single call before touching it: the
                # callback keeps appending while this loop runs, and iterating
                # a deque that changes underneath raises RuntimeError.
                buffered = list(pre_roll_buffer)
                pre_roll_buffer.clear()
                # reshape(-1) instead of squeeze so a one-frame block stays 1-D;
                # an empty snapshot yields no pre-roll rather than an error.
                pre_roll_audio = (
                    np.concatenate([chunk.reshape(-1) for chunk in buffered])
                    if buffered
                    else None
                )

                while human_speaking:
                    # NOTE(review): sd.rec() records through a second stream while
                    # the callback stream above is still open, and sd.wait()
                    # blocks the event loop for the length of the chunk.
                    audio_chunk = sd.rec(int(0.5 * fs), samplerate=fs, channels=1, dtype='int16')
                    sd.wait()
                    audio_chunk = audio_chunk.reshape(-1)

                    # Pre-roll goes out once, in front of the first chunk.
                    if pre_roll_audio is not None:
                        audio_chunk = np.concatenate((pre_roll_audio, audio_chunk))
                        pre_roll_audio = None

                    await client.send_audio(audio_chunk.tobytes())
                    await asyncio.sleep(0.001)  # Small delay to avoid overwhelming the system

                    if time.time() - last_speech_time > speech_timeout:
                        print(f"No speech detected for {speech_timeout:g} seconds, stopping recording...")
                        break

            elif time.time() - last_speech_time > idle_timeout:
                # Nothing has been said for idle_timeout seconds: stop listening.
                print(f"No speech for {idle_timeout:g} seconds, stopping...")
                break

            await asyncio.sleep(0.001)  # Wait a little before checking again

    # Commit the audio after sending
    await client.commit_audio()

In [13]:
async def play_audio_part(part):
    """Buffer every audio chunk of *part*, then play the whole clip at once."""
    pieces = [piece async for piece in part.audio_chunks()]
    pcm_bytes = b"".join(pieces)

    # int16 samples occupy two bytes each, so a trailing odd byte is dropped.
    if len(pcm_bytes) % 2:
        print("Warning: Misaligned buffer size, trimming extra byte.")
        pcm_bytes = pcm_bytes[:-1]

    play_audio(np.frombuffer(pcm_bytes, dtype=np.int16))

async def play_audio_part_streaming(part):
    """Play the audio of *part* chunk by chunk as the chunks arrive.

    Samples are 16-bit, so a chunk with an odd byte count ends in the middle
    of a sample. That dangling byte is carried over and prepended to the next
    chunk; dropping it would shift every following sample by one byte. Only a
    byte still left over after the last chunk is discarded, with a warning.
    """
    carry = b""
    async for chunk in part.audio_chunks():
        data = carry + chunk
        usable = len(data) - len(data) % 2
        carry = data[usable:]
        if usable:
            play_audio(np.frombuffer(data, dtype=np.int16, count=usable // 2))

    if carry:
        print("Warning: Misaligned buffer size, trimming extra byte.")

# Helper function to play audio output
def play_audio(audio, samplerate=24000):
    """Play *audio* and block until playback has finished.

    Parameters:
    audio: sample array handed to ``sd.play`` unchanged.
    samplerate: playback rate in Hz; the default keeps the previous fixed 24000.
    """
    sd.play(audio, samplerate=samplerate)
    sd.wait()

# Queue of 1-D sample arrays waiting to be played; filled by
# add_audio_chunk_to_queue() and drained by audio_callback().
audio_queue = queue.Queue()

# Tail of a chunk that did not fit into the previous output block
# (holds at most one array).
_audio_leftover = []


# Callback function to play chunks
def audio_callback(outdata, frames, time, status):
    """Fill one output block of ``frames`` samples from ``audio_queue``.

    Queued chunks rarely match the block size. A chunk longer than the space
    left in the block is split and its tail is kept for the next call; shorter
    chunks are concatenated until the block is full. Whatever cannot be filled
    because the queue is empty is set to silence.

    ``time`` is part of the callback signature and unused here.
    """
    if status:
        print(status)

    filled = 0
    while filled < frames:
        if _audio_leftover:
            chunk = _audio_leftover.pop()
        else:
            try:
                chunk = audio_queue.get_nowait()
            except queue.Empty:
                break

        chunk = np.asarray(chunk).reshape(-1)
        take = min(len(chunk), frames - filled)
        # reshape(-1, 1) makes the samples a column so they fit outdata's
        # (frames, channels) layout.
        outdata[filled:filled + take] = chunk[:take].reshape(-1, 1)
        filled += take
        if take < len(chunk):
            _audio_leftover.append(chunk[take:])

    outdata[filled:] = 0  # silence for whatever could not be filled

# Function to play audio continuously using callback
async def play_audio_continuous(samplerate=24000, channels=1, blocksize=1024):
    """Keep an int16 output stream open that is fed by ``audio_callback``.

    The coroutine never finishes on its own; it idles in short sleeps so the
    stream context stays open until the task is cancelled or an error occurs.
    """
    stream_options = dict(
        samplerate=samplerate,
        channels=channels,
        dtype='int16',
        blocksize=blocksize,
        callback=audio_callback,
    )
    with sd.OutputStream(**stream_options):
        # Playback happens in the callback; this loop only holds the stream open.
        while True:
            await asyncio.sleep(0.1)

# Function to add chunks of audio to the queue
def add_audio_chunk_to_queue(audio_chunk):
    """Put *audio_chunk* on the module-level ``audio_queue``, unchanged."""
    audio_queue.put(audio_chunk)



In [14]:

# Last answer typed at the "Continue?" prompt; kept at module level so it can
# be inspected from other cells.
user_input = ""


async def simple_audio():
    """Run the turn-by-turn voice demo.

    Each round prompts with "Continue?", records and sends microphone audio
    via ``record_and_send_audio_streaming`` and then hands over to
    ``process_response``. Typing "stop" ends the demo. An error during a round
    is printed and ends the demo as well.
    """
    global user_input

    # Create AzureKeyCredential for authentication
    key_credential = AzureKeyCredential(azure_openai_api_key)

    async with RTClient(
        url=azure_openai_endpoint,
        key_credential=key_credential,
        azure_deployment=azure_openai_deployment,
    ) as client:
        # Configure the client for audio modality
        instructions = "You speak like a person who lives in New Delhi in India in Hindi. You speak in a female voice. You speak 1 sentence in one response. You sound enthusiastic and curious and helpful. "

        await client.configure(
            modalities={"audio", "text"},
            turn_detection=ServerVAD(),
            voice="alloy",
            instructions=instructions,
            input_audio_transcription=InputAudioTranscription(model="whisper-1"),
        )

        while True:
            user_input = input("Continue?")
            if user_input == "stop":
                break
            try:
                await record_and_send_audio_streaming(client)
                print("here")
                await process_response(client)
            except Exception as e:
                # After a failure the session state is unknown, so stop rather
                # than prompt again on a client that may be unusable. Leaving
                # the ``async with`` block is expected to close the client.
                print(e)
                break






In [15]:
await(simple_audio())

Continue? hi


Recording audio...
Speech detected!
Sending audio...
Speech detected!
Speech detected!
Speech detected!
Speech detected!
Speech detected!
Speech detected!
Speech detected!
Speech detected!
No speech for 3 seconds, stopping...
here
in process response
input_item


CancelledError: 

In [6]:
try:
    main_task = asyncio.create_task(simple_audio())
    record_task = asincio.create_task(record_and_send_audio_streaming(client))
    #play_task = asyncio.create_task(play_audio_continuous())
    await asyncio.gather(main_task, record_task)
    #await(simple_audio())
except Exception:
    print("\nProgram interrupted by user (Ctrl+C).")


Program interrupted by user (Ctrl+C).
Continue?

In [None]:
import sounddevice as sd
import numpy as np
import asyncio
import queue

# Create a thread-safe queue to hold audio chunks
audio_queue = queue.Queue()

# Remainder of a chunk that was longer than the space left in the previous
# output block (holds at most one array).
_audio_leftover = []


def audio_callback(outdata, frames, time, status):
    """
    Provide the next block of audio to the OutputStream.

    Chunks are taken from ``audio_queue`` until ``frames`` samples have been
    written. A chunk that is longer than the remaining space is split and its
    tail is kept for the next call, so no samples are discarded. If the queue
    runs empty, the rest of the block is filled with silence (zeros).

    ``time`` is part of the callback signature and unused here.
    """
    if status:
        print(status)

    written = 0
    while written < frames:
        if _audio_leftover:
            audio_chunk = _audio_leftover.pop()
        else:
            try:
                audio_chunk = audio_queue.get_nowait()
            except queue.Empty:
                break

        audio_chunk = np.asarray(audio_chunk).reshape(-1)
        count = min(len(audio_chunk), frames - written)

        # Reshape to a column so the samples match outdata's (frames, channels) layout
        outdata[written:written + count] = audio_chunk[:count].reshape(-1, 1)
        written += count

        if count < len(audio_chunk):
            _audio_leftover.append(audio_chunk[count:])

    # If no more data is available, fill the rest with silence (zeros)
    outdata[written:] = 0

async def play_audio_continuous(samplerate=24000, channels=1, blocksize=1024):
    """
    Hold an int16 output stream open whose blocks are produced by ``audio_callback``.

    Runs until the task is cancelled or an error is raised.
    """
    output_stream = sd.OutputStream(
        samplerate=samplerate,
        channels=channels,
        dtype='int16',
        blocksize=blocksize,
        callback=audio_callback,
    )
    with output_stream:
        while True:
            # Nothing to do here; sleeping just keeps the stream context open
            await asyncio.sleep(0.1)

def add_audio_chunk_to_queue(audio_chunk):
    """
    Add a new audio chunk to the playback queue.

    The chunk is put on the module-level ``audio_queue`` unchanged.
    """
    audio_queue.put(audio_chunk)

async def process_audio_events(client):
    """
    Continuously receive audio chunks from the server in real-time and queue them for playback.

    Only ``RTResponse`` events are handled; within them, only the audio parts
    of message items. Samples are 16-bit, so a chunk with an odd number of
    bytes ends mid-sample: the dangling byte is carried into the next chunk
    instead of letting ``np.frombuffer`` raise and end event processing.
    Any error is printed and ends the loop.
    """
    try:
        async for event in client.events():  # Subscribe to the event loop
            if not isinstance(event, RTResponse):
                continue
            async for item in event:
                if item.type != "message":
                    continue
                async for part in item:
                    if part.type != "audio":
                        continue

                    carry = b""
                    async for chunk in part.audio_chunks():
                        data = carry + chunk
                        usable = len(data) - len(data) % 2
                        carry = data[usable:]
                        if usable:
                            # Convert the aligned bytes to a NumPy array and queue them
                            add_audio_chunk_to_queue(
                                np.frombuffer(data, dtype=np.int16, count=usable // 2)
                            )

    except Exception as e:
        print(f"Error during event processing: {e}")

async def low_latency_audio():
    """
    Configure the client, then run microphone streaming, event processing and playback together.

    The three tasks run until one of them fails or this coroutine is
    cancelled. In either case the remaining tasks are cancelled before the
    client is closed, so none of them keeps running against a closed
    connection.
    """
    # Create AzureKeyCredential for authentication
    key_credential = AzureKeyCredential(azure_openai_api_key)

    # Instantiate the RTClient
    async with RTClient(
        url=azure_openai_endpoint,
        key_credential=key_credential,
        azure_deployment=azure_openai_deployment,
    ) as client:
        # Configure the client for audio modality
        instructions = "You speak like a person who lives in New Delhi in India in Hindi. You speak in a female voice. You sound enthusiastic and curious and helpful."
        await client.configure(modalities={"audio", "text"}, turn_detection=ServerVAD(), voice="alloy", instructions=instructions)

        # Create asynchronous tasks for streaming audio, processing events and playback
        tasks = [
            asyncio.create_task(stream_audio(client)),
            asyncio.create_task(process_audio_events(client)),
            asyncio.create_task(play_audio_continuous()),
        ]
        try:
            await asyncio.gather(*tasks)
        finally:
            # gather() does not cancel the other tasks when one of them raises.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

# For Jupyter Notebooks, use this approach instead of asyncio.run()
try:
    await low_latency_audio()  # Use await directly in a Jupyter Notebook environment
except KeyboardInterrupt:
    print("\nProgram interrupted by user (Ctrl+C).")



In [None]:
import webrtcvad
import sounddevice as sd
import numpy as np
import asyncio
import base64





# Helper function to play audio chunk by chunk
def play_audio_chunk(pcm_data):
    """Play one PCM chunk at 24000 Hz and wait for it to finish.

    Any error raised during playback is printed instead of propagated.
    """
    try:
        # blocking=True makes play() wait until playback has finished.
        sd.play(pcm_data, samplerate=24000, blocking=True)
    except Exception as err:
        print(f"Error while playing audio chunk: {err}")

# Process the incoming chunk from base64 and play it
def process_audio_chunk(chunk):
    """Interpret *chunk* as int16 samples and play them immediately.

    Conversion and playback errors are printed instead of propagated.
    """
    try:
        play_audio_chunk(np.frombuffer(chunk, dtype=np.int16))
    except Exception as err:
        print(f"Error while processing audio chunk: {err}")

async def stream_audio(client, chunk_duration=0.5, fs=24000):
    """
    Streams audio from the microphone in small chunks and sends the chunks that contain speech.

    The stream is opened without a callback because blocking ``read()`` is only
    available in that mode. Each chunk that was read is passed to
    ``detect_speech`` directly, which updates ``human_speaking``.

    Parameters:
    client: connected realtime client providing ``send_audio``.
    chunk_duration (float): The length of each audio chunk in seconds.
    fs (int): The sampling rate for audio.
    """
    global human_speaking
    buffer_size = int(chunk_duration * fs)  # Calculate buffer size for each chunk in samples

    # Use an InputStream to capture audio chunk by chunk
    with sd.InputStream(samplerate=fs, channels=1, dtype='int16', blocksize=buffer_size) as stream:
        print("Streaming audio...")
        while True:
            try:
                # read() blocks until the chunk is complete, so it runs in a
                # worker thread; the event and playback tasks keep running
                # meanwhile, which also replaces the former sleep between reads.
                block, overflowed = await asyncio.to_thread(stream.read, buffer_size)

                # Same volume-based check the callback used to perform.
                detect_speech(block, len(block), None, overflowed)

                if human_speaking:
                    # Send the audio chunk to the server
                    print("Speech detected, sending audio chunk")
                    await client.send_audio(np.squeeze(block).tobytes())
                    human_speaking = False
                else:
                    print("No speech detected, skipping chunk")

            except Exception as e:
                print(f"Error while streaming audio: {e}")
                break

async def process_audio_events(client):
    """
    Play response audio chunk by chunk as it arrives from the server.

    Only ``RTResponse`` events are handled, and within them only the audio
    parts of message items. Any error is printed and ends the loop.
    """
    try:
        async for event in client.events():
            if not isinstance(event, RTResponse):
                continue  # anything that is not a response is skipped
            async for item in event:
                if item.type != "message":
                    continue
                async for part in item:
                    if part.type != "audio":
                        continue
                    async for pcm_chunk in part.audio_chunks():
                        process_audio_chunk(pcm_chunk)
    except Exception as err:
        print(f"Error during event processing: {err}")

async def low_latency_audio():
    """
    Configure the client, then run microphone streaming and event processing together.

    Both tasks run until one of them fails or this coroutine is cancelled. The
    other task is then cancelled before the client is closed, so it does not
    keep running against a closed connection.
    """
    # Create AzureKeyCredential for authentication
    key_credential = AzureKeyCredential(azure_openai_api_key)

    # Instantiate the RTClient
    async with RTClient(
        url=azure_openai_endpoint,
        key_credential=key_credential,
        azure_deployment=azure_openai_deployment,
    ) as client:
        # Configure the client for audio modality
        instructions = "You speak like a person who lives in New Delhi in India in Hindi. You speak in a female voice. You sound enthusiastic and curious and helpful."
        await client.configure(modalities={"audio", "text"}, turn_detection=ServerVAD(), voice="alloy", instructions=instructions)

        # Create asynchronous tasks for streaming audio and processing events
        tasks = [
            asyncio.create_task(stream_audio(client)),
            asyncio.create_task(process_audio_events(client)),
        ]
        try:
            # Effectively runs "forever" unless an error occurs
            await asyncio.gather(*tasks)
        finally:
            # gather() does not cancel the other task when one of them raises.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

# For Jupyter Notebooks, use this approach instead of asyncio.run()
try:
    await low_latency_audio()  # Use await directly in a Jupyter Notebook environment
except KeyboardInterrupt:
    print("\nProgram interrupted by user (Ctrl+C).")


In [None]:
import base64
import numpy as np
import sounddevice as sd
import asyncio

# Define a callback to write audio chunks to an open stream
# Remainder of a chunk that was longer than the space left in the previous
# output block (holds at most one array).
_pcm_leftover = []


def audio_callback(outdata, frames, time, status, pcm_data_queue):
    """
    Fill one output block of ``frames`` samples from ``pcm_data_queue``.

    Queued chunks do not have to match the block size: a chunk longer than the
    space left is split and its tail is kept for the next call, and shorter
    chunks are concatenated. Space that cannot be filled is set to silence.
    Chunks are treated as a flat run of samples and written to every output
    channel (assumes mono PCM -- confirm if ``channels`` is ever above 1).

    ``time`` and ``status`` are part of the callback signature and unused here.
    """
    try:
        filled = 0
        while filled < frames:
            if _pcm_leftover:
                chunk = _pcm_leftover.pop()
            elif not pcm_data_queue.empty():
                chunk = pcm_data_queue.get_nowait()
            else:
                break

            chunk = np.asarray(chunk).reshape(-1)
            take = min(len(chunk), frames - filled)
            # Write the audio samples to the output block
            outdata[filled:filled + take] = chunk[:take].reshape(-1, 1)
            filled += take
            if take < len(chunk):
                _pcm_leftover.append(chunk[take:])

        outdata[filled:] = 0  # If no more audio is available, fill with silence
    except Exception as e:
        print(f"Error in audio callback: {e}")
        outdata.fill(0)  # On error, fill with silence

# Initialize a queue to hold PCM audio data
# The queue is filled from coroutines and drained by the output stream's
# callback. asyncio queues are not thread-safe, and the callback is not run by
# the event loop, so a queue.Queue is used.
pcm_data_queue = queue.Queue()

# Function to play audio chunks via a non-blocking stream
async def play_audio_stream(fs=24000, channels=1):
    """
    Play audio in a non-blocking stream. The stream's callback pulls chunks from
    ``pcm_data_queue``; this coroutine only keeps the stream open.
    """
    def feed_output(outdata, frames, time, status):
        # Adapter: the stream calls back with four arguments, audio_callback
        # additionally needs the queue to read from.
        audio_callback(outdata, frames, time, status, pcm_data_queue)

    with sd.OutputStream(samplerate=fs, channels=channels, dtype='int16', callback=feed_output):
        print("Audio stream is playing...")
        await asyncio.Event().wait()  # Keep the stream open forever or until canceled

# Function to process one incoming audio chunk
async def process_audio_chunk(chunk):
    """
    Convert *chunk* (raw bytes) to int16 samples and queue them for playback.

    Stays a coroutine so existing ``await process_audio_chunk(...)`` calls keep
    working; the queue is unbounded, so putting never waits.
    """
    # Convert the bytes to 16-bit signed integers (int16)
    pcm_data = np.frombuffer(chunk, dtype=np.int16)

    # Put the PCM data into the queue for playback
    pcm_data_queue.put_nowait(pcm_data)

# Function to handle real-time audio events from the client
async def process_audio_events(client):
    """
    Forward response audio to the playback queue as it arrives from the server.

    Only ``RTResponse`` events are handled, and within them only the audio
    parts of message items; each chunk is passed to ``process_audio_chunk``.
    Any error is printed and ends the loop.
    """
    try:
        async for incoming in client.events():
            if not isinstance(incoming, RTResponse):
                continue
            async for entry in incoming:
                if entry.type != "message":
                    continue
                async for content in entry:
                    if content.type != "audio":
                        continue
                    # Hand every chunk over as soon as it is received
                    async for raw in content.audio_chunks():
                        await process_audio_chunk(raw)
    except Exception as err:
        print(f"Error during event processing: {err}")

# Function to handle audio streaming (recording) and sending it to the client
async def stream_audio(client, chunk_duration=0.5, fs=24000):
    """
    Streams audio from the microphone in small chunks and sends it to the client.

    Parameters:
    client: connected realtime client providing ``send_audio``.
    chunk_duration (float): The length of each audio chunk in seconds.
    fs (int): The sampling rate for audio.

    Any error is printed and ends the loop.
    """
    buffer_size = int(chunk_duration * fs)  # Calculate buffer size for each chunk in samples

    # Use an InputStream to capture audio chunk by chunk
    with sd.InputStream(samplerate=fs, channels=1, dtype='int16', blocksize=buffer_size) as stream:
        print("Streaming audio...")
        while True:
            try:
                # read() blocks until a full chunk is available. Running it in a
                # worker thread keeps the event loop free for the playback and
                # event-processing tasks, which also replaces the former sleep.
                audio_chunk, _ = await asyncio.to_thread(stream.read, buffer_size)

                # Send the audio chunk to the server as raw PCM16 bytes
                await client.send_audio(np.squeeze(audio_chunk).tobytes())

            except Exception as e:
                print(f"Error while streaming audio: {e}")
                break

# Main function to initialize and run the low-latency audio system
async def low_latency_audio():
    """
    Configure the client, then run microphone streaming, playback and event processing together.

    The three tasks run until one of them fails or this coroutine is
    cancelled. The remaining tasks are then cancelled before the client is
    closed, so none of them keeps running against a closed connection.
    """
    # Create AzureKeyCredential for authentication
    key_credential = AzureKeyCredential(azure_openai_api_key)

    # Instantiate the RTClient
    async with RTClient(
        url=azure_openai_endpoint,
        key_credential=key_credential,
        azure_deployment=azure_openai_deployment,
    ) as client:
        # Configure the client for audio modality
        instructions = "You speak like a person who lives in New Delhi in India in Hindi. You speak in a female voice. You sound enthusiastic and curious and helpful."
        await client.configure(modalities={"audio", "text"}, turn_detection=ServerVAD(), voice="alloy", instructions=instructions)

        # Create asynchronous tasks for streaming audio, playback and event processing
        tasks = [
            asyncio.create_task(stream_audio(client)),
            asyncio.create_task(play_audio_stream()),  # Play the audio stream
            asyncio.create_task(process_audio_events(client)),
        ]
        try:
            # Effectively runs "forever" unless an error occurs
            await asyncio.gather(*tasks)
        finally:
            # gather() does not cancel the other tasks when one of them raises.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

# For Jupyter Notebooks, use this approach instead of asyncio.run()
try:
    await low_latency_audio()  # Use await directly in a Jupyter Notebook environment
except KeyboardInterrupt:
    print("\nProgram interrupted by user (Ctrl+C).")
