# Creating a Virtual Webcam Chatbot That Responds to Voice Commands in a Video Meeting
In this notebook we test all the functions of the program

## Converting Audio to Text
Using IBM Watson Speech to Text, we'll convert the captured audio into text. 


In [1]:
import pyaudio
import io
import os
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from dotenv import load_dotenv
load_dotenv()
IBM_SPEECH_TO_TEXT_API = os.getenv('IBM_SPEECH_TO_TEXT_API')
URL = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'

In [20]:
authenticator = IAMAuthenticator(IBM_SPEECH_TO_TEXT_API)
speech_to_text = SpeechToTextV1(authenticator=authenticator)
speech_to_text.set_service_url(URL)
help(speech_to_text.recognize)

Help on method recognize in module ibm_watson.speech_to_text_v1:

recognize(audio: <class 'BinaryIO'>, *, content_type: str = None, model: str = None, language_customization_id: str = None, acoustic_customization_id: str = None, base_model_version: str = None, customization_weight: float = None, inactivity_timeout: int = None, keywords: List[str] = None, keywords_threshold: float = None, max_alternatives: int = None, word_alternatives_threshold: float = None, word_confidence: bool = None, timestamps: bool = None, profanity_filter: bool = None, smart_formatting: bool = None, speaker_labels: bool = None, grammar_name: str = None, redaction: bool = None, audio_metrics: bool = None, end_of_phrase_silence_time: float = None, split_transcript_at_phrase_end: bool = None, speech_detector_sensitivity: float = None, background_audio_suppression: float = None, low_latency: bool = None, character_insertion_bias: float = None, **kwargs) -> ibm_cloud_sdk_core.detailed_response.DetailedResponse metho

In [9]:
# Function to convert audio to text
def convert_audio_to_text(audio_data):
    authenticator = IAMAuthenticator(IBM_SPEECH_TO_TEXT_API)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(URL)

    response = speech_to_text.recognize(
        audio=audio_data,
        content_type='audio/l16; rate=44100',
        model='en-US_BroadbandModel',
       # max_alternatives=1,
    ).get_result()

    return response['results'][0]['alternatives'][0]['transcript']

## Create a Function to Transcribe Audio
We need a function that captures audio from the microphone in flac format and converts it into text using the 

In [44]:
# Function to record audio from the microphone
def record_audio():
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 5
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    print("Recording...")
    frames = []
    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("Finished recording")
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_data = io.BytesIO(b''.join(frames))
    return audio_data

In [45]:
audio_data = record_audio()

Recording...
Finished recording


In [17]:
type(audio_data)

_io.BytesIO

In [18]:
transcript = convert_audio_to_text(audio_data)

In [19]:
transcript

'can you recently sent '

# New Code Begin here

In [None]:
import sounddevice as sd
import soundfile as sf
import io
def play_audio(audio_object):
    # Load the audio data from the byte stream object
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    # Play the audio using sounddevice
    sd.play(audio_data, sample_rate)
    sd.wait()


In [75]:
import sounddevice as sd
import soundfile as sf
import io
def record_audio(duration):
    # Set the sample rate and channels
    sample_rate = 48000  # You can change this to your desired sample rate
    channels = 2  # 1 for mono, 2 for stereo
    # Start recording audio from the microphone
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=channels)
    # Wait for the recording to complete
    print("Recording...")
    sd.wait()
    # Create a byte stream object to store the audio data
    audio_stream = io.BytesIO()
    # Save the recorded audio to the byte stream in the OGG format with Opus codec
    sf.write(audio_stream, audio, sample_rate, format='ogg', subtype='opus')
    # Move the file pointer to the beginning of the stream
    audio_stream.seek(0)
    return audio_stream

#https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats

In [76]:
# Example usage: record audio for 5 seconds and get the audio as an object
duration = 5
audio_object = record_audio(duration)


Recording...


In [77]:
audio_object

<_io.BytesIO at 0x258ffaed6c0>

In [78]:
# Example usage: play the audio from the audio object
play_audio(audio_object)

In [31]:
import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
def transcribe_audio(audio_object, api_key, url):
    # Create an authenticator object with your IBM Watson Speech to Text API key
    authenticator = IAMAuthenticator(api_key)
    # Create a Speech to Text object with the authenticator and the service URL
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    # Perform the transcription using the audio object
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',     
    ).get_result()

    # Get the transcribed text from the response
    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text

 #https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng     

In [80]:
# Example usage: transcribe the audio object using IBM Watson Speech to Text
from dotenv import load_dotenv
load_dotenv()
api_key= os.getenv('IBM_SPEECH_TO_TEXT_API')
url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'

In [81]:
#audio_object = record_audio(5)  # Assuming you have a function to record audio and return it as an object
transcription = transcribe_audio(audio_object, api_key, url)
print(transcription)

hello can you listen me  


In [82]:
def test_transcription():
    # Example usage: transcribe the audio object using IBM Watson Speech to Text
    from dotenv import load_dotenv
    load_dotenv()
    api_key= os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'
    # Example usage: record audio for 5 seconds and get the audio as an object
    duration = 5
    audio_object = record_audio(duration)    
    transcription = transcribe_audio(audio_object, api_key, url)
    print("Transcript:", transcription)    
    

In [83]:
test_transcription()

Recording...
Transcript: hello can you listen me  


In [30]:
import io
import json
import os
import queue
import threading
import time

import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1

# Play audio using sounddevice
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

# Record audio and put audio chunks in the queue
def record_audio(duration, audio_queue):
    sample_rate = 48000
    channels = 2
    chunk_duration = 0.5
    total_chunks = int(duration / chunk_duration)

    for _ in range(total_chunks):
        audio = sd.rec(int(chunk_duration * sample_rate), samplerate=sample_rate, channels=channels)
        sd.wait()
        audio_queue.put(audio)

# Transcribe audio using IBM Watson Speech to Text
def transcribe_audio(audio_object, api_key, url):
    authenticator = IAMAuthenticator(api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text

# Main function to record audio in a separate thread and transcribe it in real-time
def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'
    duration = 5
    sample_rate = 48000  # Add this line to define sample_rate
    audio_queue = queue.Queue()
    audio_thread = threading.Thread(target=record_audio, args=(duration, audio_queue))
    audio_thread.start()
    while audio_thread.is_alive() or not audio_queue.empty():
        if not audio_queue.empty():
            audio_chunk = audio_queue.get()
            audio_stream = io.BytesIO()
            sf.write(audio_stream, audio_chunk, samplerate=sample_rate, format='ogg', subtype='opus')
            audio_stream.seek(0)
            # Example usage: play the audio from the audio object
            play_audio(audio_stream)
            print("Running:")
            #transcription = transcribe_audio(audio_stream, api_key, url)
            #print("Transcript:", transcription)
        else:
            time.sleep(0.1)
    audio_thread.join()
if __name__ == "__main__":
    main()    

Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:


Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:
Running:


In [6]:
import io
import json
import os
import queue
import threading
import time
from dotenv import load_dotenv
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1
import sounddevice as sd
import soundfile as sf
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

def record_audio(duration, audio_queue):
    sample_rate = 48000
    channels = 2
    chunk_duration = 0.5
    total_chunks = int(duration / chunk_duration)

    for _ in range(total_chunks):
        audio = sd.rec(int(chunk_duration * sample_rate), samplerate=sample_rate, channels=channels)
        sd.wait()
        audio_queue.put(audio)

def transcribe_audio(audio_object, api_key, url):
    authenticator = IAMAuthenticator(api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text

def contains_voice(audio_stream, silence_threshold=-50, min_duration=100):
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")
    nonsilent_chunks = detect_nonsilent(audio_segment, min_silence_len=min_duration, silence_thresh=silence_threshold)

    if len(nonsilent_chunks) > 0:
        return True
    else:
        return False

def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'

    duration = 5
    sample_rate = 48000
    audio_queue = queue.Queue()
    audio_thread = threading.Thread(target=record_audio, args=(duration, audio_queue))
    audio_thread.start()

    while audio_thread.is_alive() or not audio_queue.empty():
        if not audio_queue.empty():
            audio_chunk = audio_queue.get()
            audio_stream = io.BytesIO()
            sf.write(audio_stream, audio_chunk, samplerate=sample_rate, format='ogg', subtype='opus')
            audio_stream.seek(0)

            if contains_voice(audio_stream):
                transcription = transcribe_audio(audio_stream, api_key, url)
                print("Transcript:", transcription)
            else:
                print("No voice detected")
        else:
            time.sleep(0.1)

    audio_thread.join()

if __name__ == "__main__":
    main()

No voice detected
No voice detected
No voice detected
No voice detected


ApiException: Error: Stream was 0 bytes but needs to be at least 100 bytes., Code: 400 , X-global-transaction-id: 11be997e-a8bb-42ed-a634-88fbf4b51040

Can you create a new full code with a new main function that is listening the microphone all the time and only if recieves voice play the  complete voice , this is for streaming in real time, when there is a voice the voice is reproduced again

In [20]:

import io
import json
import os
import queue
import threading
import time

import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1

# Play audio using sounddevice
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

# Record audio and put audio chunks in the queue
def record_audio(duration, audio_queue):
    sample_rate = 48000
    channels = 2
    chunk_duration = 0.5
    total_chunks = int(duration / chunk_duration)

    for _ in range(total_chunks):
        audio = sd.rec(int(chunk_duration * sample_rate), samplerate=sample_rate, channels=channels)
        sd.wait()
        audio_queue.put(audio)

# Transcribe audio using IBM Watson Speech to Text
def transcribe_audio(audio_object, api_key, url):
    authenticator = IAMAuthenticator(api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text

# Main function to record audio in a separate thread and transcribe it in real-time
def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'
    duration = 5
    sample_rate = 48000  # Add this line to define sample_rate
    audio_queue = queue.Queue()
    audio_thread = threading.Thread(target=record_audio, args=(duration, audio_queue))
    audio_thread.start()

    while audio_thread.is_alive() or not audio_queue.empty():
        if not audio_queue.empty():
            audio_chunk = audio_queue.get()
            audio_stream = io.BytesIO()
            sf.write(audio_stream, audio_chunk, samplerate=sample_rate, format='ogg', subtype='opus')
            audio_stream.seek(0)

            if contains_voice(audio_stream):
                # Play the audio when voice is detected
                play_audio(audio_stream)
                print("Running:")
                #transcription = transcribe_audio(audio_stream, api_key, url)
                #print("Transcript:", transcription)
            else:
                print("No voice detected")
        else:
            time.sleep(0.1)

    audio_thread.join()

if __name__ == "__main__":
    main()


No voice detected
No voice detected
Running:
Running:
No voice detected
No voice detected
No voice detected
No voice detected
Running:
Running:


In [38]:
import io
import json
import os
import queue
import threading
import time
import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

# Play audio using sounddevice
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

# Record audio for the entire duration
def record_audio(duration, sample_rate):
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
    sd.wait()
    return audio

# Transcribe audio using IBM Watson Speech to Text
def transcribe_audio(audio_object, api_key, url):
    authenticator = IAMAuthenticator(api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text
    

# Function to check if a chunk contains_voice
def contains_voice(audio_stream, silence_threshold=-50, min_duration=100):
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")
    nonsilent_chunks = detect_nonsilent(audio_segment, min_silence_len=min_duration, silence_thresh=silence_threshold)

    if len(nonsilent_chunks) > 0:
        return True
    else:
        return False

    
# Main function to record audio, split it based on silence, and transcribe it in real-time
def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'
    duration = 10
    sample_rate = 48000
    silence_duration = 1000  # Minimum duration of silence in milliseconds to split chunks

    # Record the entire audio
    audio_data = record_audio(duration, sample_rate)
    audio_stream = io.BytesIO()
    sf.write(audio_stream, audio_data, samplerate=sample_rate, format='ogg', subtype='opus')
    audio_stream.seek(0)
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")

    # Split audio based on silence
    nonsilent_chunks = detect_nonsilent(audio_segment, min_silence_len=silence_duration, silence_thresh=-50)
    audio_chunks = [audio_segment[start:end] for (start, end) in nonsilent_chunks]

    # Process each audio chunk
    for audio_chunk in audio_chunks:
        audio_stream = io.BytesIO()
        audio_chunk.export(audio_stream, format="ogg")
        audio_stream.seek(0)
        print("Waiting voice")
        if contains_voice(audio_stream):
            # Play the audio when voice is detected
            print("Running with voice:")
            play_audio(audio_stream)
            
            #transcription = transcribe_audio(audio_stream, api_key, url)
            #print("Transcript:", transcription)
        else:
            print("No voice detected")

if __name__ == "__main__":
    main()


Waiting voice
Running with voice:
Waiting voice
Running with voice:
Waiting voice
Running with voice:
Waiting voice
Running with voice:


In [39]:
import io
import json
import os
import queue
import threading
import time
import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

# Play audio using sounddevice
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

# Record audio for the entire duration
def record_audio(duration, sample_rate):
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
    sd.wait()
    return audio

# Transcribe audio using IBM Watson Speech to Text
def transcribe_audio(audio_object, api_key, url):
    authenticator = IAMAuthenticator(api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '
    return text
    

# Function to check if a chunk contains_voice
def contains_voice(audio_stream, silence_threshold=-50, min_duration=100):
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")
    nonsilent_chunks = detect_nonsilent(audio_segment, min_silence_len=min_duration, silence_thresh=silence_threshold)

    if len(nonsilent_chunks) > 0:
        return True
    else:
        return False


# Main function to record audio, split it based on silence, and transcribe it in real-time
def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'
    duration = 10
    sample_rate = 48000
    silence_duration = 1000  # Minimum duration of silence in milliseconds to split chunks
    # Record the entire audio
    audio_data = record_audio(duration, sample_rate)
    audio_stream = io.BytesIO()
    sf.write(audio_stream, audio_data, samplerate=sample_rate, format='ogg', subtype='opus')
    audio_stream.seek(0)
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")
    # Play the audio_segment before splitting it
    audio_stream.seek(0)
    play_audio(audio_stream)
if __name__ == "__main__":
    main()    

In [42]:
import io
import queue
import threading
import sounddevice as sd
import soundfile as sf
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

# Play audio using sounddevice
def play_audio(audio_object):
    audio_data, sample_rate = sf.read(io.BytesIO(audio_object.getvalue()))
    sd.play(audio_data, sample_rate)
    sd.wait()

# Function to check if a chunk contains_voice
def contains_voice(audio_stream, silence_threshold=-50, min_duration=100):
    audio_segment = AudioSegment.from_file(audio_stream, format="ogg")
    nonsilent_chunks = detect_nonsilent(audio_segment, min_silence_len=min_duration, silence_thresh=silence_threshold)

    if len(nonsilent_chunks) > 0:
        return True
    else:
        return False

# Callback function to process audio in real-time
def callback(indata, frames, time, status, q):
    audio_stream = io.BytesIO()
    sf.write(audio_stream, indata, samplerate=48000, format='ogg', subtype='opus')
    audio_stream.seek(0)
    if contains_voice(audio_stream):
        q.put(audio_stream)

# Main function to listen to microphone and play audio when voice is detected
def main():
    sample_rate = 48000
    chunk_size = 2048

    q = queue.Queue()
    with sd.InputStream(samplerate=sample_rate, channels=2, blocksize=chunk_size, callback=lambda i, f, t, s: callback(i, f, t, s, q)):
        print("Listening...")
        while True:
            audio_stream = q.get()
            play_audio(audio_stream)

if __name__ == "__main__":
    main()


Listening...


Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_tell at 0x0000022A21ACB130>:
Traceback (most recent call last):
  File "C:\Users\066226758\Blog\Virtual-Webcam-Chatbot\.venv\lib\site-packages\soundfile.py", line 1264, in vio_tell
    @_ffi.callback("sf_vio_tell")
KeyboardInterrupt: 


LibsndfileError: Error opening <_io.BytesIO object at 0x0000022A1F434DB0>: Unspecified internal error.

In [91]:

import pyaudio
import queue
import threading
import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from dotenv import load_dotenv
import os

# Global variables
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
AUDIO_QUEUE = queue.Queue()
STOP_FLAG = threading.Event()

def audio_capture(stream):
    while not STOP_FLAG.is_set():
        audio_data = stream.read(CHUNK)
        AUDIO_QUEUE.put(audio_data)

def transcribe_audio(audio_object, api_key, url):
    # Create an authenticator object with your IBM Watson Speech to Text API key
    authenticator = IAMAuthenticator(api_key)

    # Create a Speech to Text object with the authenticator and the service URL
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)

    # Perform the transcription using the audio object
    response = speech_to_text.recognize(
        audio=audio_object,
        content_type='audio/ogg',
        model='en-US_Telephony',
    ).get_result()

    # Get the transcribed text from the response
    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '

    return text

def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'

    # Create a PyAudio object for audio capture
    audio = pyaudio.PyAudio()

    # Open an audio stream for capturing
    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )

    # Start a separate thread for audio capture
    capture_thread = threading.Thread(target=audio_capture, args=(stream,))
    capture_thread.start()

    # Continuously transcribe audio from the queue
    while True:
        if not AUDIO_QUEUE.empty():
            audio_data = AUDIO_QUEUE.get()

            # Create a byte stream object to store the audio data
            audio_stream = io.BytesIO()
            sf.write(audio_stream, audio_data, RATE, format='ogg', subtype='opus')

            # Transcribe the audio
            transcription = transcribe_audio(audio_stream, api_key, url)
            print(transcription)



In [92]:
#if __name__ == '__main__':
#    main()


In [93]:
main()

IndexError: tuple index out of range

In [119]:

import pyaudio
import queue
import threading
import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from dotenv import load_dotenv
import os
import soundfile as sf
import io

# Global variables
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE =  8000 
AUDIO_QUEUE = queue.Queue()
STOP_FLAG = threading.Event()

def audio_capture(stream):
    while not STOP_FLAG.is_set():
        audio_data = stream.read(CHUNK)
        AUDIO_QUEUE.put(audio_data)

def transcribe_audio(audio_object, api_key, url):
    # Create an authenticator object with your IBM Watson Speech to Text API key
    authenticator = IAMAuthenticator(api_key)

    # Create a Speech to Text object with the authenticator and the service URL
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)

    # Perform the transcription using the audio object
    response = speech_to_text.recognize(
        audio=audio_object.getvalue(),
        content_type='audio/l16; rate=48000; endianness=little',
        model='en-US_Telephony',
    ).get_result()

    # Get the transcribed text from the response
    transcriptions = response['results']
    text = ''
    for transcription in transcriptions:
        text += transcription['alternatives'][0]['transcript'] + ' '

    return text

def main():
    load_dotenv()
    api_key = os.getenv('IBM_SPEECH_TO_TEXT_API')
    url = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/8006ee0b-b4ed-4e2c-b73e-78f0b03175b4'

    # Create a PyAudio object for audio capture
    audio = pyaudio.PyAudio()

    # Open an audio stream for capturing
    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )

    # Start a separate thread for audio capture
    capture_thread = threading.Thread(target=audio_capture, args=(stream,))
    capture_thread.start()

    # Continuously transcribe audio from the queue
    while True:
        if not AUDIO_QUEUE.empty():
            audio_data = AUDIO_QUEUE.get()

            # Create a byte stream object to store the audio data
            audio_stream = io.BytesIO(audio_data)

            # Transcribe the audio
            transcription = transcribe_audio(audio_stream, api_key, url)
            print(transcription)


if __name__ == '__main__':
    main()

ApiException: Error: Unable to transcode from audio/l16; rate=48000; endianness=little to one of: audio/l16; rate=8000; channels=1, application/srgs, application/srgs+xml, Code: 415 , X-global-transaction-id: aca95c48-b946-4d7d-8840-c8d27470d14b

In [None]:
import speech_recognition as sr

def listen_and_transcribe(source_type="microphone"):
    recognizer = sr.Recognizer()
    if source_type == "microphone":
        with sr.Microphone() as source:
            print("Listening...")
            audio = recognizer.listen(source)
    elif source_type == "virtual_audio_cable":
        with sr.Microphone(device_index=1) as source:  # Adjust device_index based on your system configuration
            print("Listening...")
            audio = recognizer.listen(source)

    try:
        text = recognizer.recognize_google(audio)
        print(f"Transcribed: {text}")
        return text
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
    return None
```

In [1]:
import pyaudio
import wave
import threading
import queue

In [2]:
# Audio settings
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5

# Create audio stream queue
audio_queue = queue.Queue()

# Create a flag to indicate when to trigger ChatGPT
trigger_chatgpt = False

# Function to capture audio and add it to the queue
def capture_audio():
    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("Capturing audio...")

    while True:
        frames = []
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
        audio_queue.put(b''.join(frames))

        # Check if the word "computer" is said
        if b'computer' in b''.join(frames):
            trigger_chatgpt = True

    stream.stop_stream()
    stream.close()
    p.terminate()

In [3]:
# Start capturing audio in a separate thread
audio_thread = threading.Thread(target=capture_audio)
audio_thread.start()

Capturing audio...
