In [1]:
import sounddevice as sd

# List all available audio devices in human-readable format
devices = sd.query_devices()

# Print details of each audio device
print(devices)

   0 HD-Audio Generic: DELL U2417H (hw:0,3), ALSA (0 in, 2 out)
   1 HD-Audio Generic: DELL P2418HZm (hw:0,7), ALSA (0 in, 2 out)
   2 HD-Audio Generic: DELL U2417H (hw:0,8), ALSA (0 in, 2 out)
   3 HD-Audio Generic: ALC257 Analog (hw:1,0), ALSA (2 in, 2 out)
   4 acp: - (hw:2,0), ALSA (2 in, 0 out)
   5 ThinkPad USB-C Dock Gen2 USB Au: Audio (hw:3,0), ALSA (1 in, 2 out)
   6 JOUNIVO JV601: USB Audio (hw:4,0), ALSA (2 in, 0 out)
   7 P2418HZm: USB Audio (hw:5,0), ALSA (2 in, 2 out)
   8 hdmi, ALSA (0 in, 2 out)
   9 pulse, ALSA (32 in, 32 out)
* 10 default, ALSA (32 in, 32 out)


In [2]:
import sounddevice as sd

# Device ID (use the ID of your specific audio device)
device_id = 6

# Query input device information
input_device_info = sd.query_devices(device_id, 'input')
print(input_device_info)

# Get the default sample rate for the input and output devices
input_default_samplerate = input_device_info['default_samplerate']

# Print the default sample rates
print(f"Default sample rate for input device: {input_default_samplerate} Hz")

{'name': 'JOUNIVO JV601: USB Audio (hw:4,0)', 'index': 6, 'hostapi': 0, 'max_input_channels': 2, 'max_output_channels': 0, 'default_low_input_latency': 0.008684807256235827, 'default_low_output_latency': -1.0, 'default_high_input_latency': 0.034829931972789115, 'default_high_output_latency': -1.0, 'default_samplerate': 44100.0}
Default sample rate for input device: 44100.0 Hz


In [None]:
import sounddevice as sd
import numpy as np
import threading
import queue
import openai
import keyring
import os
import io
import wave
from datetime import datetime
import tempfile
from datetime import datetime
import time

# Define audio recording parameters
recording_duration = 5  # Duration of recording for noise baseline in seconds
chunk_duration = 1  # Duration of each chunk in seconds
chunk_samples = int(input_default_samplerate * chunk_duration)  # Number of samples in each chunk
threshold_multiplier = 1.5  # Multiplier for noise threshold

# Define a function to save the numpy array as a WAV file in memory
def save_wav_in_memory(audio_data):
    byte_io = io.BytesIO()
    with wave.open(byte_io, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(input_default_samplerate)
        wav_file.writeframes(audio_data.astype(np.int16).tobytes())
    byte_io.seek(0)
    return byte_io.read()

# Calculate root-mean-square (RMS) energy of a chunk
def calculate_rms(chunk):
    return np.sqrt(np.mean(np.square(chunk)))

# Simple Voice Activity Detection (VAD)
def is_voice_active(chunk, threshold):
    rms = calculate_rms(chunk)
    return (rms,rms > threshold)

# Define a function for recording audio and transcribing chunks
def record_and_transcribe(stop_event, threshold):
    active_chunk = np.array([])
    recording_start = time.time()
    is_talking = False
    gain_factor = 2.0  # This will amplify the audio by a factor of 2

    with sd.InputStream(samplerate=input_default_samplerate,device=device_id) as stream:
        while not stop_event.is_set():
            chunk, _ = stream.read(chunk_samples)  # Read chunk from the stream
            chunk = chunk.flatten()  # Flatten the chunk to 1 dimension
            chunk = chunk*gain_factor
            rms,voice_active = is_voice_active(chunk, threshold)
            if voice_active:
                if not is_talking:
                    is_talking = True
                    recording_start = time.time()
                    print("Started Talking",rms,threshold)
                else:
                    print("Is Talking",rms,threshold,time.time()-recording_start)
                # If active voice detected, append chunk to active_chunk
                active_chunk = np.concatenate((active_chunk, chunk))
            elif len(active_chunk) > 0:
                print("Stopped Talking",rms,threshold,time.time()-recording_start)
                
                
                # Calculate the minimum value of the array
                array_min = np.min(active_chunk)

                # Calculate the maximum value of the array
                array_max = np.max(active_chunk)

                # Calculate the mean value of the array
                array_mean = np.mean(active_chunk)

                # Print the results
                print(f"Minimum: {array_min}")
                print(f"Maximum: {array_max}")
                print(f"Mean: {array_mean}")
                is_talking = False
                # If pause detected, transcribe the active_chunk
                audio_bytes = save_wav_in_memory(active_chunk)
                # Save the audio to a temporary WAV file
                # Generate a dated filename for the temporary audio file
                datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                temp_filename = f"temp_audio_{datetime_str}.wav"
                
                with open(temp_filename, "wb") as f:
                    f.write(audio_bytes)
                print('wav:',temp_filename)
                # Transcribe the audio using OpenAI's Whisper ASR API
                file = open(temp_filename, "rb")
                file_descriptor = file.fileno()
    
                # Use os.fstat to get file information
                file_stat = os.fstat(file_descriptor)

                # Get the file size (length) in bytes from the stat_result object
                file_length = file_stat.st_size

                # Print the file length
                print(f"File length: {file_length} bytes")
                transcription = openai.Audio.transcribe("whisper-1", file)
                # Print transcribed text to console
                print(transcription)
                active_chunk = np.array([])


# OpenAI API key
openai.api_key = keyring.get_password("system", "openai_key")

# Create an event to signal recording thread to stop
stop_event = threading.Event()

# Set an initial noise threshold
noise_threshold = 0.01
# Start recording and transcribing on a separate thread
recording_thread = threading.Thread(target=record_and_transcribe, args=(stop_event, noise_threshold))
recording_thread.start()

# Prompt user to press Enter to stop recording
input("Recording started. Press Enter to stop recording...\n")

# Signal the recording thread to stop
stop_event.set()
recording_thread.join()


print("Recording and transcription complete.")

Started Talking 0.6483657 0.01
Is Talking 0.11556666 0.01 1.0024504661560059
Is Talking 0.06667807 0.01 2.0031418800354004
Is Talking 0.08022462 0.01 3.0131828784942627
Is Talking 0.08234954 0.01 3.997150421142578
Is Talking 0.07159886 0.01 5.007352828979492
Is Talking 0.05792681 0.01 6.000417470932007
Is Talking 0.011709888 0.01 7.001487493515015
Is Talking 0.011203566 0.01 8.011581659317017
Is Talking 0.10681094 0.01 9.004578113555908
Stopped Talking 0.0019860512 0.01 10.005640268325806
Minimum: -1.99951171875
Maximum: 1.99945068359375
Mean: 0.026057087524137258
wav: temp_audio_20230425_214552.wav
File length: 1764044 bytes
{
  "text": "You"
}
Started Talking 0.6458991 0.01
Is Talking 0.18659212 0.01 0.9844338893890381
Is Talking 0.20531994 0.01 1.9943878650665283
Is Talking 0.13358352 0.01 2.98770809173584
Is Talking 0.116432235 0.01 3.9883975982666016
Is Talking 0.01042779 0.01 4.998465061187744
Is Talking 0.10524774 0.01 5.991519927978516
Stopped Talking 0.0 0.01 6.992452621459961

Exception in thread Thread-5 (record_and_transcribe):
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 450, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 445, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib/python3.10/http/client.py", line 1374, in getresponse
    response.begin()
  File "/usr/lib/python3.10/http/client.py", line 318, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python3.10/http/client.py", line 279, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib/python3.10/ssl.py", line 1274, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib/python3.10/ssl.py", line 1130, in read
    return self._sslobj

In [None]:
# Signal the recording thread to stop
stop_event.set()
recording_thread.join()
