In [1]:
import json
import os
import queue
import sys

import sounddevice as sd
from vosk import Model, KaldiRecognizer

In [2]:
# Show every audio device sounddevice can see, so the reader can identify
# which input (microphone) index is in use on this machine.
print("Display input/output devices")
device_table = sd.query_devices()
print(device_table)

Display input/output devices
   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 Microphone Array (Intel® Smart , MME (2 in, 0 out)
   2 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  3 Speakers (2- Realtek(R) Audio), MME (0 in, 2 out)
   4 Primary Sound Capture Driver, Windows DirectSound (2 in, 0 out)
   5 Microphone Array (Intel® Smart Sound Technology (Intel® SST)), Windows DirectSound (2 in, 0 out)
   6 Primary Sound Driver, Windows DirectSound (0 in, 2 out)
   7 Speakers (2- Realtek(R) Audio), Windows DirectSound (0 in, 2 out)
   8 Speakers (2- Realtek(R) Audio), Windows WASAPI (0 in, 2 out)
   9 Microphone Array (Intel® Smart Sound Technology (Intel® SST)), Windows WASAPI (4 in, 0 out)
  10 Speakers 1 (Realtek HD Audio output with SST), Windows WDM-KS (0 in, 2 out)
  11 Speakers 2 (Realtek HD Audio output with SST), Windows WDM-KS (0 in, 2 out)
  12 PC Speaker (Realtek HD Audio output with SST), Windows WDM-KS (2 in, 0 out)
  13 Microphone (Realtek HD Audio Mic in

In [3]:
# Look up the default input device and read its default sample rate.
# The Kaldi recognizer is constructed with this rate further down.
default_input_index = sd.default.device[0]
device_info = sd.query_devices(default_input_index, kind='input')
samplerate = int(device_info['default_samplerate'])

In [4]:
# Report which input device is the current default, together with the
# property dictionary that sounddevice returned for it.
default_device_message = "===> Initial Default Device Number:{} Description: {}"
print(default_device_message.format(sd.default.device[0], device_info))

===> Initial Default Device Number:1 Description: {'name': 'Microphone Array (Intel® Smart ', 'index': 1, 'hostapi': 0, 'max_input_channels': 2, 'max_output_channels': 0, 'default_low_input_latency': 0.09, 'default_low_output_latency': 0.09, 'default_high_input_latency': 0.18, 'default_high_output_latency': 0.18, 'default_samplerate': 44100.0}


In [5]:
# FIFO that carries raw audio chunks from the stream callback to the
# recognition loop.
q = queue.Queue()


def recordCallback(indata, frames, time, status):
    """Stream callback: enqueue one captured audio buffer.

    Parameters
    ----------
    indata : buffer-like
        The captured audio block; it is copied into an immutable ``bytes``
        object before being queued.
    frames, time : unused
        Accepted only because the stream passes them to the callback.
    status : flag-like
        When truthy it is reported on ``sys.stderr``.
    """
    if status:
        print(status, file=sys.stderr)
    audio_chunk = bytes(indata)
    q.put(audio_chunk)

In [6]:
# Build the Vosk model and the recognizer that will consume microphone audio.
#
# The model directory is no longer tied to one machine: set the VOSK_MODEL_DIR
# environment variable to point at an unpacked Vosk model folder. When the
# variable is unset (or empty) the original local path is used, so existing
# setups keep working unchanged.
DEFAULT_MODEL_DIR = r"C:\Users\admin\Desktop\audio to text\vosk-model-small-en-us-0.15"
model_dir = os.environ.get("VOSK_MODEL_DIR") or DEFAULT_MODEL_DIR

# Fail early with an actionable message rather than letting model loading
# fail on a missing folder.
if not os.path.isdir(model_dir):
    raise FileNotFoundError(
        "Vosk model directory not found: {!r}. Set the VOSK_MODEL_DIR "
        "environment variable to the unpacked model folder.".format(model_dir)
    )

print("===> Build the model and recognizer objects.  This will take a few minutes.")
model = Model(model_dir)
# The recognizer must be told the sample rate of the audio it will receive;
# `samplerate` is the default rate of the default input device read earlier.
recognizer = KaldiRecognizer(model, samplerate)
# Per the Vosk API, SetWords toggles word-level detail in results; it is
# switched off here so each result carries only the recognized text.
recognizer.SetWords(False)

===> Build the model and recognizer objects.  This will take a few minutes.


In [None]:
# Capture microphone audio and print each recognized utterance until the
# kernel is interrupted (Ctrl+C / "Interrupt kernel").
print("===> Begin recording. Press Ctrl+C to stop the recording ")
try:
    # Open the stream at exactly the rate the recognizer was constructed with.
    # Leaving `samplerate` implicit only works while sounddevice's defaults
    # happen to match the value read earlier; if they drift, recognition
    # silently degrades.
    with sd.RawInputStream(samplerate=samplerate,
                           dtype='int16',
                           channels=1,
                           callback=recordCallback):
        while True:
            # Blocks until the stream callback has queued another audio chunk.
            data = q.get()
            # A truthy return means the recognizer has a result ready to read.
            if recognizer.AcceptWaveform(data):
                recognizerResult = recognizer.Result()
                # Parse the JSON result so the recognized text can be inspected.
                resultDict = json.loads(recognizerResult)
                if resultDict.get("text"):
                    print(recognizerResult)
                else:
                    print("no input sound")

except KeyboardInterrupt:
    # Flush whatever the recognizer is still holding so the words spoken just
    # before the interrupt are not lost.
    finalResult = recognizer.FinalResult()
    if json.loads(finalResult).get("text"):
        print(finalResult)
    print('===> Finished Recording')
except Exception as e:
    # Report the exception type along with the message, on stderr, so a
    # failure is not mistaken for normal output.
    print("{}: {}".format(type(e).__name__, e), file=sys.stderr)

===> Begin recording. Press Ctrl+C to stop the recording 
{
  "text" : "hello i'm here to say i'm working in sub exciting"
}
{
  "text" : "ah my"
}
{
  "text" : "hello hassan hello"
}
{
  "text" : "how are you"
}
{
  "text" : "i'm fine and do"
}
{
  "text" : "are you looking for an opportunity up there will be sick they will engineer"
}
{
  "text" : "yeah it's really good so good"
}
{
  "text" : "oh okay"
}
{
  "text" : "matchmaking is done"
}
{
  "text" : "my checking is done know mad suki"
}
{
  "text" : "oh okay fine"
}
no input sound
