# PocketSphinx Speech Recognition

#### This notebook walks through a working speech recognition example, based on the example given at https://github.com/cmusphinx/pocketsphinx-python

In [1]:
# Notebook by Sara Collins
# Deviates from the upstream example toward the end, in order to build
# a single string out of the recognized words

In [2]:
# Import statements go here
import os
import wave

import pocketsphinx as ps
import sphinxbase

In [3]:
# Path to the audio file that will be transcribed.
# Replace this with the location of your own recording.
wavFile = '~/wav_sample.wav'

# Locations of the speech recognition model files that pocketsphinx will
# load. Each one is handed to the decoder configuration further down:
#   hmdir -> the '-hmm' option
#   lmdir -> the '-lm' option
#   dictd -> the '-dict' option
# Adjust them if your models are installed somewhere else.
hmdir = '/usr/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k'
lmdir = '/usr/share/pocketsphinx/model/lm/en_US/hub4.5000.DMP'
dictd = '/usr/share/pocketsphinx/model/lm/en_US/cmu07a.dic'

In [4]:
# Create a new PocketSphinx configuration object by asking the Decoder
# class for its default configuration. The model file paths are written
# into this object in the next cell.
config = ps.Decoder.default_config()

In [5]:
# Point the configuration at the model files chosen earlier.
# Each pair is (pocketsphinx option name, path to store under it); the
# options are applied in the order listed.
for option, path in (('-hmm', hmdir), ('-lm', lmdir), ('-dict', dictd)):
    config.set_string(option, path)

In [14]:
# Create the decoder from the configuration built above.
# This step is required, not redundant: every later cell (start_utt,
# process_raw, end_utt, seg, hyp) operates on this `decoder` variable.
decoder = ps.Decoder(config)

In [15]:
# Start the utterance in the decoder. It is closed by the matching
# decoder.end_utt() call once all of the audio has been processed.
decoder.start_utt()

In [16]:
# Open the audio file as a binary stream.
# open() does not expand a leading '~' by itself, so resolve the home
# directory first; otherwise a path such as '~/wav_sample.wav' fails with
# "No such file or directory".
stream = open(os.path.expanduser(wavFile), 'rb')

In [17]:
# Read the audio and feed it to the decoder in chunks of 1024 bytes
# (stream.read counts bytes, not samples).
# NOTE(review): the file is read as raw bytes, so if it carries a RIFF/WAV
# header that header is passed to the decoder as if it were audio --
# consider reading the frames through the `wave` module instead.
try:
    # iter(callable, sentinel) keeps calling read() until it returns the
    # empty string, i.e. until the end of the file is reached.
    for buf in iter(lambda: stream.read(1024), b''):
        decoder.process_raw(buf, False, False)
finally:
    # Always release the file handle, even if decoding raises.
    stream.close()
decoder.end_utt()

In [18]:
# Print out the individual segment words.
# The message is built as ONE string before printing: under Python 2,
# `print (a, b)` prints the repr of a tuple (parentheses and quotes
# included), whereas a single formatted string prints the same way on
# both Python 2 and Python 3.
segment_words = [seg.word for seg in decoder.seg()]
print('Best hypothesis segments: {}'.format(segment_words))

('Best hypothesis segments: ', ['<s>', 'that(2)', "don't", 'call', '<sil>', 'it', 'took', 'over', 'it', 'on', 'what(2)', 'occurred', '<sil>', 'not', 'all', '<sil>', 'look', "you're", '<sil>', '<sil>', "won't", 'help', 'but', 'overall', '<sil>', '<sil>', '<sil>', 'get(2)', 'of', 'took', '<sil>', 'an(2)', 'average(3)', 'trip', '<sil>', '<sil>', 'and', 'it', 'runs', 'at', 'what(2)', 'it', '<sil>', 'you', "can't", 'explain', 'it', '<sil>', 'took', 'but', "you'd(2)", 'into(2)', 'but', '<sil>', "don't", 'have', 'then', 'as', '<sil>', 'that(2)', 'different(2)', 'the', 'open', '<sil>', '<sil>', '<sil>', '</s>'])


In [19]:
# See where the start and end frames were for each word.
# Each line shows: word, probability score, start frame, end frame.
# print is called with a single formatted string so the cell runs on both
# Python 2 and Python 3 (the bare `print x, y` statement is a SyntaxError
# on Python 3). The three spaces after the word keep the column layout
# that the original print statement produced.
for seg in decoder.seg():
    print('{}   {} {} {}'.format(seg.word, seg.prob, seg.start_frame, seg.end_frame))

<s>   -19 1652 1692
that(2)   -1738 1693 1745
don't   -17 1746 1834
call   -18678 1835 1857
<sil>   -17 1858 1909
it   -21049 1910 1917
took   -39071 1918 1932
over   -25872 1933 1945
it   -43 1946 1997
on   -352 1998 2005
what(2)   -1785 2006 2039
occurred   -21763 2040 2081
<sil>   -6 2082 2132
not   -28744 2133 2174
all   -3752 2175 2191
<sil>   -4 2192 2209
look   -9 2210 2261
you're   -26365 2262 2283
<sil>   -275 2284 2293
<sil>   -3 2294 2330
won't   -51368 2331 2370
help   -29106 2371 2392
but   -8137 2393 2424
overall   -2053 2425 2457
<sil>   -3230 2458 2464
<sil>   -3 2465 2482
<sil>   -109 2483 2505
get(2)   -39063 2506 2531
of   -39642 2532 2541
took   -47857 2542 2566
<sil>   -33 2567 2577
an(2)   -27897 2578 2588
average(3)   -23268 2589 2616
trip   -38784 2617 2652
<sil>   -11446 2653 2658
<sil>   0 2659 2713
and   -19705 2714 2724
it   -17028 2725 2738
runs   -16633 2739 2767
at   -2 2768 2800
what(2)   -11589 2801 2824
it   -1474 2825 2843
<sil>   4 2844 2901
you   -2

In [20]:
# Output the decoder's best hypothesis as a single string.
# decoder.hyp() may return None when nothing was recognized, so guard
# against that instead of failing with an AttributeError on `.hypstr`;
# an empty string is shown in that case.
hypothesis = decoder.hyp()
hypothesis.hypstr if hypothesis is not None else ''

"that don't call it took over it on what occurred not all look you're won't help but overall get of took an average trip and it runs at what it you can't explain it took but you'd into but don't have then as that different the open"