In [6]:
import warnings
warnings.filterwarnings('ignore')

from os import listdir
from os.path import isfile, join
import pickle

from hmmlearn import hmm

from python_speech_features import mfcc
import scipy.io.wavfile as wav
import matplotlib.pyplot as plot
import numpy

from sklearn.model_selection import train_test_split

# Get the tagged data directories

In [2]:
ghmms = {}
data = {}
datadirs = [f for f in listdir("data/") if ('.' not in f)]
print(datadirs)

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']


# Preprocess data to get MFCCs for each data file. Then split data into test and training set.

In [3]:
for directory in datadirs:
    curdir = "data/" + directory
    datafiles = [f for f in listdir(curdir) if isfile(join(curdir,f))]

    dataForWord = []
    for f in datafiles:
        (rate, signal) = wav.read(curdir + "/" + f)
        dataForWord.append(mfcc(signal, rate, winfunc=numpy.hamming))
    
    train, test = train_test_split(dataForWord, test_size=0.3)
    data[directory] = {'train': train, 'test': test}
    print(directory, len(datafiles), len(data[directory]['train']), len(data[directory]['test']))


bed 1713 1199 514
bird 1731 1211 520
cat 1733 1213 520
dog 1746 1222 524
down 2359 1651 708
eight 2352 1646 706
five 2357 1649 708
four 2372 1660 712
go 2372 1660 712
happy 1742 1219 523
house 1750 1225 525
left 2353 1647 706
marvin 1746 1222 524
nine 2364 1654 710
no 2375 1662 713
off 2357 1649 708
on 2367 1656 711
one 2370 1659 711
right 2367 1656 711
seven 2377 1663 714
sheila 1734 1213 521
six 2369 1658 711
stop 2380 1666 714
three 2356 1649 707
tree 1733 1213 520
two 2373 1661 712
up 2375 1662 713
wow 1745 1221 524
yes 2377 1663 714
zero 2376 1663 713


# Reshape data in a way that is amenable to HMMLearn

In [4]:
flatData = {}
lengths = {}
for directory in datadirs:
    flatDataForWord = []
    runLengthsForWord = []
    for fileData in data[directory]['train']:
        runLengthsForWord.append(len(fileData))
        flatDataForWord += fileData.flatten().tolist()
            
    flatData[directory] = numpy.array(flatDataForWord).reshape(-1, 13)
    lengths[directory] = runLengthsForWord
    print(directory, flatData[directory].shape, sum(lengths[directory]))

bed (116667, 13) 116667
bird (117592, 13) 117592
cat (117684, 13) 117684
dog (118709, 13) 118709
down (161040, 13) 161040
eight (160387, 13) 160387
five (161200, 13) 161200
four (161947, 13) 161947
go (161108, 13) 161108
happy (118664, 13) 118664
house (119221, 13) 119221
left (161345, 13) 161345
marvin (119366, 13) 119366
nine (161659, 13) 161659
no (161507, 13) 161507
off (160965, 13) 160965
on (160941, 13) 160941
one (161192, 13) 161192
right (161818, 13) 161818
seven (162256, 13) 162256
sheila (118683, 13) 118683
six (162398, 13) 162398
stop (162542, 13) 162542
three (161087, 13) 161087
tree (117822, 13) 117822
two (161758, 13) 161758
up (161236, 13) 161236
wow (118358, 13) 118358
yes (162218, 13) 162218
zero (163044, 13) 163044


# Train a word model with each training set that corresponds to a single word

In [15]:
for directory in datadirs:
    print(flatData[directory].shape)
    ghmm = hmm.GMMHMM(n_mix=2, n_components=2, n_iter=10)
    ghmm.fit(flatData[directory], lengths=lengths[directory])
    ghmms[directory] = ghmm

(116667, 13)


KeyboardInterrupt: 

# Output trained gmmhmms as a pickle file for later prediction

pickle.dump(ghmms, open("hmmset.p", "wb"))

In [11]:
def predict(fileData):
    logOddsToKey = {}
    for key in ghmms:
        ghmm = ghmms[key]
        logOdds = ghmms[key].score_samples(fileData)[0]
        logOddsToKey[logOdds] = key
    return logOddsToKey[max(logOddsToKey.keys())]

# Label the training data

In [12]:
labeledTestData = []
for key in data:
    for fileData in data[key]['test']:
        labeledTestData.append((fileData, key))

# Predict word for training data

In [13]:
successful = 0
for test in labeledTestData:
    prediction = predict(test[0])
    if prediction == test[1]:
        successful += 1

# Score training performance

In [14]:
print('# data points:', len(labeledTestData))
print('success rate:', float(successful) / len(labeledTestData) )

# data points: 19429
success rate: 0.31761799372072674
