In [None]:
import warnings
warnings.filterwarnings('ignore')

from os import listdir
from os.path import isfile, join
import pickle

from hmmlearn import hmm

from python_speech_features import mfcc
import scipy.io.wavfile as wav
import matplotlib.pyplot as plot
import numpy

from sklearn.model_selection import train_test_split

# Get the tagged data directories

In [None]:
# Word label -> trained model; filled in by the training cell.
ghmms = {}
# Word label -> {'train': [...], 'test': [...]}; filled in by the preprocessing cell.
data = {}

# Each sub-directory of data/ holds the recordings for one word.
# - Names containing '.' are skipped (hidden folders, stray files with extensions).
# - Regular files without an extension (e.g. README) are skipped too; otherwise
#   they would be treated as word directories and break listdir() later on.
# - listdir() returns entries in arbitrary order, so sort for reproducible runs.
datadirs = sorted(
    f for f in listdir("data/")
    if '.' not in f and not isfile(join("data/", f))
)
print(datadirs)

# Preprocess data to get MFCCs for each data file. Then split data into test and training set.

In [None]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42

for directory in datadirs:
    curdir = join("data", directory)
    # Sorted because listdir() order is arbitrary and the seeded split below
    # depends on the order of its input.
    datafiles = sorted(f for f in listdir(curdir) if isfile(join(curdir, f)))

    # One MFCC feature matrix per recording, computed with a Hamming window.
    dataForWord = []
    for f in datafiles:
        (rate, signal) = wav.read(join(curdir, f))
        dataForWord.append(mfcc(signal, rate, winfunc=numpy.hamming))

    # Hold out 30% of the recordings of each word for evaluation.
    train, test = train_test_split(dataForWord, test_size=0.3, random_state=SPLIT_SEED)
    data[directory] = {'train': train, 'test': test}
    print(directory, len(datafiles), len(data[directory]['train']), len(data[directory]['test']))


# Reshape data in a way that is amenable to HMMLearn

In [None]:
# hmmlearn's fit() takes all sequences of a word stacked into one
# (total_frames, n_features) array plus the frame count of each sequence.
flatData = {}
lengths = {}
for directory in datadirs:
    trainSequences = data[directory]['train']

    # Frame count of every training recording, in stacking order.
    lengths[directory] = [len(fileData) for fileData in trainSequences]

    # Stack along the frame axis directly in numpy; this keeps the feature
    # width of the inputs instead of forcing a hard-coded 13 columns.
    # Assumes at least one training sequence per word (numpy.concatenate
    # rejects an empty list).
    flatData[directory] = numpy.concatenate(trainSequences)

    # Sanity output: the row count must equal the sum of the run lengths.
    print(directory, flatData[directory].shape, sum(lengths[directory]))

# Train a word model with each training set that corresponds to a single word

In [None]:
# Fit one GMM-HMM per word and register it under the word's label.
for word in datadirs:
    wordFeatures = flatData[word]
    print(wordFeatures.shape)

    # 2 hidden states, 2 mixture components per state, 10 EM iterations.
    wordModel = hmm.GMMHMM(n_components=2, n_mix=2, n_iter=10)
    wordModel.fit(wordFeatures, lengths=lengths[word])

    ghmms[word] = wordModel

# Output trained gmmhmms as a pickle file for later prediction

# Use a context manager so the file is flushed and closed even if dump() fails.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute code.
with open("hmmset.p", "wb") as modelFile:
    pickle.dump(ghmms, modelFile)

In [None]:
def predict(fileData, models=None):
    """Return the label of the model that scores ``fileData`` highest.

    Parameters
    ----------
    fileData
        Feature matrix of a single recording, passed unchanged to each
        model's ``score_samples``.
    models : dict, optional
        Mapping ``label -> model``; each model must provide
        ``score_samples(X)`` whose first return value is the log-likelihood.
        Defaults to the notebook-level ``ghmms`` dictionary.

    Returns
    -------
    The key of the best-scoring model. If several models tie, the one that
    comes last in the mapping's iteration order wins.

    Raises
    ------
    ValueError
        If there are no models to choose from.
    """
    if models is None:
        models = ghmms

    bestKey = None
    bestLogOdds = None
    for key, model in models.items():
        logOdds = model.score_samples(fileData)[0]
        # '>=' keeps the historical tie rule: a later model replaces an
        # earlier one that has the same score.
        if bestLogOdds is None or logOdds >= bestLogOdds:
            bestKey, bestLogOdds = key, logOdds

    if bestLogOdds is None:
        raise ValueError("predict() needs at least one trained model")
    return bestKey

# Label the test data

In [None]:
# Pair every held-out feature matrix with the word it was recorded for.
labeledTestData = [
    (features, word)
    for word, splits in data.items()
    for features in splits['test']
]

# Predict word for test data

In [None]:
# Count the held-out samples whose predicted word matches the true label.
successful = sum(
    1
    for features, expectedWord in labeledTestData
    if predict(features) == expectedWord
)

# Score test performance

In [None]:
# Report accuracy on the held-out samples.
numPoints = len(labeledTestData)
print('# data points:', numPoints)
if numPoints:
    print('success rate:', float(successful) / numPoints)
else:
    # Without any test samples the ratio is undefined; avoid ZeroDivisionError.
    print('success rate:', 'n/a (no test data)')