# Training and storing GMM models for each phoneme

In [1]:
# Required libraries to import
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
import pickle

In [2]:
# Run configuration.
# Adjust mfcc_dir, TRAIN_DF and TEST_DF to match the chosen number of GMM components
# and number of MFCC features.

# Switches
TRAIN_FLAG = False   # True: run the training cell (fit and pickle the GMMs); False: skip it
ENERGY_FLAG = False  # True: keep the energy coefficient; False: drop it from the features

# Paths
TRAIN_DF = "features/train1.hdf"
TEST_DF = "features/test1.hdf"
mfcc_dir = "models/mfcc_12_64/"

In [3]:
# Load the training table from the HDF file produced by import_timit.py.
# If the HDF file is missing, generate it first with:
#   python3 import_timit.py --timit <TIMIT_DR> --n_delta <0,1 or 2>
timit_train_df = pd.read_hdf(path_or_buf=TRAIN_DF)

In [4]:
# Preview the first 5 rows of the training table
timit_train_df.head(n=5)

Unnamed: 0,features,labels
0,"[6.827545062911363, -31.824121059207688, -6.55...",sil
1,"[6.70766922870581, -25.256157140416256, -4.668...",sil
2,"[7.159189212079523, -23.604278562432256, -13.0...",sil
3,"[8.2792130032043, -22.048277126965345, -24.025...",sil
4,"[8.51029438049915, -23.544322286337103, -29.46...",sil


In [5]:
# Preview the first 5 rows of the training table.
# NOTE(review): this repeats the preview in the cell directly above; candidate for removal.
timit_train_df.iloc[:5]

Unnamed: 0,features,labels
0,"[6.827545062911363, -31.824121059207688, -6.55...",sil
1,"[6.70766922870581, -25.256157140416256, -4.668...",sil
2,"[7.159189212079523, -23.604278562432256, -13.0...",sil
3,"[8.2792130032043, -22.048277126965345, -24.025...",sil
4,"[8.51029438049915, -23.544322286337103, -29.46...",sil


In [6]:
# Pull the per-row feature vectors and phoneme labels out of the training table
# and convert each column to a NumPy array.
train_labels = np.array(timit_train_df["labels"].to_list())
train_features = np.array(timit_train_df["features"].to_list())

In [7]:
# (rows, columns) of the feature matrix before any column is dropped
train_features.shape

(1236543, 13)

In [8]:
# Drop the energy coefficient column(s) when ENERGY_FLAG is off.
#
# `rem` lists the column positions to remove. Columns are removed one at a time, so each
# position is applied to the array as it looks AFTER the earlier removals.
# Set rem to [0], [0,12] or [0,12,24] depending on the feature count: presumably 13, 26
# and 39 features (n_delta = 0, 1, 2) -- TODO confirm; this note previously read 24 for
# the middle case.
#
# NOTE: train_features is overwritten here, so re-running this cell drops further columns.

if not ENERGY_FLAG:

    rem = [0]

    for column in rem:
        train_features = np.delete(train_features, column, 1)

In [9]:
# (rows, columns) of the feature matrix after the column-drop cell has run
train_features.shape

(1236543, 12)

In [10]:
# Distinct phoneme labels in the training set, in sorted order
sorted(set(train_labels))

['',
 'aa',
 'ae',
 'ah',
 'aw',
 'ay',
 'b',
 'ch',
 'd',
 'dh',
 'dx',
 'eh',
 'er',
 'ey',
 'f',
 'g',
 'hh',
 'ih',
 'iy',
 'jh',
 'k',
 'l',
 'm',
 'n',
 'ng',
 'ow',
 'oy',
 'p',
 'r',
 's',
 'sh',
 'sil',
 't',
 'th',
 'uh',
 'uw',
 'v',
 'w',
 'y',
 'z']

In [11]:
# Train one diagonal-covariance GMM per phoneme label and pickle each model into mfcc_dir.
# The empty-string label cannot be used as a file name, so its model is saved as 'blank.pkl';
# every other model is saved as '<label>.pkl'.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs may fit
# different models.
# NOTE(review): mfcc_dir must already exist; open() does not create directories.

if TRAIN_FLAG:
    for index, label in enumerate(sorted(set(train_labels))):
        gmm = GaussianMixture(n_components=64,covariance_type='diag')
        print('Phoneme ',index+1,' : ',label)
        # Fit only on the frames that carry this label
        gmm.fit(train_features[train_labels == label])
        file_stem = 'blank' if label == '' else label
        # Context manager guarantees the file is flushed and closed after each dump
        # (the bare open() used previously left the handle for the garbage collector).
        with open(mfcc_dir + file_stem + '.pkl', 'wb') as pkl_file:
            pickle.dump(gmm, pkl_file)

### Calculating Training Accuracy

In [12]:
# Sorted unique labels, with the empty-string label renamed to 'blank' so that each entry
# matches the stem of its pickled model file.
# Only the empty label is renamed: if the data contains no '' label, no real phoneme is
# overwritten (blindly assigning to position 0 would have replaced the first phoneme).
sortlabels = ['blank' if label == '' else label for label in sorted(set(train_labels))]
sortlabels

['blank',
 'aa',
 'ae',
 'ah',
 'aw',
 'ay',
 'b',
 'ch',
 'd',
 'dh',
 'dx',
 'eh',
 'er',
 'ey',
 'f',
 'g',
 'hh',
 'ih',
 'iy',
 'jh',
 'k',
 'l',
 'm',
 'n',
 'ng',
 'ow',
 'oy',
 'p',
 'r',
 's',
 'sh',
 'sil',
 't',
 'th',
 'uh',
 'uw',
 'v',
 'w',
 'y',
 'z']

In [13]:
# Load every pickled GMM from mfcc_dir, one per entry of sortlabels and in the same order,
# so gmm_models[k] is the model for sortlabels[k].
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
gmm_models = []

for phoneme in sortlabels:
    model_path = mfcc_dir + phoneme + '.pkl'
    with open(model_path, 'rb') as model_file:
        gmm_models.append(pickle.load(model_file))

In [14]:
# Sanity check: number of models loaded into gmm_models
len(gmm_models)

40

In [15]:
# Score every training frame under every phoneme model.
# train_scores ends up as a list with one array per model (same order as gmm_models);
# each array is the output of score_samples for all rows of train_features
# (per-sample log-likelihoods according to the scikit-learn API).

train_scores = []

for position, (phoneme, model) in enumerate(zip(sortlabels, gmm_models), start=1):
    print("MAP calculation for GMM model of phoneme ",position,': ',phoneme)
    train_scores.append(model.score_samples(train_features))

MAP calculation for GMM model of phoneme  1 :  blank
MAP calculation for GMM model of phoneme  2 :  aa
MAP calculation for GMM model of phoneme  3 :  ae
MAP calculation for GMM model of phoneme  4 :  ah
MAP calculation for GMM model of phoneme  5 :  aw
MAP calculation for GMM model of phoneme  6 :  ay
MAP calculation for GMM model of phoneme  7 :  b
MAP calculation for GMM model of phoneme  8 :  ch
MAP calculation for GMM model of phoneme  9 :  d
MAP calculation for GMM model of phoneme  10 :  dh
MAP calculation for GMM model of phoneme  11 :  dx
MAP calculation for GMM model of phoneme  12 :  eh
MAP calculation for GMM model of phoneme  13 :  er
MAP calculation for GMM model of phoneme  14 :  ey
MAP calculation for GMM model of phoneme  15 :  f
MAP calculation for GMM model of phoneme  16 :  g
MAP calculation for GMM model of phoneme  17 :  hh
MAP calculation for GMM model of phoneme  18 :  ih
MAP calculation for GMM model of phoneme  19 :  iy
MAP calculation for GMM model of phoneme 

In [16]:
# TRAINING ACCURACY

# Label names in the same sorted order used for the models. The raw labels are used here
# (the empty label stays '' rather than 'blank') so predictions compare directly with train_labels.
label_names = np.array(sorted(set(train_labels)))
# For each frame, pick the index of the model with the highest score
best_model_idx = np.argmax(np.transpose(train_scores), axis=1)
pred_labels = label_names[best_model_idx]
# Accuracy: percentage of frames whose predicted label equals the true label
n_correct = np.count_nonzero(pred_labels == train_labels)
train_acc = (n_correct / len(train_labels)) * 100
print("Training Accuracy: ",train_acc,"%")

Training Accuracy:  16.552275173609004 %


### END OF TRAINING! CHECK OUT 'test.ipynb' FOR TESTING