In [1]:
import pandas as pd
import numpy as np
from sklearn import mixture
from sklearn.externals import joblib
import pickle
import gzip

In [2]:
# Read the stored frame from ./features/mfcc/train.hdf, then stack its
# "features" and "labels" columns into NumPy arrays (in that order).
df = pd.read_hdf("./features/mfcc/train.hdf")

features, labels = (
    np.array(df[column].tolist()) for column in ("features", "labels")
)

In [3]:
# Display the dimensions of the stacked feature array as a sanity check.
features.shape

(1236543, 39)

In [4]:
# Collect the distinct label values into a set and display how many there are.
phoneme = {label for label in labels}
len(phoneme)

40

In [5]:
# Step 6 (a)(i)
# GMM for MFCC with energy coeff with 64 mixtures for all phonemes:
# one diagonal-covariance model per phoneme, fitted on feature columns 0-12
# and pickled to ./models/MFCC/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and keep the first 13 columns.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, :13]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [6]:
# Step 6 (a)(ii)
# GMM for MFCC without energy coeff with 64 mixtures for all phonemes:
# one diagonal-covariance model per phoneme, fitted on feature columns 1-12
# and pickled to ./models/MFCC_wo_energy_coeff/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and drop column 0.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 1:13]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC_wo_energy_coeff/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [7]:
# Step 6 (b)(i)
# GMM for MFCC_delta with energy coeff with 64 mixtures for all phonemes:
# one diagonal-covariance model per phoneme, fitted on feature columns 13-25
# and pickled to ./models/MFCC_delta/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and keep columns 13..25.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 13:26]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC_delta/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [8]:
# Step 6 (b)(ii)
# GMM for MFCC_delta without energy coeff with 64 mixtures for all phonemes:
# one diagonal-covariance model per phoneme, fitted on feature columns 14-25
# and pickled to ./models/MFCC_delta_wo_energy_coeff/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and keep columns 14..25.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 14:26]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC_delta_wo_energy_coeff/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [9]:
# Step 6 (c)(i)
# GMM for MFCC_delta_delta with energy coeff with 64 mixtures for all phonemes:
# one diagonal-covariance model per phoneme, fitted on feature columns 26
# onwards and pickled to ./models/MFCC_delta_delta/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and keep columns 26 to the end.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 26:]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC_delta_delta/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [10]:
# Step 6 (c)(ii)
# GMM for MFCC_delta_delta without energy coeff with 64 mixtures for all
# phonemes: one diagonal-covariance model per phoneme, fitted on feature
# columns 27 onwards and pickled to
# ./models/MFCC_delta_delta_wo_energy_coeff/064/<phoneme>.pkl.
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Stack this phoneme's rows into a 2-D array and keep columns 27 to the end.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 27:]
    # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
    gmm = mixture.GaussianMixture(
        n_components=64, covariance_type='diag', max_iter=75
    ).fit(df_ph)
    # The context manager closes the file even if pickling raises.
    with open('./models/MFCC_delta_delta_wo_energy_coeff/064/' + ph + '.pkl', 'wb') as model_file:
        pickle.dump(gmm, model_file)
    

In [11]:
# Step 7
# GMM for MFCC without energy coeff with 2, 4, 8, 16, 32, 128, 256 mixtures for
# all phonemes. For every phoneme, one diagonal-covariance model per mixture
# count is fitted on feature columns 1-12 and pickled to
# ./models/MFCC_wo_energy_coeff/<NNN>/<phoneme>.pkl, where <NNN> is the
# zero-padded mixture count (002, 004, ..., 256).
# NOTE(review): no random_state is passed to GaussianMixture, so repeated runs
# may yield different models -- consider fixing a seed.
for ph in phoneme:
    # Extract this phoneme's rows once; the slice is reused for every model size.
    df_ph = np.asarray(df.loc[df["labels"] == ph, "features"].tolist())[:, 1:13]
    for n_mix in (2, 4, 8, 16, 32, 128, 256):
        # GaussianMixture is unsupervised; fit() ignores `y`, so only X is passed.
        gmm = mixture.GaussianMixture(
            n_components=n_mix, covariance_type='diag', max_iter=75
        ).fit(df_ph)
        model_path = './models/MFCC_wo_energy_coeff/{:03d}/'.format(n_mix) + ph + '.pkl'
        # The context manager closes the file even if pickling raises.
        with open(model_path, 'wb') as model_file:
            pickle.dump(gmm, model_file)