In [136]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm
import noisereduce as nr
import sounddevice as sd
import soundfile as sf
import pickle

In [137]:
# def noise_cancel(filename, classs, dirr):
#     data, fs = librosa.load(os.path.join(dirr,filename))
#     reduced_noise = nr.reduce_noise(audio_clip=data, noise_clip=data)
#     sf.write('./normal_word/' + classs + '/' + filename, data=reduced_noise, samplerate=fs)
# def normal_data():
#     class_names = ["nguoi", "toi", "khong", "mot" , "test_khong", "benh_nhan" , "test_toi" , "test_mot", "test_nguoi", "test_benh_nhan" ]
#     for cname in class_names:
#         files = os.listdir(os.path.join("word", cname))
#         dirfile = os.path.join("word", cname)
#         for f in files:
#             if f.endswith(".wav"):
#                 noise_cancel(f, cname, dirfile)
# normal_data()

In [138]:

def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix

def get_class_data(data_dir):
    files = os.listdir(data_dir)
    mfcc = [get_mfcc(os.path.join(data_dir,f)) for f in files if f.endswith(".wav")]
    print('data_dir: ', data_dir)
    print(f'mfcc.shape: {np.array(mfcc).shape}')
    return mfcc

def clustering(X, n_clusters=20):
    kmeans = KMeans(n_clusters=n_clusters, n_init=100, random_state=0, verbose=0)
    kmeans.fit(X)
    print("centers", kmeans.cluster_centers_.shape)
    return kmeans  


In [139]:

class_names = ["nguoi", "toi", "khong", "mot", "benh_nhan" , "test_khong" , "test_toi" , "test_mot", "test_nguoi", "test_benh_nhan" ]
dataset = {}
for cname in class_names:
    print(f"Load {cname} dataset")
    dataset[cname] = get_class_data(os.path.join("normal_word", cname))


Load nguoi dataset
data_dir:  normal_word/nguoi
mfcc.shape: (76,)
Load toi dataset
data_dir:  normal_word/toi
mfcc.shape: (76,)
Load khong dataset
data_dir:  normal_word/khong
mfcc.shape: (76,)
Load mot dataset
data_dir:  normal_word/mot
mfcc.shape: (76,)
Load benh_nhan dataset
data_dir:  normal_word/benh_nhan
mfcc.shape: (75,)
Load test_khong dataset
data_dir:  normal_word/test_khong
mfcc.shape: (25,)
Load test_toi dataset
data_dir:  normal_word/test_toi
mfcc.shape: (25,)
Load test_mot dataset
data_dir:  normal_word/test_mot
mfcc.shape: (25,)
Load test_nguoi dataset
data_dir:  normal_word/test_nguoi
mfcc.shape: (25,)
Load test_benh_nhan dataset
data_dir:  normal_word/test_benh_nhan
mfcc.shape: (25,)


In [140]:
# # Get all vectors in the datasets
# all_vectors = np.concatenate([np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
# print("vectors", all_vectors.shape)

In [141]:
# # Run K-Means algorithm to get clusters
# kmeans = clustering(all_vectors)
# pickle.dump(kmeans, open('./model/kmean.pk','wb'))
# print("centers", kmeans.cluster_centers_.shape)


In [142]:
for key, val in dataset.items():
    print(key,'\n', np.array(val).shape)


nguoi 
 (76,)
toi 
 (76,)
khong 
 (76,)
mot 
 (76,)
benh_nhan 
 (75,)
test_khong 
 (25,)
test_toi 
 (25,)
test_mot 
 (25,)
test_nguoi 
 (25,)
test_benh_nhan 
 (25,)


In [143]:
models = {}
dataset_kmean = {}

In [144]:
cname = 'nguoi'
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
# dataset_kmean[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])


hmm = hmmlearn.hmm.GMMHMM(
            n_components=9, 
            n_mix = 4, random_state=10, n_iter=500, verbose=True,
            params='mctw', init_params='mct',
        )
hmm.startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_prior=np.array([
    [0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    
])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X)
    models[cname] = hmm

training class nguoi
(1875, 36) [16, 17, 17, 28, 27, 22, 23, 20, 21, 25, 12, 13, 26, 22, 44, 22, 17, 14, 21, 22, 27, 30, 22, 20, 21, 18, 29, 37, 14, 23, 23, 19, 23, 18, 37, 15, 24, 23, 17, 31, 35, 40, 20, 30, 40, 20, 44, 28, 19, 27, 22, 19, 16, 23, 25, 20, 42, 19, 23, 29, 30, 21, 19, 28, 31, 21, 19, 27, 28, 22, 20, 33, 31, 30, 24, 50] 76


         1     -201478.4377             +nan
         2     -189238.6448      +12239.7929
         3     -187173.7169       +2064.9279
         4     -186489.8144        +683.9025
         5     -186062.2323        +427.5821
         6     -185805.7844        +256.4479
         7     -185627.3228        +178.4616
         8     -185554.8267         +72.4961
         9     -185515.8658         +38.9609
        10     -185480.3062         +35.5596
        11     -185408.0622         +72.2441
        12     -185317.4772         +90.5850
        13     -185227.0826         +90.3945
        14     -185197.9124         +29.1702
        15     -185189.8906          +8.0218
        16     -185128.0267         +61.8639
        17     -185081.0501         +46.9766
        18     -185050.4034         +30.6467
        19     -185014.2379         +36.1655
        20     -184998.5039         +15.7340
        21     -184994.4710          +4.0329
        22     -184986.2132          +8.2578
        23

In [145]:
cname = 'toi'
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
# dataset_kmean[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])


hmm = hmmlearn.hmm.GMMHMM(
            n_components=9, 
            n_mix = 4, random_state=10, n_iter=500, verbose=True,
            params='mctw', init_params='mct',
        )
hmm.startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_prior=np.array([
    [0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    
])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X)
    models[cname] = hmm

training class toi
(1736, 36) [17, 16, 18, 24, 10, 21, 14, 17, 28, 25, 16, 16, 25, 33, 15, 24, 37, 12, 27, 23, 27, 17, 16, 35, 20, 15, 14, 38, 32, 37, 33, 43, 34, 17, 29, 19, 20, 20, 17, 23, 15, 22, 17, 15, 42, 33, 27, 27, 26, 32, 14, 18, 32, 16, 14, 15, 30, 18, 23, 41, 13, 16, 17, 20, 14, 12, 47, 25, 21, 27, 15, 24, 14, 39, 13, 18] 76


         1     -185143.3738             +nan
         2     -172950.0340      +12193.3398
         3     -170370.3253       +2579.7087
         4     -169504.0803        +866.2450
         5     -169187.0177        +317.0626
         6     -168973.9828        +213.0349
         7     -168770.5464        +203.4364
         8     -168665.0101        +105.5363
         9     -168535.6258        +129.3843
        10     -168476.1751         +59.4507
        11     -168416.1661         +60.0090
        12     -168338.4745         +77.6916
        13     -168289.3062         +49.1683
        14     -168264.6685         +24.6377
        15     -168227.4065         +37.2620
        16     -168216.7665         +10.6399
        17     -168199.8049         +16.9616
        18     -168180.7198         +19.0852
        19     -168158.7554         +21.9644
        20     -168151.5966          +7.1588
        21     -168144.3903          +7.2062
        22     -168135.5569          +8.8334
        23

In [146]:
np.set_printoptions(precision=2, suppress=True)
print(models['toi'].transmat_)

[[0.89 0.   0.   0.   0.   0.06 0.01 0.   0.04]
 [0.   0.73 0.   0.01 0.   0.   0.   0.25 0.  ]
 [0.   0.01 0.78 0.   0.1  0.1  0.   0.   0.  ]
 [0.   0.   0.   0.85 0.   0.   0.   0.15 0.  ]
 [0.04 0.01 0.02 0.   0.82 0.06 0.03 0.   0.02]
 [0.03 0.   0.   0.01 0.01 0.79 0.06 0.   0.1 ]
 [0.   0.24 0.   0.01 0.   0.   0.68 0.   0.07]
 [0.   0.   0.24 0.   0.08 0.03 0.   0.66 0.  ]
 [0.   0.04 0.   0.02 0.01 0.   0.05 0.11 0.77]]


In [147]:
cname = 'khong'
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
# dataset_kmean[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])


hmm = hmmlearn.hmm.GMMHMM(
            n_components=9, 
            n_mix = 4, random_state=10, n_iter=500, verbose=True,
            params='mctw', init_params='mct',
        )
hmm.startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_prior=np.array([
    [0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    
])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X)
    models[cname] = hmm

training class khong
(1898, 36) [27, 17, 27, 27, 33, 21, 20, 23, 29, 45, 25, 26, 27, 33, 35, 20, 30, 20, 28, 16, 22, 21, 15, 28, 24, 17, 22, 24, 18, 26, 22, 23, 25, 33, 25, 14, 33, 21, 20, 27, 23, 19, 20, 22, 25, 19, 30, 31, 18, 20, 21, 19, 20, 28, 25, 17, 26, 20, 38, 40, 25, 27, 25, 34, 34, 20, 24, 23, 23, 28, 27, 25, 26, 36, 32, 19] 76


         1     -207646.5482             +nan
         2     -194633.0254      +13013.5228
         3     -191892.5687       +2740.4567
         4     -149395.5483      +42497.0205
         5     -149078.8052        +316.7431
         6     -148828.9271        +249.8781
         7     -148694.3888        +134.5383
         8     -148604.5392         +89.8496
         9     -148543.8316         +60.7076
        10     -148460.8758         +82.9558
        11     -148409.2817         +51.5942
        12     -148365.1511         +44.1305
        13     -148336.1769         +28.9742
        14     -148329.9652          +6.2118
        15     -148303.3137         +26.6515
        16     -148297.2970          +6.0167
        17     -148285.0564         +12.2406
        18     -148275.7951          +9.2613
        19     -148269.3890          +6.4061
        20     -148262.8961          +6.4929
        21     -148244.7942         +18.1019
        22     -148231.0096         +13.7846
        23

In [148]:
cname = 'mot'
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
# dataset_kmean[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])


hmm = hmmlearn.hmm.GMMHMM(
            n_components=9, 
            n_mix = 4, random_state=10, n_iter=500, verbose=True,
            params='mctw', init_params='mct',
        )
hmm.startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_prior=np.array([
    [0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    
])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X)
    models[cname] = hmm

training class mot
(1472, 36) [22, 23, 18, 11, 24, 20, 18, 18, 18, 26, 24, 16, 36, 16, 25, 19, 21, 17, 20, 20, 16, 20, 27, 19, 23, 23, 20, 12, 14, 18, 10, 18, 22, 21, 36, 16, 23, 23, 16, 18, 22, 26, 18, 12, 16, 19, 15, 15, 12, 18, 11, 20, 24, 14, 18, 18, 20, 14, 23, 22, 15, 20, 23, 17, 17, 15, 16, 21, 23, 20, 21, 21, 24, 16, 23, 16] 76


         1     -157940.8816             +nan
         2     -148182.2363       +9758.6453
         3     -144463.2704       +3718.9659
         4      -68790.9601      +75672.3103
         5      -60405.1009       +8385.8592
         6      -60173.7229        +231.3780
         7      -60066.1990        +107.5239
         8      -59967.3704         +98.8286
         9      -59872.8753         +94.4951
        10      -59780.5727         +92.3025
        11      -59689.6452         +90.9276
        12      -59655.8599         +33.7852
        13      -59630.8882         +24.9718
        14      -59621.4027          +9.4855
        15      -59607.7830         +13.6197
        16      -59599.0683          +8.7147
        17      -59588.2026         +10.8657
        18      -59577.8853         +10.3172
        19      -59560.5156         +17.3697
        20      -59522.3092         +38.2064
        21      -59477.7475         +44.5618
        22      -59445.2405         +32.5069
        23

In [149]:
cname = 'benh_nhan'


hmm = hmmlearn.hmm.GMMHMM(
            n_components=18, 
            n_mix = 4, random_state=10, n_iter=500, verbose=True,
            params='mctw', init_params='mct',
        )
hmm.startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_prior=np.array([ 
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X)
    models[cname] = hmm

training class benh_nhan
(3422, 36) [45, 43, 33, 47, 54, 37, 29, 47, 42, 67, 55, 57, 48, 39, 46, 34, 45, 44, 39, 38, 41, 46, 28, 51, 70, 42, 71, 48, 42, 59, 39, 34, 35, 67, 58, 39, 39, 46, 85, 30, 55, 34, 42, 44, 50, 26, 58, 44, 33, 44, 43, 41, 37, 37, 40, 48, 56, 35, 37, 36, 42, 48, 69, 52, 39, 25, 36, 51, 56, 74, 32, 63, 55, 54, 27] 75


         1     -383596.8479             +nan
         2     -358896.9886      +24699.8593
         3     -351788.1907       +7108.7980
         4     -349403.2849       +2384.9058
         5     -348617.4875        +785.7974
         6     -348196.9741        +420.5134
         7     -347796.4782        +400.4959
         8     -347414.5592        +381.9190
         9     -347191.5819        +222.9773
        10     -346940.7880        +250.7939
        11     -346838.0236        +102.7644
        12     -346770.1094         +67.9141
        13     -346691.6250         +78.4844
        14     -346641.4180         +50.2070
        15     -346527.7857        +113.6323
        16     -346455.2738         +72.5120
        17     -346341.0557        +114.2180
        18     -346150.2372        +190.8185
        19     -346040.8864        +109.3508
        20     -345963.6874         +77.1991
        21     -345911.0311         +52.6563
        22     -345862.6165         +48.4146
        23

In [150]:
pickle.dump(models, open('./model/models.pk','wb'))

In [151]:
# dataset["test_nguoi"] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_nguoi']])
# print(dataset["test_nguoi"])
# dataset['test_toi'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_toi']])
# dataset['test_khong'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_khong']])
# dataset['test_mot'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_mot']])
# dataset['test_benh_nhan'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_benh_nhan']])

In [153]:
print("Testing")
mapping = ["nguoi", "toi", "khong", "mot", "benh_nhan"]
class_names = ["test_nguoi", "test_toi", "test_khong", "test_mot", "test_benh_nhan"]

for true_cname in class_names:
    print(true_cname)
    score = []
    count = 0
    correct = 0
    for i in dataset[true_cname]:
        score = [model.score(i, [len(i)]) for cname, model in models.items() if cname[:4] != 'test']
        res = mapping[score.index(max(score))] 
        
        if res == true_cname[5:]:
            correct += 1
        count += 1 
    print('accuracy {} / {}'.format(correct, count))

Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance


Testing
test_nguoi


Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate

accuracy 25 / 25
test_toi


Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate

accuracy 25 / 25
test_khong


Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate

accuracy 25 / 25
test_mot


Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate

accuracy 18 / 25
test_benh_nhan


Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate mixture covariance
Degenerate

accuracy 25 / 25


In [None]:
print(100*correct/count)