# DEEE725 Speech Signal Processing Lab
### 2023 Spring, Kyungpook National University 
### Instructor: Gil-Jin Jang

# Project 1 Isolated digit recognition in noisy environments

- Assigned: 2023/04/21
- Due: 2023/05/04
- Required dataset: 
    1. [training data](lab05.pdf)
    1. [validation data](lab05.md)
    1. [test data](lab05.md)

----
# import packages, define analysis parameters and draw parameters, audio file preparation, etc.

In [1]:
# import necessary pacakages
# strange issue: keep the import order to prevent matplotlib error
#  import matplotlib -> librosa -> pyplot -> librosa.display
import sys
import numpy as np
import matplotlib
import librosa
from matplotlib import pyplot as plt
import librosa.display

#from scipy.io import wavfile
from scipy import signal
from scipy.fft import fftshift

# display wav files
import IPython #라이브러리 가져오기

오디오 파일들의 경로 정의

In [2]:
# add '/' if path is not a null string : 경로 파일 만드는거
def addpath(path, file):
    if len(path) == 0: 
        return file
    else:
        return path + '/' + file

신호 분석과 스펙트럼을 그리기 위한 다음의 parameter 들을 정의한다.
입력 파일의 sampling frequency 를 이용하여 shift size 를 sample 수로 정의하기 위해 사용된다.
- `Ts`: shift length in seconds, default 0.01 sec = 10 ms. 
- `Tf`: frame length in seconds, default 0.02 sec = 20 ms

In [3]:
# parameters for signal analysis
# Fs = 16000  native sampling frequency (wav file 에 정의된 것) 을 사용하면 필요 없음
Ts = 0.01   # 10 ms shift size
Tf = 0.02   # 20 ms frame size 타임 샘플링

spectrum 을 그리기 위한 parameters.
- `cmap_plot`: colormap. default value is `pyplot.cm.bone_r` (최소값 흰색, 최대값 검은색 의 gray scale) 

In [4]:
# parameters for drawing
#cmap_plot = plt.cm.bone # default colormap for spectrogram, gray
cmap_plot = plt.cm.bone_r # default colormap for spectrogram, gray, reversed
#cmap_plot = plt.cm.plasma 
#cmap_plot = plt.cm.inferno
#FIG_SIZE = (15,10)   # obsolete
FIG_SIZE = (8,3) #Figure 사이즈

---
### 이전 lab 들에서 정의한 함수들

In [5]:
# draw spectrogram
from gjdrawspectrogram3 import drawspectrogram3

# linear phase FIR filter design from magnitudes of the frequency components
from gjfiroverlapadd import getLPHFIRFFT

# trapezoidal overlap add for FIR filtering
from gjfiroverlapadd import firoverlapadd

# save audio in wav format
import gjwavfile as wav

---
### load speech and noise

오디오 파일이 16 kHz, mono 인지 확인 

In [6]:
x = np.zeros(10)
print(len(x), x.ndim)
x = np.zeros((10,2))
print(len(x), x.ndim) #zero로 채워주는 배열 생성 (쓰지는 않는다)

10 1
10 2


In [7]:
def check_audio_file(file, defFs, checkMono):
    signal, Fs = librosa.load(file, sr=None, mono=False)
    if defFs != Fs:
        print('sampling rate mismatch, %d != %d for file %s'%(defFs, Fs, file))
        return False
    elif checkMono == True:
        if signal.ndim != 1:
            print('not mono file %s, shape='%(file), signal.shape)
            return False
        return True
    elif size(signal) <= 0:
        print('wrong audio file %s, shape='%(file), signal.shape)
    else:
        return True

def convert_audio_file(file, forceFs, forceMono):
    signal, Fs = librosa.load(file, sr=None, mono=False)
    changed = False
    if forceFs != Fs:
        print('sampling rate mismatch, %d != %d for file %s'%(forceFs, Fs, file))
        signal, Fs = librosa.load(file, sr=forceFs, mono=False)
        changed = True
    elif forceMono == True:
        if signal.ndim != 1:
            print('not mono file %s, shape='%(file), signal.shape)
            signal, Fs = librosa.load(file, sr=forceFs, mono=True)
            changed = True
    elif size(signal) <= 0:
        print('wrong audio file %s, shape='%(file), signal.shape)
        return False
    if changed == True:
        wav.writewav(file, Fs, signal, maxval=1.0)
        print('updating', file)
    return changed #오디오파일이 제대로 된건지, 이상한 부분은 없는지 check

In [8]:
trainroot = 'segmented-train'
'''
labels_all = {'11jeonghy', 
                'Dandyst', 
                'InkooJeon',
                'YouYeNa',
                'chlee',
                'deokkyukwon',
                'do',
                'kyeong',
                'ohjihyeon',
                'son',
               }
''' 
labels_train = {'11jeonghy', 
                'Dandyst', 
                'InkooJeon',
                'shin3875',
                'YouYeNa',
                'son',
               } #

# check
Fs = 16000 
for subname in labels_train:
    num_files = 0
    num_false_files = 0
    for w in range(10):
        for trial in range(10):
            basename = '%d/kdigits%d-%d.wav'%(w,trial,w)
            file = addpath(trainroot, addpath(subname, basename)) #특정 파일을 가져오는 명령어
            num_files += 1
            if check_audio_file(file, Fs, True) == False:
                num_false_files += 1 #파일이 맞는지 check
    print('%s: false %d / %d\n'%(subname, num_false_files, num_files))

InkooJeon: false 0 / 100

Dandyst: false 0 / 100

11jeonghy: false 0 / 100

son: false 0 / 100

YouYeNa: false 0 / 100

shin3875: false 0 / 100



In [9]:
valroot = 'segmented-val'
valclean = addpath(valroot, 'org')
labels_val = {
                'chlee',
                'do',
                'kyeong',
               }

# check
Fs = 16000
for subname in labels_val:
    num_files = 0
    num_false_files = 0
    for w in range(10):
        for trial in range(10):
            basename = '%d/kdigits%d-%d.wav'%(w,trial,w)
            file = addpath(valclean, addpath(subname, basename))
            num_files += 1
            if check_audio_file(file, Fs, True) == False:
                num_false_files += 1
    print('%s: false %d / %d\n'%(subname, num_false_files, num_files)) #Validation 파일도 확인

chlee: false 0 / 100

kyeong: false 0 / 100

do: false 0 / 100



---
### HMM training and test 함수 정의

In [10]:
import numpy as np
import matplotlib.pyplot as plt
#from scikits.talkbox.features import mfcc
#librosa.feature.mfcc(*, y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)[source]
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp
from time import time

warnings.filterwarnings("ignore")

############################################################################################## 
# extract MFCC features
def extmfcc(file):
    samplerate, d = wavfile.read(file)
    #features.append(mfcc(d, nwin=int(samplerate * 0.03), fs=samplerate, nceps= 6)[0])
    x = np.float32(d)
    hop=samplerate//100
    mc = mfcc(y=x, sr=samplerate, n_mfcc=num_mfcc, hop_length=hop, win_length=hop*2)
    return np.transpose(mc, (1,0))

def initByBakis(inumstates, ibakisLevel):
    startprobPrior = np.zeros(inumstates)
    startprobPrior[0: ibakisLevel - 1] = 1/float((ibakisLevel - 1))
    transmatPrior = getTransmatPrior(inumstates, ibakisLevel)
    return startprobPrior, transmatPrior

def getTransmatPrior(inumstates, ibakisLevel):
    transmatPrior = (1 / float(ibakisLevel)) * np.eye(inumstates)

    for i in range(inumstates - (ibakisLevel - 1)):
        for j in range(ibakisLevel - 1):
            transmatPrior[i, i + j + 1] = 1. / ibakisLevel

    for i in range(inumstates - ibakisLevel + 1, inumstates):
        for j in range(inumstates - i - j):
            transmatPrior[i, i + j] = 1. / (inumstates - i)

    return transmatPrior


############################################################################################## 
# hyperparameters - CHANGE THEM TO IMPROVE PERFORMANCE
# 1. number of MFCC (feature dimension)
num_mfcc = 6
#num_mfcc = 10
#num_mfcc = 13
# 2. Parameters needed to train GMMHMM
m_num_of_HMMStates = 3  # number of states
m_num_of_mixtures = 2  # number of mixtures for each hidden state
m_covarianceType = 'diag'  # covariance type
m_n_iter = 10  # number of iterations
m_bakisLevel = 2
m_startprobPrior, m_transmatPrior = initByBakis(m_num_of_HMMStates,m_bakisLevel)
print("StartProbPrior="); print(m_startprobPrior)
print("TransMatPrior="); print(m_transmatPrior)


############################################################################################## 
# acoustic model definition
class SpeechModel:
    def __init__(self,Class,label):
        self.traindata = np.zeros((0,num_mfcc))
        self.Class = Class
        self.label = label
        self.model  = hmm.GMMHMM(n_components = m_num_of_HMMStates, n_mix = m_num_of_mixtures, \
                transmat_prior = m_transmatPrior, startprob_prior = m_startprobPrior, \
                covariance_type = m_covarianceType, n_iter = m_n_iter)

##################################################################################
# folder structure:
#  ${rootpath} / ${speaker_name} / m:0-9 / ${tag}[t:0-${numtrials}]-[m:0-9]
#    m:0-9 model number
#    t:0-{numtrials} trial number
#  example: train_digits('segmented-train', {'gjang', 'do', 'son'}, 'kdigis', 10) 
#           will train with
#    segmented-train/gjang/0/kdigits0-0.wav
#    segmented-train/gjang/0/kdigits1-0.wav
#    ...
#    segmented-train/son/9/kdigits8-9.wav
#    segmented-train/son/9/kdigits9-9.wav
##################################################################################
def train_digits(rootpath, speakers, tag, num_trials=10):    
    ############################################################################################## 
    # 1. find files
    #    for user "gjang", digit 2, recording trial 0 (1st)
    #    "segmented/gjang/2/kdigits0-2.wav"
    # 2. extract MFCC features for training and testing
    #    for each digit, indexes 4 and 9 for test, and the rest for training

    #fpaths = []
    #labels = []
    spoken = []
    m_trainingsetfeatures = []
    m_trainingsetlabels = []

    count = 0
    for username in speakers:
        apath2 = addpath(rootpath, username)    # example: segmented/gjang
        for ii in range(10):   #dnum in os.listdir(apath2):
            dnum = str(ii)
            apath3 = addpath(apath2, dnum)     # example: segmented/gjang/2
            if dnum not in spoken:
                spoken.append(dnum)
            for trial in range(num_trials):
                file = addpath(apath3,"{}{}-{}.wav".format(tag,trial,dnum))      # segmented/gjang/2/kdigits0-2.wav
                mc = extmfcc(file)

                # display file names for the first 20 files only
                count += 1
                if count <= 20:
                    print(file, dnum, end=' '); print(mc.shape, end=' ')
                elif count == 21:
                    print('...'); print('')

                m_trainingsetfeatures.append(mc)
                m_trainingsetlabels.append(dnum)

    print('Words spoken:', spoken)
    #print("number of labels and features = %d, %d" % ( len(labels), len(features) ))
    #print("feature shape = ", end='')
    #print(features[0].shape)

    ############################################################################################## 
    ntrain = len(m_trainingsetlabels)

    print("[training] number of labels and features = %d, %d" % 
            ( len(m_trainingsetlabels), len(m_trainingsetfeatures)) )
    print ('Loading data completed')

    ############################################################################################## 
    # model initialization
    gmmhmmindexdict = {}
    index = 0
    for word in spoken:
        gmmhmmindexdict[word] = index
        index = index +1

    ############################################################################################## 
    # training GMMHMM Models 
    start = time()

    speechmodels = [None] * len(spoken)
    for key in gmmhmmindexdict:
        speechmodels[gmmhmmindexdict[key]] = SpeechModel(gmmhmmindexdict[key],key)

    for i in range(0,len(m_trainingsetfeatures)):
         for j in range(0,len(speechmodels)):
             if int(speechmodels[j].Class) == int(gmmhmmindexdict[m_trainingsetlabels[i]]):
                speechmodels[j].traindata = np.concatenate((speechmodels[j].traindata , m_trainingsetfeatures[i]))

    for speechmodel in speechmodels:
        speechmodel.model.fit(speechmodel.traindata)

    print ('Training completed -- {0} GMM-HMM models are built for {0} different types of words'.format(len(spoken)))
    print('time elapsed: %.2f seconds' % ( time() - start ))
    print (" "); print(" ")
    
    return speechmodels, gmmhmmindexdict

    '''
    ############################################################################################## 
    # testing
    print("Prediction with training data started")
    m_PredictionlabelList = []

    for i in range(0,len(m_testingsetfeatures)):
        scores = []
        for speechmodel in speechmodels:
             scores.append(speechmodel.model.score(m_testingsetfeatures[i]))
        id  = scores.index(max(scores))
        m_PredictionlabelList.append(speechmodels[id].Class)
        print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

    accuracy = 0.0
    count = 0
    print("")
    print("Prediction for Testing DataSet:")

    for i in range(0,len(m_testingsetlabels)):
        print( "Label"+str(i+1)+":"+m_testingsetlabels[i])
        if gmmhmmindexdict[m_testingsetlabels[i]] == m_PredictionlabelList[i]:
           count = count+1

    accuracy = 100.0*count/float(len(m_testingsetlabels))

    print("")
    print("accuracy ="+str(accuracy))
    print("")

    ############################################################################################## 
    # end of testing
    ############################################################################################## 
    '''#교수님 파일

StartProbPrior=
[1. 0. 0.]
TransMatPrior=
[[0.5 0.5 0. ]
 [0.  0.5 0.5]
 [0.  0.  1. ]]


In [11]:
import numpy as np
import matplotlib.pyplot as plt
#from scikits.talkbox.features import mfcc
#librosa.feature.mfcc(*, y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)[source]
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp
from time import time

warnings.filterwarnings("ignore")

##################################################################################
# folder structure:
#  ${rootpath} / ${speaker_name} / m:0-9 / ${tag}[t:0-${numtrials}]-[m:0-9]
#    m:0-9 model number
#    t:0-{numtrials} trial number
#  example: train_digits('segmented-train', {'gjang', 'do', 'son'}, 'kdigis', 10) 
#           will train with
#    segmented-train/gjang/0/kdigits0-0.wav
#    segmented-train/gjang/0/kdigits1-0.wav
#    ...
#    segmented-train/son/9/kdigits8-9.wav
#    segmented-train/son/9/kdigits9-9.wav
##################################################################################
def validation_digits(speechmodels, gmmhmmindexdict, rootpath, speakers, tag, num_trials=10):    

    ############################################################################################## 
    # 1. find files
    #    for user "gjang", digit 2, recording trial 0 (1st)
    #    "segmented/gjang/2/kdigits0-2.wav"
    # 2. extract MFCC features for training and testing
    #    for each digit, indexes 4 and 9 for test, and the rest for training

    #fpaths = []
    #labels = []
    spoken = []
    m_features = []
    m_labels = []

    count = 0
    for username in speakers:
        apath2 = addpath(rootpath, username)    # example: segmented/gjang
        for ii in range(10):   #dnum in os.listdir(apath2):
            dnum = str(ii)
            apath3 = addpath(apath2, dnum)     # example: segmented/gjang/2
            if dnum not in spoken:
                spoken.append(dnum)
            for trial in range(num_trials):
                file = addpath(apath3,"{}{}-{}.wav".format(tag,trial,dnum))      # segmented/gjang/2/kdigits0-2.wav
                mc = extmfcc(file)

                # display file names for the first 20 files only
                count += 1
                if count <= 20:
                    print(file, dnum, end=' '); print(mc.shape, end=' ')
                elif count == 21:
                    print('...'); print('')

                m_features.append(mc)
                m_labels.append(dnum)

    print('Words spoken:', spoken)
    #print("number of labels and features = %d, %d" % ( len(labels), len(features) ))
    #print("feature shape = ", end='')
    #print(features[0].shape)

    ############################################################################################## 
    print("[validation] number of labels and features = %d, %d" % ( len(m_labels), len(m_features)) )
    print ('Loading data completed')

    ############################################################################################## 
    # testing
    print("Prediction started")
    m_PredictionlabelList = []

    for i in range(0,len(m_features)):
        scores = []
        for speechmodel in speechmodels:
             scores.append(speechmodel.model.score(m_features[i]))
        id  = scores.index(max(scores))
        m_PredictionlabelList.append(speechmodels[id].Class)
        #print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

    accuracy = 0.0
    count = 0
    print("")
    print("Prediction for Testing DataSet:")

    for i in range(0,len(m_labels)):
        #print( "Label"+str(i+1)+":"+m_labels[i])
        if gmmhmmindexdict[m_labels[i]] == m_PredictionlabelList[i]:
           count = count+1

    accuracy = 100.0*count/float(len(m_labels))

    print("")
    print("accuracy ="+str(accuracy))
    print("")

    ############################################################################################## 
    # end of testing 교수님 파일
    ############################################################################################## 

In [12]:
speechmodels, gmmhmmindexdict = train_digits(trainroot, labels_train, 'kdigits', num_trials=10) #Train 시키는것

segmented-train/InkooJeon/0/kdigits0-0.wav 0 (255, 6) segmented-train/InkooJeon/0/kdigits1-0.wav 0 (233, 6) segmented-train/InkooJeon/0/kdigits2-0.wav 0 (265, 6) segmented-train/InkooJeon/0/kdigits3-0.wav 0 (240, 6) segmented-train/InkooJeon/0/kdigits4-0.wav 0 (249, 6) segmented-train/InkooJeon/0/kdigits5-0.wav 0 (263, 6) segmented-train/InkooJeon/0/kdigits6-0.wav 0 (272, 6) segmented-train/InkooJeon/0/kdigits7-0.wav 0 (263, 6) segmented-train/InkooJeon/0/kdigits8-0.wav 0 (254, 6) segmented-train/InkooJeon/0/kdigits9-0.wav 0 (230, 6) segmented-train/InkooJeon/1/kdigits0-1.wav 1 (250, 6) segmented-train/InkooJeon/1/kdigits1-1.wav 1 (242, 6) segmented-train/InkooJeon/1/kdigits2-1.wav 1 (248, 6) segmented-train/InkooJeon/1/kdigits3-1.wav 1 (252, 6) segmented-train/InkooJeon/1/kdigits4-1.wav 1 (228, 6) segmented-train/InkooJeon/1/kdigits5-1.wav 1 (257, 6) segmented-train/InkooJeon/1/kdigits6-1.wav 1 (252, 6) segmented-train/InkooJeon/1/kdigits7-1.wav 1 (260, 6) segmented-train/InkooJeon/1/

In [13]:
validation_digits(speechmodels, gmmhmmindexdict, trainroot, labels_train, 'kdigits', num_trials=10)
validation_digits(speechmodels, gmmhmmindexdict, valclean, labels_val, 'kdigits', num_trials=10)

segmented-train/InkooJeon/0/kdigits0-0.wav 0 (255, 6) segmented-train/InkooJeon/0/kdigits1-0.wav 0 (233, 6) segmented-train/InkooJeon/0/kdigits2-0.wav 0 (265, 6) segmented-train/InkooJeon/0/kdigits3-0.wav 0 (240, 6) segmented-train/InkooJeon/0/kdigits4-0.wav 0 (249, 6) segmented-train/InkooJeon/0/kdigits5-0.wav 0 (263, 6) segmented-train/InkooJeon/0/kdigits6-0.wav 0 (272, 6) segmented-train/InkooJeon/0/kdigits7-0.wav 0 (263, 6) segmented-train/InkooJeon/0/kdigits8-0.wav 0 (254, 6) segmented-train/InkooJeon/0/kdigits9-0.wav 0 (230, 6) segmented-train/InkooJeon/1/kdigits0-1.wav 1 (250, 6) segmented-train/InkooJeon/1/kdigits1-1.wav 1 (242, 6) segmented-train/InkooJeon/1/kdigits2-1.wav 1 (248, 6) segmented-train/InkooJeon/1/kdigits3-1.wav 1 (252, 6) segmented-train/InkooJeon/1/kdigits4-1.wav 1 (228, 6) segmented-train/InkooJeon/1/kdigits5-1.wav 1 (257, 6) segmented-train/InkooJeon/1/kdigits6-1.wav 1 (252, 6) segmented-train/InkooJeon/1/kdigits7-1.wav 1 (260, 6) segmented-train/InkooJeon/1/

---
### noise 추가

In [14]:
audioinputpath = '../audio'
noisefile  = addpath(audioinputpath, 'car.wav')
wnoisefile  = addpath(audioinputpath, 'car_wideband.wav')   # 넓은 주파수 대역에 분포한 잡음

Fs=16000
noise, _ = librosa.load(noisefile, sr=Fs, mono=True)
wnoise, _ = librosa.load(wnoisefile, sr=Fs, mono=True)
# sr: target sampling rate. ‘None’ uses the native sampling rate
# mono = True: convert signal to mono

print(noisefile, noise.shape, noise)
print(wnoisefile, wnoise.shape, wnoise)

Ns = int(Fs*Ts)    # shift number of samples
Nf = int(Fs*Tf)    # frame number of samples
NFFT = int(2**(np.ceil(np.log2(Nf))))   # Nf보다 크거나 같은 2의 거듭제곱을 NFFT 로 정의
hNo = NFFT//2+1
print('Fs = %d, Ns = %d, Nf = %d, NFFT = %d, hNo = %d' % (Fs, Ns, Nf, NFFT, hNo)) #audio/car 파일 삽입

../audio/car.wav (175745,) [-0.01342773 -0.0222168  -0.02905273 ... -0.0390625  -0.03930664
 -0.04086304]
../audio/car_wideband.wav (175745,) [-0.05984497 -0.14807129 -0.14700317 ... -0.10241699 -0.10253906
 -0.09594727]
Fs = 16000, Ns = 160, Nf = 320, NFFT = 512, hNo = 257


__generate noisy speech with various SNRs__
- 음성과 잡음의 상대적 크기에 따라 잡음의 효과를 time domain, spectrogram, 그리고 들어서 확인해 본다.
- mixed input $x[t]$ 를 다음과 같이 생성한다.
$$ x[t] = s[t] + 10^{-r/20} \frac{\sigma_{s}}{\sigma_{n}} n[t] $$

In [15]:
def generate_mixed_signals_2(speech, noise, SNRs, isdraw=False):
    std_s = np.sqrt(np.mean(speech**2))
    std_n = np.sqrt(np.mean(noise[:len(speech)]**2))
    mixedSig = []
    for snr in SNRs:
        gain = np.power(10, -snr/20)
        gn = noise[:len(speech)]/std_n*std_s*gain
        m = speech + gn
        mixedSig.append(m)

    return mixedSig 

In [16]:
audioroot = valroot
audioclean = valclean
labels = labels_val
noisyroots = [addpath(audioroot,'nbnSNR'), addpath(audioroot,'wbnSNR')] 
SNRs = [10, 0, -10] 

for subname in labels:
    num_files = 0
    for w in range(10):
        for trial in range(10):
            basename = '%d/kdigits%d-%d.wav'%(w,trial,w)
            infile = addpath(audioclean, addpath(subname, basename))            
            num_files += 1
            
            signal, Fs = librosa.load(infile, sr=Fs, mono=True)
            nbnsig = generate_mixed_signals_2(signal, noise, SNRs, False)
            wbnsig = generate_mixed_signals_2(signal, wnoise, SNRs, False)
            noisy = [nbnsig, wbnsig] #노이즈 넣는 부분
            
            for jj in range(len(noisy)):
                for n in range(len(noisy[jj])):
                    outfile = addpath('%s%d'%(noisyroots[jj],SNRs[n]), addpath(subname, basename))
                    wav.writewav(outfile, Fs, noisy[jj][n], maxval=1.0) #저장

outputpaths = []
for jj in range(len(noisy)):
    for n in range(len(noisy[jj])):
        outputpaths.append('%s%d'%(noisyroots[jj],SNRs[n])) #노이즈 추가된 파일 경로 가져오기

Noise model test

In [17]:
for path in outputpaths:
    print('--------------------------------')
    print('testing', path)
    validation_digits(speechmodels, gmmhmmindexdict, path, labels, 'kdigits', num_trials=10)

--------------------------------
testing segmented-val/nbnSNR10
segmented-val/nbnSNR10/chlee/0/kdigits0-0.wav 0 (98, 6) segmented-val/nbnSNR10/chlee/0/kdigits1-0.wav 0 (105, 6) segmented-val/nbnSNR10/chlee/0/kdigits2-0.wav 0 (94, 6) segmented-val/nbnSNR10/chlee/0/kdigits3-0.wav 0 (96, 6) segmented-val/nbnSNR10/chlee/0/kdigits4-0.wav 0 (100, 6) segmented-val/nbnSNR10/chlee/0/kdigits5-0.wav 0 (95, 6) segmented-val/nbnSNR10/chlee/0/kdigits6-0.wav 0 (96, 6) segmented-val/nbnSNR10/chlee/0/kdigits7-0.wav 0 (90, 6) segmented-val/nbnSNR10/chlee/0/kdigits8-0.wav 0 (108, 6) segmented-val/nbnSNR10/chlee/0/kdigits9-0.wav 0 (95, 6) segmented-val/nbnSNR10/chlee/1/kdigits0-1.wav 1 (121, 6) segmented-val/nbnSNR10/chlee/1/kdigits1-1.wav 1 (103, 6) segmented-val/nbnSNR10/chlee/1/kdigits2-1.wav 1 (92, 6) segmented-val/nbnSNR10/chlee/1/kdigits3-1.wav 1 (101, 6) segmented-val/nbnSNR10/chlee/1/kdigits4-1.wav 1 (108, 6) segmented-val/nbnSNR10/chlee/1/kdigits5-1.wav 1 (95, 6) segmented-val/nbnSNR10/chlee/1/kd

Words spoken: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
[validation] number of labels and features = 300, 300
Loading data completed
Prediction started

Prediction for Testing DataSet:

accuracy =7.333333333333333



EPD

In [20]:
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
audioroot = 'unsegmented-test'
audioclean = addpath(audioroot,'org')
labels = ['gjang']
tag='kdigits'
num_trials = 10
apath = addpath(audioclean, labels[0])    # ex: unsegmented-test/org/gjang 경로를 지정
apath2 = []
# WAV파일 열기
for i in range(num_trials): 
    apath2.append(addpath(apath, str(i)))   # ex: unsegmented-test/org/gjang/0 - 9까지 리스트 만들기
    if not os.path.exists(apath2[i]):
        os.makedirs(apath2[i])
for j in range(10): 
    file = addpath(apath,"{}{}.wav".format(tag,j)) # ex: unsegmented-test/org/gjang/kdigits0.wav
    # pydub을 사용한 오디오파일 불러오기
    audio = AudioSegment.from_wav(file)
    # 오디오 파일 추출
    nonsilent = detect_nonsilent(audio, min_silence_len=500, silence_thresh=-40)
    # 오디오 추출 결과(음성신호 시작시간, 끝시간)
    time = []
    for i, part in enumerate(nonsilent):
        time.append([part[0]-500,part[1]+500]) # 앞뒤 0.5초 딜레이 추가
    for i in range(10): #unsegmented-test\org\gjang/0~9에 저장
        output_file = addpath(apath2[i],"kdigits{}-{}.wav".format(j,i)) 
        if i==0:
            interval_audio = audio[time[num_trials-1][0]:time[num_trials-1][1]]
            interval_audio.export(output_file, format="wav")
        else:
            interval_audio = audio[time[i-1][0]:time[i-1][1]]
            interval_audio.export(output_file, format="wav")

In [21]:
# 정확도 측정
validation_digits(speechmodels, gmmhmmindexdict, audioclean, labels, 'kdigits', num_trials=10)

unsegmented-test/org/gjang/0/kdigits0-0.wav 0 (124, 6) unsegmented-test/org/gjang/0/kdigits1-0.wav 0 (123, 6) unsegmented-test/org/gjang/0/kdigits2-0.wav 0 (122, 6) unsegmented-test/org/gjang/0/kdigits3-0.wav 0 (122, 6) unsegmented-test/org/gjang/0/kdigits4-0.wav 0 (121, 6) unsegmented-test/org/gjang/0/kdigits5-0.wav 0 (122, 6) unsegmented-test/org/gjang/0/kdigits6-0.wav 0 (122, 6) unsegmented-test/org/gjang/0/kdigits7-0.wav 0 (122, 6) unsegmented-test/org/gjang/0/kdigits8-0.wav 0 (123, 6) unsegmented-test/org/gjang/0/kdigits9-0.wav 0 (124, 6) unsegmented-test/org/gjang/1/kdigits0-1.wav 1 (129, 6) unsegmented-test/org/gjang/1/kdigits1-1.wav 1 (131, 6) unsegmented-test/org/gjang/1/kdigits2-1.wav 1 (129, 6) unsegmented-test/org/gjang/1/kdigits3-1.wav 1 (125, 6) unsegmented-test/org/gjang/1/kdigits4-1.wav 1 (124, 6) unsegmented-test/org/gjang/1/kdigits5-1.wav 1 (125, 6) unsegmented-test/org/gjang/1/kdigits6-1.wav 1 (127, 6) unsegmented-test/org/gjang/1/kdigits7-1.wav 1 (127, 6) unsegmente

In [22]:
##오디오 파일에 노이즈 파일 추가.
##오지현 학생의 코드를 참고하였음.

noisyroots = [addpath(audioroot,'nbnSNR'), addpath(audioroot,'wbnSNR')] # ['unsegmented-test/nbnSNR', 'unsegmented-test/wbnSNR']
SNRs = [10, 0, -10]

num_trials = 10
noisy = [nbnsig, wbnsig]
cnt = 0
path = []
path2 = []

# Open new WAV file for writing
for jj in range(len(noisy)):
    for n in range(len(noisy[jj])):
        path.append(addpath('%s%d'%(noisyroots[jj],SNRs[n]), '%s'%(labels[0])))
        for trial in range(num_trials):
            path2.append(addpath(path[n+3*jj], str(trial)))   # ex: unsegmented-test/nbnSNR10/gjang/0
            if not os.path.exists(path2[trial+10*n+30*jj]):   
                os.makedirs(path2[trial+10*n+30*jj])

for subname in labels:
    num_files = 0
    for w in range(10):
        for trial in range(10):
            basename = '%d/kdigits%d-%d.wav'%(w,trial,w)
            infile = addpath(audioclean, addpath(subname, basename))
            num_files += 1
            
            signal, Fs = librosa.load(infile, sr=Fs, mono=True)
            nbnsig = generate_mixed_signals_2(signal, np.concatenate((noise,noise,noise)), SNRs, False)
            wbnsig = generate_mixed_signals_2(signal, np.concatenate((wnoise,wnoise,wnoise)), SNRs, False)
            noisy = [nbnsig, wbnsig]

            for jj in range(len(noisy)):
                for n in range(len(noisy[jj])):
                    outfile = addpath('%s%d'%(noisyroots[jj],SNRs[n]), addpath(subname, basename))
                    wav.writewav(outfile, Fs, noisy[jj][n], maxval=1.0)
outputpaths = []
for jj in range(len(noisy)):
    for n in range(len(noisy[jj])):
        outputpaths.append('%s%d'%(noisyroots[jj],SNRs[n]))

HMM test

In [23]:
for outpath in outputpaths:
    print('---------------------------------------------------------------------------')
    validation_digits(speechmodels, gmmhmmindexdict, outpath, labels, 'kdigits', num_trials=10)

---------------------------------------------------------------------------
unsegmented-test/nbnSNR10/gjang/0/kdigits0-0.wav 0 (124, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits1-0.wav 0 (123, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits2-0.wav 0 (122, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits3-0.wav 0 (122, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits4-0.wav 0 (121, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits5-0.wav 0 (122, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits6-0.wav 0 (122, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits7-0.wav 0 (122, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits8-0.wav 0 (123, 6) unsegmented-test/nbnSNR10/gjang/0/kdigits9-0.wav 0 (124, 6) unsegmented-test/nbnSNR10/gjang/1/kdigits0-1.wav 1 (129, 6) unsegmented-test/nbnSNR10/gjang/1/kdigits1-1.wav 1 (131, 6) unsegmented-test/nbnSNR10/gjang/1/kdigits2-1.wav 1 (129, 6) unsegmented-test/nbnSNR10/gjang/1/kdigits3-1.wav 1 (125, 6) unsegmented-test/nbnSNR10/gjang/1/kdigits4-1.wav 1 (124, 6) unsegmented-test/nbnSNR1

Words spoken: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
[validation] number of labels and features = 100, 100
Loading data completed
Prediction started

Prediction for Testing DataSet:

accuracy =10.0



Wiener filtering

In [25]:
from scipy.signal import wiener
path_filter = addpath(audioroot, 'Wiener')
if not os.path.exists(path_filter):
    os.makedirs(path_filter)
for i in range(3):
        speech_noise, Fs_noise = librosa.load(addpath(path2[i*10+3], 'kdigits5-3.wav'), sr=None, mono=True)
        filtered_data = wiener(speech_noise)
        outfile = addpath(path_filter, "{}{}-{}_{}{}.wav".format(i,0,3,'nbnSNR',SNRs[i]))
        wav.writewav(outfile, Fs_noise, filtered_data)
        speech_noise, Fs_noise = librosa.load(addpath(path2[i*10+3], 'kdigits5-3.wav'), sr=None, mono=True)
        filtered_data = wiener(speech_noise)
        outfile = addpath(path_filter, "{}{}-{}_{}{}.wav".format(i,0,3,'wbnSNR',SNRs[i-3]))
        wav.writewav(outfile, Fs_noise, filtered_data)             

## End of Project 1